132 files changed, 9704 insertions, 4065 deletions
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
index 54183f0..2066a20 100644
--- a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
@@ -177,7 +177,7 @@ loop_16:
     vmlsl.s16     q15, d23, d1[0]       @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column1)
     vqrshrun.s32  d22, q1, #10
     vqrshrun.s32  d23, q15, #10
-    vqshrun.s16   d22, q11, #0
+    vqmovun.s16   d22, q11
     vst1.u8       {d22}, [r1], r10      @//Store dest row0, column 1; (1/2,1/2)
     vext.16       q11, q13, q14, #2     @//extract a[2]                         (column2)
     vaddl.s16     q1, d20, d26          @// a0 + a5                             (column2)
@@ -196,7 +196,7 @@ loop_16:
     vqrshrun.s32  d20, q1, #10
     vqrshrun.s32  d21, q15, #10
     vld1.u32      {d2, d3, d4}, [r0], r2 @ Vector load from src[6_0]
-    vqshrun.s16   d22, q10, #0
+    vqmovun.s16   d22, q10
     vst1.u8       {d22}, [r1], r7       @//Store dest row0 ,column 2; (1/2,1/2)
 
     @ vERTICAL FILTERING FOR ROW 1
@@ -236,7 +236,7 @@ loop_16:
     vmlsl.s16     q15, d23, d1[0]       @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column1)
     vqrshrun.s32  d22, q3, #10
     vqrshrun.s32  d23, q15, #10
-    vqshrun.s16   d22, q11, #0
+    vqmovun.s16   d22, q11
     vst1.u8       {d22}, [r1], r10      @//Store dest row1, column 1; (1/2,1/2)
     vext.16       q11, q13, q14, #2     @//extract a[2]                         (column2)
     vaddl.s16     q3, d20, d26          @// a0 + a5                             (column2)
@@ -254,7 +254,7 @@ loop_16:
     vmlsl.s16     q15, d21, d1[0]       @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column2)
     vqrshrun.s32  d20, q3, #10
     vqrshrun.s32  d21, q15, #10
-    vqshrun.s16   d22, q10, #0
+    vqmovun.s16   d22, q10
     vst1.u8       {d22}, [r1], r7       @//Store dest row1 ,column 2; (1/2,1/2)
 
     subs          r8, r8, #2            @ 2 rows processed, decrement by 2
@@ -315,7 +315,7 @@ loop_8:
     vaddl.u8      q15, d7, d13          @ temp2 = src[1_0] + src4_0]
     vmla.u16      q13, q14, d0[0]       @ temp += temp1 * 20
     vmls.s16      q13, q15, d1[0]       @ temp -= temp2 * 5
-    vqshrun.s16   d2, q9, #0
+    vqmovun.s16   d2, q9
     @ vERTICAL FILTERING FOR ROW 1
 
     @Q12,Q13 HAVE VERTICAL FILTERED VALUES
@@ -338,7 +338,7 @@ loop_8:
     vmlsl.s16     q15, d5, d1[0]        @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column1)
     vqrshrun.s32  d18, q14, #10
     vqrshrun.s32  d19, q15, #10
-    vqshrun.s16   d3, q9, #0
+    vqmovun.s16   d3, q9
     vst1.u8       {d3}, [r1], r3        @//Store dest row1, column 1; (1/2,1/2)
 
     subs          r8, r8, #2            @ 2 rows processed, decrement by 2
@@ -398,7 +398,7 @@ loop_4:
     vmla.u16      q12, q10, d0[0]       @ temp += temp1 * 20
     vmls.s16      q12, q11, d1[0]       @ temp -= temp2 * 5
     vaddl.u8      q15, d7, d13          @ temp2 = src[1_0] + src4_0]
-    vqshrun.s16   d2, q9, #0
+    vqmovun.s16   d2, q9
     vmla.u16      q13, q14, d0[0]       @ temp += temp1 * 20
     vmls.s16      q13, q15, d1[0]       @ temp -= temp2 * 5
 
@@ -424,7 +424,7 @@ loop_4:
     vmlsl.s16     q15, d5, d1[0]        @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column1)
     vqrshrun.s32  d18, q14, #10
     vqrshrun.s32  d19, q15, #10
-    vqshrun.s16   d4, q9, #0
+    vqmovun.s16   d4, q9
     vst1.u32      {d4[0]}, [r1], r3     @//Store dest row1, column 1; (1/2,1/2)
 
     subs          r8, r8, #2            @ 2 rows processed, decrement by 2
diff --git a/common/arm/ih264_iquant_itrans_recon_a9.s b/common/arm/ih264_iquant_itrans_recon_a9.s
index 4e49f6a..a6af1cb 100644
--- a/common/arm/ih264_iquant_itrans_recon_a9.s
+++ b/common/arm/ih264_iquant_itrans_recon_a9.s
@@ -106,6 +106,7 @@
 @r8 =>  iq_start_idx
 @r10=>  pi2_dc_ld_addr
 .text
+.syntax unified
 .p2align 2
 
     .global ih264_iquant_itrans_recon_4x4_a9
@@ -141,7 +142,7 @@ ih264_iquant_itrans_recon_4x4_a9:
     vmul.s16      q11, q11, q14         @x[i]=(scale[i] * dequant[i]) where i = 8..15
 
     subs          r8, r8, #1            @ if r8 == 1 => intra case , so result of subtraction is zero and Z flag is set
-    ldreqsh       r9, [r10]             @ Loads signed halfword pi2_dc_ld_addr[0], if r8==1
+    ldrsheq       r9, [r10]             @ Loads signed halfword pi2_dc_ld_addr[0], if r8==1
 
     vmull.s16     q0, d16, d20          @ Q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
     vmull.s16     q1, d17, d21          @ Q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
diff --git a/common/arm/ih264_iquant_itrans_recon_dc_a9.s b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
index 97c4724..d12665f 100644
--- a/common/arm/ih264_iquant_itrans_recon_dc_a9.s
+++ b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
@@ -108,6 +108,7 @@
 @unused =>  pi2_dc_ld_addr
 
 .text
+.syntax unified
 .p2align 2
 
     .global ih264_iquant_itrans_recon_4x4_dc_a9
@@ -136,7 +137,7 @@ ih264_iquant_itrans_recon_4x4_dc_a9:
     asr           r6, r6, #4            @q0 = (pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + rnd_fact)<<(u4_qp_div_6-4)
 
     subs          r9, r9, #1            @ if r8 == 1 => intra case , so result of subtraction is zero and Z flag is set
-    ldreqsh       r10, [r0]             @ Loads signed halfword pi2_src[0], if r9==1
+    ldrsheq       r10, [r0]             @ Loads signed halfword pi2_src[0], if r9==1
     moveq         r6, r10               @ Restore dc value in case of intra, i.e. r9 == 1
 
     add           r6, r6, #32           @i_macro = q0 + 32
diff --git a/common/arm/ih264_itrans_recon_a9.s b/common/arm/ih264_itrans_recon_a9.s
deleted file mode 100644
index 769d5d7..0000000
--- a/common/arm/ih264_itrans_recon_a9.s
+++ /dev/null
@@ -1,216 +0,0 @@
-@/******************************************************************************
-@ *
-@ * Copyright (C) 2015 The Android Open Source Project
-@ *
-@ * Licensed under the Apache License, Version 2.0 (the "License");
-@ * you may not use this file except in compliance with the License.
-@ * You may obtain a copy of the License at:
-@ *
-@ * http://www.apache.org/licenses/LICENSE-2.0
-@ *
-@ * Unless required by applicable law or agreed to in writing, software
-@ * distributed under the License is distributed on an "AS IS" BASIS,
-@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ * See the License for the specific language governing permissions and
-@ * limitations under the License.
-@ *
-@ *****************************************************************************
-@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
-@*/
-@**
-@ *******************************************************************************
-@ * @file
-@ *  ih264_itrans_recon_neon_a9.s
-@ *
-@ * @brief
-@ *  Contains function definitions for single stage  inverse transform
-@ *
-@ *
-@ * @par List of Functions:
-@ *  - ih264_itrans_recon_4x4_a9()
-@ *
-@ * @remarks
-@ *  None
-@ *
-@ *******************************************************************************
-@*
-@**
-@ *******************************************************************************
-@ *
-@ * @brief
-@ *  This function performs Inverse transform type Ci4 for 4*4 block
-@ *
-@ * @par Description:
-@ *  Performs inverse transform Ci4 and adds the residue to get the
-@ *  reconstructed block
-@ *
-@ * @param[in] pi16_levelBlock
-@ *  Input 4x4 coefficients
-@ *
-@ * @param[in] puc_predBuffer
-@ *  Prediction 4x4 block
-@ *
-@ * @param[out] puc_reconPic
-@ *  Output 4x4 block
-@ *
-@ * @param[in] ui16_picWidth
-@ *  Input stride
-@ *
-@ * @param[in] pred_strd
-@ *  Prediction stride
-@ *
-@ * @param[in] dst_strd
-@ *  Output Stride
-@ *
-@ * @param[in] zero_cols
-@ *  Zero columns in pi2_src
-@ *
-@ * @returns  Void
-@ *
-@ * @remarks
-@ *  None
-@ *
-@ *
-@ *******************************************************************************
-@ *
-@void ih264_itrans_recon_4x4(
-@       WORD16 *pi2_src,
-@       UWORD8 *pu1_pred,
-@       UWORD8 *pu1_recon,
-@       WORD32 src_strd,
-@       WORD32 pred_strd,
-@       WORD32 dst_strd,
-@       UWORD32 q_lev,          //quantizer level
-@       WORD32 *pi4_tmp)
-@**************Variables Vs Registers*****************************************
-@r0 => *pi2_src
-@r1 => *pu1_pred
-@r2 => *pu1_recon
-@r3 =>  src_strd
-@r4 =>  pred_strd
-@r5 =>  dst_strd
-@r6 =>  q_lev
-@r7 =>  *pi4_tmp
-
-.text
-.p2align 2
-
-
-    .global ih264_itrans_recon_4x4_a9
-
-ih264_itrans_recon_4x4_a9:
-    stmfd         sp!, {r4-r12, r14}    @stack stores the values of the arguments
-    lsl           r3, r3, #1
-
-    vld1.16       d0, [r0], r3          @0th row pi2_src_tmp[0]
-    ldr           r4, [sp, #40]         @Loads pred_strd
-
-    vld1.16       d1, [r0], r3          @I row pi2_src_tmp[0]
-    ldr           r5, [sp, #44]         @Loads *dst_strd
-
-    vld1.16       d2, [r0], r3          @II row pi2_src_tmp[0]
-
-    vld1.16       d3, [r0]              @III row pi2_src_tmp[0]
-    ldr           r7, [sp, #52]         @Loads *pi4_tmp
-
-    vpush         {d8-d15}
-
-    vtrn.16       d0, d1                @Transpose to get all the 0th element in the single D register
-    vtrn.16       d2, d3
-    vtrn.32       d0, d2
-    vtrn.32       d1, d3                @D0 --> pi2_src_tmp[0], D1 --> pi2_src_tmp[1]
-                                        @D2 --> pi2_src_tmp[2], D3 --> pi2_src_tmp[3]
-
-    vaddl.s16     q3, d0, d2            @x0 = (pi2_src_tmp[0] +  pi2_src_tmp[2])
-    vsubl.s16     q4, d0, d2            @x1 = (pi2_src_tmp[0] -  pi2_src_tmp[2])
-    vshr.s16      d4, d1, #1            @pi2_src_tmp[1] >> 1
-    vshr.s16      d5, d3, #1            @pi2_src_tmp[3] >> 1
-
-    vsubl.s16     q5, d4, d3            @x2 = D_SHIFT(pi2_src_tmp[1],1,shft) -  pi2_src_tmp[3]
-
-    vaddl.s16     q6, d1, d5            @x3 = pi2_src_tmp[1] + D_SHIFT(pi2_src_tmp[3],1,shft)
-
-    vadd.s32      q8, q4, q5            @x1 + x2
-    vsub.s32      q9, q4, q5            @x1 - x2
-
-    vadd.s32      q7, q3, q6            @x0 + x3
-    vsub.s32      q10, q3, q6           @x0 - x3
-
-    vtrn.32       q7, q8                @Transpose the register to have the adjacent values
-
-    vtrn.32       q9, q10
-    vadd.s32      d6, d14, d15          @x0(0,1) = (pi4_tblk[0,1]     +  pi4_tblk[8,9])
-
-    vsub.s32      d7, d14, d15          @x1(0,1) = (pi4_tblk[0,1]     -  pi4_tblk[8,9])
-
-    vshr.s32      d4, d16, #1           @pi4_tblk[4,5] >> 1
-    vshr.s32      d5, d17, #1           @pi4_tblk[12,13] >> 1
-
-    vsub.s32      d8, d4, d17           @x2(0,1) = D_SHIFT(pi4_tblk[4,5],1,shft) -  pi4_tblk[12,13]
-    vadd.s32      d9, d16, d5           @x3(0,1) =  pi4_tblk[4,5] + D_SHIFT(pi4_tblk[12,13],1,shft)
-
-    vadd.s32      d10, d18, d19         @x0(2,3) = (pi4_tblk[2,3]     +  pi4_tblk[10,11])
-    vsub.s32      d11, d18, d19         @x1(2,3) = (pi4_tblk[2,3]     -  pi4_tblk[10,11])
-    vshr.s32      d4, d20, #1           @pi4_tblk[6,7] >> 1
-    vshr.s32      d5, d21, #1           @pi4_tblk[14,15] >> 1
-
-    vld1.32       d30[0], [r1], r4      @I row Load pu1_pred buffer
-    vsub.s32      d12, d4, d21          @x2(2,3) = D_SHIFT(pi4_tblk[6,7],1,shft) -  pi4_tblk[14,15]
-
-    vmovl.u8      q15, d30              @I row Convert 8 bit pred buffer to 16 bit
-    vadd.s32      d13, d20, d5          @x3(2,3) =  pi4_tblk[6,7] + D_SHIFT(pi4_tblk[14,15],1,shft)
-
-    vadd.s32      d16, d6, d9           @I row i_macro(0,1) = x0(0,1) + x3(0,1)
-
-    vld1.32       d28[0], [r1], r4      @II row Load pu1_pred buffer
-    vadd.s32      d17, d10, d13         @I row i_macro(2,3) = x0(2,3) + x3(2,3)
-
-    vqrshrn.s32   d16, q8, #6           @I row i_macro = D_SHIFT(i_macro,6,shft)
-
-    vmovl.u8      q14, d28              @II row Convert 8 bit pred buffer to 16 bit
-    vadd.u16      d16, d16, d30         @I row i_macro += *pu1_pred_tmp
-
-    vqmovun.s16   d16, q8               @I row CLIP_U8(i_macro)
-    vadd.s32      d18, d7, d8           @II row i_macro(0,1) = x1(0,1) + x2(0,1)
-
-    vld1.32       d26[0], [r1], r4      @III row Load pu1_pred buffer
-    vadd.s32      d19, d11, d12         @II row i_macro(2,3) = x1(2,3) + x2(2,3)
-
-    vqrshrn.s32   d18, q9, #6           @II row i_macro = D_SHIFT(i_macro,6,shft)
-
-    vmovl.u8      q13, d26              @III row Convert 8 bit pred buffer to 16 bit
-    vadd.u16      d18, d18, d28         @II row i_macro += *pu1_pred_tmp
-
-    vst1.32       d16[0], [r2], r5      @I row store the value
-    vsub.s32      d20, d7, d8           @III row i_macro(0,1) = x1(0,1) - x2(0,1)
-
-    vqmovun.s16   d18, q9               @II row CLIP_U8(i_macro)
-    vsub.s32      d21, d11, d12         @III row i_macro(2,3) = x1(2,3) - x2(2,3)
-
-    vld1.32       d24[0], [r1], r4      @IV row Load pu1_pred buffer
-    vqrshrn.s32   d20, q10, #6          @III row i_macro = D_SHIFT(i_macro,6,shft)
-
-    vmovl.u8      q12, d24              @IV row Convert 8 bit pred buffer to 16 bit
-    vadd.u16      d20, d20, d26         @III row i_macro += *pu1_pred_tmp
-
-    vqmovun.s16   d20, q10              @III row CLIP_U8(i_macro)
-    vsub.s32      d22, d6, d9           @IV row i_macro(0,1) = x0(0,1) - x3(0,1)
-
-    vst1.32       d18[0], [r2], r5      @II row store the value
-    vsub.s32      d23, d10, d13         @IV row i_macro(2,3) = x0(2,3) - x3(2,3)
-
-    vqrshrn.s32   d22, q11, #6          @IV row i_macro = D_SHIFT(i_macro,6,shft)
-
-    vst1.32       d20[0], [r2], r5      @III row store the value
-    vadd.u16      d22, d22, d24         @IV row i_macro += *pu1_pred_tmp
-
-    vqmovun.s16   d22, q11              @IV row CLIP_U8(i_macro)
-    vst1.32       d22[0], [r2], r5      @IV row store the value
-
-
-    vpop          {d8-d15}
-    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from SP
-
-
-
-
diff --git a/common/armv8/ih264_deblk_chroma_av8.s b/common/armv8/ih264_deblk_chroma_av8.s
index 3021556..a4dbd23 100644
--- a/common/armv8/ih264_deblk_chroma_av8.s
+++ b/common/armv8/ih264_deblk_chroma_av8.s
@@ -337,7 +337,7 @@ ih264_deblk_chroma_horz_bslt4_av8:
     ldr       x9, [sp, #80]
     sub       x0, x0, x1, lsl #1        //x0 = uc_edgePixelU pointing to p1 of chroma U
     rev       w7, w7                    //
-    mov       v12.2s[0], w7             //D12[0] = ui_Bs
+    mov       v12.s[0], w7              //D12[0] = ui_Bs
     ld1       {v16.s}[0], [x8]          //D16[0] contains cliptab_cb
     ld1       {v17.s}[0], [x9]          //D17[0] contains cliptab_cr
     ld2       {v6.8b, v7.8b}, [x0], x1  //Q3=p1
diff --git a/common/armv8/ih264_deblk_luma_av8.s b/common/armv8/ih264_deblk_luma_av8.s
index bcdb03f..1b3950d 100644
--- a/common/armv8/ih264_deblk_luma_av8.s
+++ b/common/armv8/ih264_deblk_luma_av8.s
@@ -97,7 +97,7 @@ ih264_deblk_luma_horz_bslt4_av8:
     sub       x0, x0, x1                //x0 pointer to p2
     rev       w4, w4                    //
     ld1       {v10.8b, v11.8b}, [x0], x1 //p2 values are loaded into q5
-    mov       v12.2s[0], w4             //d12[0] = ui_Bs
+    mov       v12.s[0], w4              //d12[0] = ui_Bs
     mov       x6, x0                    //keeping backup of pointer to p1
     ld1       {v8.8b, v9.8b}, [x0], x1  //p1 values are loaded into q4
     mov       x7, x0                    //keeping backup of pointer to p0
@@ -364,8 +364,8 @@ ih264_deblk_luma_horz_bs4_av8:
     mov       v26.d[1] , v27.d[0]
     mov       v2.d[1] , v3.d[0]
     uaddl     v16.8h, v31.8b, v25.8b    //p2+p3 H
-    mla       v12.8h, v8.8h , v1.4h[0]  //(p0+q0+p1)+3*p2+2*p3 L
-    mla       v4.8h, v16.8h , v1.4h[0]  //(p0+q0+p1)+3*p2+2*p3 H
+    mla       v12.8h, v8.8h , v1.h[0]   //(p0+q0+p1)+3*p2+2*p3 L
+    mla       v4.8h, v16.8h , v1.h[0]   //(p0+q0+p1)+3*p2+2*p3 H
     bic       v16.16b, v20.16b , v18.16b //((ABS(p0 - q0) >= Alpha  || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
     mov       v17.d[0] , v16.d[1]       //&& (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
     bit       v2.16b, v28.16b , v20.16b //choosing between po' and p0"
@@ -443,7 +443,7 @@ ih264_deblk_luma_vert_bslt4_av8:
     ld1       {v4.8b}, [x0], x1         //row3
     rev       w12, w12                  //reversing ui_bs
     ld1       {v6.8b}, [x0], x1         //row4
-    mov       v18.2s[0], w12            //d12[0] = ui_Bs
+    mov       v18.s[0], w12             //v18.s[0] = ui_Bs
     ld1       {v16.s}[0], [x14]         //D16[0] contains cliptab
     ld1       {v8.8b}, [x0], x1         //row5
     uxtl      v18.8h, v18.8b            //q6 = uc_Bs in each 16 bt scalar
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
index 202c516..d2897b6 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
@@ -146,7 +146,7 @@ loop_16:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
     ext       v24.16b, v20.16b , v22.16b , #4
     ext       v26.16b, v20.16b , v22.16b , #6
@@ -174,7 +174,7 @@ loop_16:
 
     uqxtn     v19.8b, v19.8h
     uqxtn     v25.8b, v25.8h
-    mov       v19.2s[1], v25.2s[0]
+    mov       v19.s[1], v25.s[0]
 
     uaddl     v22.8h, v4.8b, v10.8b
     ld1       {v0.2s, v1.2s}, [x0], #16 // Vector load from src[6_0]
@@ -228,7 +228,7 @@ loop_16:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
     ext       v24.16b, v20.16b , v22.16b , #4
     ext       v26.16b, v20.16b , v22.16b , #6
@@ -253,7 +253,7 @@ loop_16:
 
     uqxtn     v19.8b, v19.8h
     uqxtn     v25.8b, v25.8h
-    mov       v19.2s[1], v25.2s[0]
+    mov       v19.s[1], v25.s[0]
 
 
     uaddl     v22.8h, v6.8b, v0.8b
@@ -306,7 +306,7 @@ loop_16:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
 
     ext       v24.16b, v20.16b , v22.16b , #4
@@ -334,7 +334,7 @@ loop_16:
 
     uqxtn     v19.8b, v19.8h
     uqxtn     v25.8b, v25.8h
-    mov       v19.2s[1], v25.2s[0]
+    mov       v19.s[1], v25.s[0]
 
 
 
@@ -387,7 +387,7 @@ loop_16:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
 
     ext       v24.16b, v20.16b , v22.16b , #4
@@ -427,7 +427,7 @@ loop_16:
 
     uqxtn     v19.8b, v19.8h
     uqxtn     v25.8b, v25.8h
-    mov       v19.2s[1], v25.2s[0]
+    mov       v19.s[1], v25.s[0]
 
 
 
@@ -501,7 +501,7 @@ loop_8:
     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[7_0]
     uqxtn     v25.8b, v12.8h
     uqxtn     v13.8b, v13.8h
-    mov       v25.2s[1], v13.2s[0]
+    mov       v25.s[1], v13.s[0]
     uaddl     v16.8h, v8.8b, v10.8b
 
 
@@ -535,7 +535,7 @@ loop_8:
     uaddl     v28.8h, v9.8b, v11.8b
     uqxtn     v13.8b, v16.8h
     uqxtn     v17.8b, v17.8h
-    mov       v13.2s[1], v17.2s[0]
+    mov       v13.s[1], v17.s[0]
 
 
     uaddl     v14.8h, v5.8b, v3.8b
@@ -576,7 +576,7 @@ loop_8:
     mls       v16.8h, v30.8h , v24.8h
     uqxtn     v27.8b, v12.8h
     uqxtn     v13.8b, v13.8h
-    mov       v27.2s[1], v13.2s[0]
+    mov       v27.s[1], v13.s[0]
 
 
     ext       v22.16b, v28.16b , v16.16b , #10
@@ -616,7 +616,7 @@ loop_8:
     subs      x4, x4, #4
     uqxtn     v13.8b, v16.8h
     uqxtn     v17.8b, v17.8h
-    mov       v13.2s[1], v17.2s[0]
+    mov       v13.s[1], v17.s[0]
 
 
     mov       v0.16b, v8.16b
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
index 38f971b..546c807 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
@@ -275,7 +275,7 @@ loop_16_lowhalf:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
     ld1       {v0.2s, v1.2s}, [x0], x2  // row 5 load for horizontal filter
 
@@ -313,7 +313,7 @@ loop_16_lowhalf:
     uaddl     v2.8h, v1.8b, v4.8b
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
     add       v30.8h, v14.8h , v16.8h
     mls       v8.8h, v2.8h , v24.8h
     ld1       {v0.2s, v1.2s}, [x0], x2  // row 6 load for horizontal filter
@@ -355,7 +355,7 @@ loop_16_lowhalf:
     mls       v28.8h, v2.8h , v24.8h
     uqxtn     v27.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v27.2s[1], v19.2s[0]
+    mov       v27.s[1], v19.s[0]
     saddl     v18.4s, v12.4h, v28.4h
     saddl2    v6.4s, v12.8h, v28.8h
 
@@ -384,7 +384,7 @@ loop_16_lowhalf:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
     mov       v12.16b, v8.16b
     mov       v13.16b, v9.16b
@@ -523,7 +523,7 @@ loop_16_highhalf:
     mls       v20.8h, v2.8h , v24.8h
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
     ld1       {v0.2s, v1.2s}, [x8], x2
 
     urhadd    v26.8b, v18.8b , v26.8b
@@ -558,7 +558,7 @@ loop_16_highhalf:
     uaddl     v2.8h, v1.8b, v4.8b
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
     add       v30.8h, v14.8h , v16.8h
     mls       v8.8h, v2.8h , v24.8h
     ld1       {v0.2s, v1.2s}, [x8], x2
@@ -598,7 +598,7 @@ loop_16_highhalf:
     mls       v28.8h, v2.8h , v24.8h
     uqxtn     v27.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v27.2s[1], v19.2s[0]
+    mov       v27.s[1], v19.s[0]
 
 
     saddl     v18.4s, v12.4h, v28.4h
@@ -627,7 +627,7 @@ loop_16_highhalf:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
     mov       v12.16b, v8.16b
     mov       v13.16b, v9.16b
@@ -768,7 +768,7 @@ loop_8:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
     ld1       {v0.2s, v1.2s}, [x0], x2  // row 5 load for horizontal filter
 
@@ -812,7 +812,7 @@ loop_8:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
     add       v30.8h, v14.8h , v16.8h
     mls       v8.8h, v2.8h , v24.8h
@@ -855,7 +855,7 @@ loop_8:
     uqxtn     v27.8b, v18.8h
     uqxtn     v19.8b, v19.8h
 
-    mov       v27.2s[1], v19.2s[0]
+    mov       v27.s[1], v19.s[0]
 
     saddl     v18.4s, v12.4h, v28.4h
     saddl2    v6.4s, v12.8h, v28.8h
@@ -885,7 +885,7 @@ loop_8:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
 
     mov       v12.16b, v8.16b
@@ -1024,7 +1024,7 @@ loop_4:
 
     sqrshrun  v9.8b, v6.8h, #5
     sqrshrun  v7.8b, v7.8h, #5
-    mov       v9.2s[1], v7.2s[0]
+    mov       v9.s[1], v7.s[0]
 
     ext       v20.8b, v18.8b , v19.8b , #2
 
@@ -1089,7 +1089,7 @@ loop_4:
 
     sqrshrun  v10.8b, v8.8h, #5
     sqrshrun  v9.8b, v9.8h, #5
-    mov       v10.2s[1], v9.2s[0]
+    mov       v10.s[1], v9.s[0]
 
     mov       v12.8b, v28.8b
 
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
index b1e4866..3f3e297 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
@@ -209,7 +209,7 @@ loop_16:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
     ext       v24.16b, v20.16b , v22.16b , #4
     ext       v26.16b, v20.16b , v22.16b , #6
@@ -238,7 +238,7 @@ loop_16:
 
     uqxtn     v19.8b, v19.8h
     uqxtn     v18.8b, v18.8h
-    mov       v19.2s[1], v18.2s[0]
+    mov       v19.s[1], v18.s[0]
 
     ld1       {v18.2s}, [x1]
     sqrshrun  v20.8b, v20.8h, #5
@@ -297,7 +297,7 @@ loop_16:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
     ext       v24.16b, v20.16b , v22.16b , #4
     ext       v26.16b, v20.16b , v22.16b , #6
@@ -323,7 +323,7 @@ loop_16:
     ld1       {v22.4s}, [x6], x7
     uqxtn     v19.8b, v19.8h
     uqxtn     v18.8b, v18.8h
-    mov       v19.2s[1], v18.2s[0]
+    mov       v19.s[1], v18.s[0]
     ld1       {v18.4s}, [x1]
     sqrshrun  v20.8b, v20.8h, #5
     sqrshrun  v21.8b, v22.8h, #5
@@ -380,7 +380,7 @@ loop_16:
 
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
 
     ext       v24.16b, v20.16b , v22.16b , #4
@@ -409,7 +409,7 @@ loop_16:
 
     uqxtn     v19.8b, v19.8h
     uqxtn     v18.8b, v18.8h
-    mov       v19.2s[1], v18.2s[0]
+    mov       v19.s[1], v18.s[0]
 
     ld1       {v18.2s}, [x1]
     sqrshrun  v20.8b, v20.8h, #5
@@ -466,7 +466,7 @@ loop_16:
     ld1       {v22.4s}, [x9], #16
     uqxtn     v18.8b, v18.8h
     uqxtn     v19.8b, v19.8h
-    mov       v18.2s[1], v19.2s[0]
+    mov       v18.s[1], v19.s[0]
 
 
     ext       v24.16b, v20.16b , v22.16b , #4
@@ -506,7 +506,7 @@ loop_16:
 
     uqxtn     v19.8b, v19.8h
     uqxtn     v18.8b, v18.8h
-    mov       v19.2s[1], v18.2s[0]
+    mov       v19.s[1], v18.s[0]
 
     ld1       {v20.4s}, [x6], #16
     ld1       {v22.4s}, [x6], x7
@@ -586,7 +586,7 @@ loop_8:
     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[7_0]
     uqxtn     v25.8b, v12.8h
     uqxtn     v13.8b, v13.8h
-    mov       v25.2s[1], v13.2s[0]
+    mov       v25.s[1], v13.s[0]
     uaddl     v16.8h, v8.8b, v10.8b
 
 
@@ -620,7 +620,7 @@ loop_8:
     uaddl     v28.8h, v9.8b, v11.8b
     uqxtn     v13.8b, v16.8h
     uqxtn     v17.8b, v17.8h
-    mov       v13.2s[1], v17.2s[0]
+    mov       v13.s[1], v17.s[0]
 
     urhadd    v12.16b, v12.16b , v14.16b
     urhadd    v13.16b, v13.16b , v15.16b
@@ -662,7 +662,7 @@ loop_8:
     mls       v16.8h, v30.8h , v24.8h
     uqxtn     v27.8b, v12.8h
     uqxtn     v13.8b, v13.8h
-    mov       v27.2s[1], v13.2s[0]
+    mov       v27.s[1], v13.s[0]
 
     sqrshrun  v14.8b, v14.8h, #5
     ext       v22.16b, v28.16b , v16.16b , #10
@@ -702,7 +702,7 @@ loop_8:
     subs      x4, x4, #4
     uqxtn     v13.8b, v16.8h
     uqxtn     v17.8b, v17.8h
-    mov       v13.2s[1], v17.2s[0]
+    mov       v13.s[1], v17.s[0]
     urhadd    v12.16b, v12.16b , v14.16b
     urhadd    v13.16b, v13.16b , v15.16b
 
diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s
index 2c5efb3..8f0f282 100644
--- a/common/armv8/ih264_intra_pred_chroma_av8.s
+++ b/common/armv8/ih264_intra_pred_chroma_av8.s
@@ -501,7 +501,7 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8:
     add       v16.8h, v0.8h , v16.8h
     dup       v20.8h, v22.h[0]
     mul       v4.8h, v6.8h , v20.8h
-    dup       v30.8h, v22.4h[1]
+    dup       v30.8h, v22.h[1]
     mul       v18.8h, v6.8h , v20.8h
     mul       v14.8h, v6.8h , v30.8h
     mul       v8.8h, v6.8h , v30.8h
@@ -511,7 +511,7 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8:
     sqrshrun  v28.8b, v24.8h, #5
     add       v26.8h, v16.8h , v8.8h
     sqrshrun  v29.8b, v0.8h, #5
-    dup       v20.8h, v22.4h[2]
+    dup       v20.8h, v22.h[2]
     st1       {v28.8b, v29.8b}, [x1], x3
     sqrshrun  v28.8b, v2.8h, #5
     sqrshrun  v29.8b, v26.8h, #5
@@ -520,7 +520,7 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8:
     st1       {v28.8b, v29.8b}, [x1], x3
     add       v24.8h, v12.8h , v4.8h
     add       v0.8h, v16.8h , v18.8h
-    dup       v30.8h, v22.4h[3]
+    dup       v30.8h, v22.h[3]
     sqrshrun  v28.8b, v24.8h, #5
     sqrshrun  v29.8b, v0.8h, #5
     mul       v14.8h, v6.8h , v30.8h
diff --git a/common/armv8/ih264_intra_pred_luma_16x16_av8.s b/common/armv8/ih264_intra_pred_luma_16x16_av8.s
index a9eb165..c1847b5 100644
--- a/common/armv8/ih264_intra_pred_luma_16x16_av8.s
+++ b/common/armv8/ih264_intra_pred_luma_16x16_av8.s
@@ -467,7 +467,7 @@ ih264_intra_pred_luma_16x16_mode_plane_av8:
     ldrb      w5, [x7], #-1
     sxtw      x5, w5
     add       x8, x8, x8, lsl #1
-    dup       v4.8h, v0.4h[0]
+    dup       v4.8h, v0.h[0]
     add       x12, x12, x8
     ldrb      w9, [x0], #1
     sxtw      x9, w9
diff --git a/common/armv8/ih264_intra_pred_luma_8x8_av8.s b/common/armv8/ih264_intra_pred_luma_8x8_av8.s
index 2b972ca..bf9a4c1 100644
--- a/common/armv8/ih264_intra_pred_luma_8x8_av8.s
+++ b/common/armv8/ih264_intra_pred_luma_8x8_av8.s
@@ -337,7 +337,7 @@ ih264_intra_pred_luma_8x8_mode_dc_av8:
     uaddlp    v3.2s, v1.4h
     uaddlp    v2.1d, v3.2s
     dup       v10.8h, w5
-    dup       v8.8h, v2.4h[0]
+    dup       v8.8h, v2.h[0]
     add       v12.8h, v8.8h , v10.8h
     sqrshrun  v31.8b, v12.8h, #4
     st1       {v31.8b}, [x1], x3
@@ -360,7 +360,7 @@ top_available: // ONLT TOP AVAILABLE
     uaddlp    v13.2s, v14.4h
     uaddlp    v12.1d, v13.2s
     rshrn     v4.8b, v12.8h, #3
-    dup       v31.8b, v4.8b[0]
+    dup       v31.8b, v4.b[0]
     st1       {v31.8b}, [x1], x3
     st1       {v31.8b}, [x1], x3
     st1       {v31.8b}, [x1], x3
@@ -1059,7 +1059,7 @@ ih264_intra_pred_luma_8x8_mode_horz_u_av8:
     mov       v30.16b, v4.16b
     mov       v31.16b, v6.16b
     tbl       v12.8b, {v30.16b, v31.16b}, v10.8b
-    dup       v14.16b, v5.8b[7]         //
+    dup       v14.16b, v5.b[7]          //replicate byte lane 7 of v5 into all 16 lanes of v14
     tbl       v13.8b, {v30.16b, v31.16b}, v11.8b
     mov       v12.d[1], v13.d[0]
     ext       v16.16b, v12.16b , v14.16b , #2
diff --git a/common/armv8/ih264_mem_fns_neon_av8.s b/common/armv8/ih264_mem_fns_neon_av8.s
index f5c2e29..4e9020d 100644
--- a/common/armv8/ih264_mem_fns_neon_av8.s
+++ b/common/armv8/ih264_mem_fns_neon_av8.s
@@ -119,7 +119,7 @@ loop_neon_memcpy:
 
     subs      x2, x2, #8
     bge       loop_neon_memcpy
-    cmp       x2, #-8
+    cmn       x2, #8                    //sets Z when x2 == -8 (cmn evaluates x2 + 8)
     beq       end_func1
 
 arm_memcpy:
@@ -184,7 +184,7 @@ loop_neon_memset:
 
     subs      x2, x2, #8
     bge       loop_neon_memset
-    cmp       x2, #-8
+    cmn       x2, #8                    //sets Z when x2 == -8 (cmn evaluates x2 + 8)
     beq       end_func2
 
 arm_memset:
@@ -254,7 +254,7 @@ loop_neon_memset_16bit:
 
     subs      x2, x2, #8
     bge       loop_neon_memset_16bit
-    cmp       x2, #-8
+    cmn       x2, #8                    //sets Z when x2 == -8 (cmn evaluates x2 + 8)
     beq       end_func3
 
 arm_memset_16bit:
diff --git a/common/armv8/ih264_resi_trans_quant_av8.s b/common/armv8/ih264_resi_trans_quant_av8.s
index dc1c680..316c220 100644
--- a/common/armv8/ih264_resi_trans_quant_av8.s
+++ b/common/armv8/ih264_resi_trans_quant_av8.s
@@ -665,7 +665,7 @@ ih264_hadamard_quant_2x2_uv_av8:
     ld2       {v0.4h-v1.4h}, [x0]       //load src
 
     ld1       {v30.h}[0], [x2]          //load pu2_scale_matrix[0]
-    dup       v30.4h, v30.4h[0]         //pu2_scale_matrix
+    dup       v30.4h, v30.h[0]          //pu2_scale_matrix
     uxtl      v30.4s, v30.4h            //pu2_scale_matrix
 
     neg       w4, w4
diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s
index 96ef50a..b039fba 100644
--- a/common/armv8/ih264_weighted_bi_pred_av8.s
+++ b/common/armv8/ih264_weighted_bi_pred_av8.s
@@ -173,10 +173,10 @@ loop_4:                                 //each iteration processes four rows
     ld1       {v10.s}[1], [x1], x4      //load row 4 in source 2
     uxtl      v8.8h, v8.8b              //converting rows 3,4 in source 1 to 16-bit
     uxtl      v10.8h, v10.8b            //converting rows 3,4 in source 2 to 16-bit
-    mul       v4.8h, v4.8h , v2.4h[0]   //weight 1 mult. for rows 1,2
-    mla       v4.8h, v6.8h , v2.4h[2]   //weight 2 mult. for rows 1,2
-    mul       v8.8h, v8.8h , v2.4h[0]   //weight 1 mult. for rows 3,4
-    mla       v8.8h, v10.8h , v2.4h[2]  //weight 2 mult. for rows 3,4
+    mul       v4.8h, v4.8h , v2.h[0]    //weight 1 mult. for rows 1,2
+    mla       v4.8h, v6.8h , v2.h[2]    //weight 2 mult. for rows 1,2
+    mul       v8.8h, v8.8h , v2.h[0]    //weight 1 mult. for rows 3,4
+    mla       v8.8h, v10.8h , v2.h[2]   //weight 2 mult. for rows 3,4
     subs      w11, w11, #4              //decrement ht by 4
     srshl     v4.8h, v4.8h , v0.8h      //rounds off the weighted samples from rows 1,2
     srshl     v8.8h, v8.8h , v0.8h      //rounds off the weighted samples from rows 3,4
@@ -205,18 +205,18 @@ loop_8:                                 //each iteration processes four rows
     ld1       {v18.8b}, [x1], x4        //load row 4 in source 2
     uxtl      v8.8h, v8.8b              //converting row 2 in source 1 to 16-bit
     uxtl      v10.8h, v10.8b            //converting row 2 in source 2 to 16-bit
-    mul       v4.8h, v4.8h , v2.4h[0]   //weight 1 mult. for row 1
-    mla       v4.8h, v6.8h , v2.4h[2]   //weight 2 mult. for row 1
+    mul       v4.8h, v4.8h , v2.h[0]    //weight 1 mult. for row 1
+    mla       v4.8h, v6.8h , v2.h[2]    //weight 2 mult. for row 1
     uxtl      v12.8h, v12.8b            //converting row 3 in source 1 to 16-bit
     uxtl      v14.8h, v14.8b            //converting row 3 in source 2 to 16-bit
-    mul       v8.8h, v8.8h , v2.4h[0]   //weight 1 mult. for row 2
-    mla       v8.8h, v10.8h , v2.4h[2]  //weight 2 mult. for row 2
+    mul       v8.8h, v8.8h , v2.h[0]    //weight 1 mult. for row 2
+    mla       v8.8h, v10.8h , v2.h[2]   //weight 2 mult. for row 2
     uxtl      v16.8h, v16.8b            //converting row 4 in source 1 to 16-bit
     uxtl      v18.8h, v18.8b            //converting row 4 in source 2 to 16-bit
-    mul       v12.8h, v12.8h , v2.4h[0] //weight 1 mult. for row 3
-    mla       v12.8h, v14.8h , v2.4h[2] //weight 2 mult. for row 3
-    mul       v16.8h, v16.8h , v2.4h[0] //weight 1 mult. for row 4
-    mla       v16.8h, v18.8h , v2.4h[2] //weight 2 mult. for row 4
+    mul       v12.8h, v12.8h , v2.h[0]  //weight 1 mult. for row 3
+    mla       v12.8h, v14.8h , v2.h[2]  //weight 2 mult. for row 3
+    mul       v16.8h, v16.8h , v2.h[0]  //weight 1 mult. for row 4
+    mla       v16.8h, v18.8h , v2.h[2]  //weight 2 mult. for row 4
     srshl     v4.8h, v4.8h , v0.8h      //rounds off the weighted samples from row 1
     srshl     v8.8h, v8.8h , v0.8h      //rounds off the weighted samples from row 2
     srshl     v12.8h, v12.8h , v0.8h    //rounds off the weighted samples from row 3
@@ -251,35 +251,35 @@ loop_16:                                //each iteration processes two rows
     ld1       {v18.8b, v19.8b}, [x1], x4 //load row 4 in source 2
     uxtl      v4.8h, v5.8b              //converting row 1H in source 1 to 16-bit
     uxtl      v6.8h, v7.8b              //converting row 1H in source 2 to 16-bit
-    mul       v20.8h, v20.8h , v2.4h[0] //weight 1 mult. for row 1L
-    mla       v20.8h, v22.8h , v2.4h[2] //weight 2 mult. for row 1L
+    mul       v20.8h, v20.8h , v2.h[0]  //weight 1 mult. for row 1L
+    mla       v20.8h, v22.8h , v2.h[2]  //weight 2 mult. for row 1L
     uxtl      v24.8h, v8.8b             //converting row 2L in source 1 to 16-bit
     uxtl      v26.8h, v10.8b            //converting row 2L in source 2 to 16-bit
-    mul       v4.8h, v4.8h , v2.4h[0]   //weight 1 mult. for row 1H
-    mla       v4.8h, v6.8h , v2.4h[2]   //weight 2 mult. for row 1H
+    mul       v4.8h, v4.8h , v2.h[0]    //weight 1 mult. for row 1H
+    mla       v4.8h, v6.8h , v2.h[2]    //weight 2 mult. for row 1H
     uxtl      v8.8h, v9.8b              //converting row 2H in source 1 to 16-bit
     uxtl      v10.8h, v11.8b            //converting row 2H in source 2 to 16-bit
-    mul       v24.8h, v24.8h , v2.4h[0] //weight 1 mult. for row 2L
-    mla       v24.8h, v26.8h , v2.4h[2] //weight 2 mult. for row 2L
+    mul       v24.8h, v24.8h , v2.h[0]  //weight 1 mult. for row 2L
+    mla       v24.8h, v26.8h , v2.h[2]  //weight 2 mult. for row 2L
     uxtl      v28.8h, v12.8b            //converting row 3L in source 1 to 16-bit
     uxtl      v30.8h, v14.8b            //converting row 3L in source 2 to 16-bit
-    mul       v8.8h, v8.8h , v2.4h[0]   //weight 1 mult. for row 2H
-    mla       v8.8h, v10.8h , v2.4h[2]  //weight 2 mult. for row 2H
+    mul       v8.8h, v8.8h , v2.h[0]    //weight 1 mult. for row 2H
+    mla       v8.8h, v10.8h , v2.h[2]   //weight 2 mult. for row 2H
     uxtl      v12.8h, v13.8b            //converting row 3H in source 1 to 16-bit
     uxtl      v14.8h, v15.8b            //converting row 3H in source 2 to 16-bit
-    mul       v28.8h, v28.8h , v2.4h[0] //weight 1 mult. for row 3L
-    mla       v28.8h, v30.8h , v2.4h[2] //weight 2 mult. for row 3L
+    mul       v28.8h, v28.8h , v2.h[0]  //weight 1 mult. for row 3L
+    mla       v28.8h, v30.8h , v2.h[2]  //weight 2 mult. for row 3L
     uxtl      v22.8h, v16.8b            //converting row 4L in source 1 to 16-bit
     uxtl      v6.8h, v18.8b             //converting row 4L in source 2 to 16-bit
-    mul       v12.8h, v12.8h , v2.4h[0] //weight 1 mult. for row 3H
-    mla       v12.8h, v14.8h , v2.4h[2] //weight 2 mult. for row 3H
+    mul       v12.8h, v12.8h , v2.h[0]  //weight 1 mult. for row 3H
+    mla       v12.8h, v14.8h , v2.h[2]  //weight 2 mult. for row 3H
     uxtl      v16.8h, v17.8b            //converting row 4H in source 1 to 16-bit
     uxtl      v18.8h, v19.8b            //converting row 4H in source 2 to 16-bit
-    mul       v22.8h, v22.8h , v2.4h[0] //weight 1 mult. for row 4L
-    mla       v22.8h, v6.8h , v2.4h[2]  //weight 2 mult. for row 4L
+    mul       v22.8h, v22.8h , v2.h[0]  //weight 1 mult. for row 4L
+    mla       v22.8h, v6.8h , v2.h[2]   //weight 2 mult. for row 4L
     srshl     v20.8h, v20.8h , v0.8h    //rounds off the weighted samples from row 1L
-    mul       v16.8h, v16.8h , v2.4h[0] //weight 1 mult. for row 4H
-    mla       v16.8h, v18.8h , v2.4h[2] //weight 2 mult. for row 4H
+    mul       v16.8h, v16.8h , v2.h[0]  //weight 1 mult. for row 4H
+    mla       v16.8h, v18.8h , v2.h[2]  //weight 2 mult. for row 4H
     srshl     v4.8h, v4.8h , v0.8h      //rounds off the weighted samples from row 1H
     srshl     v24.8h, v24.8h , v0.8h    //rounds off the weighted samples from row 2L
     saddw     v20.8h, v20.8h , v3.8b    //adding offset for row 1L
diff --git a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s
index ec5bb7a..69ed3b0 100644
--- a/common/armv8/ih264_weighted_pred_av8.s
+++ b/common/armv8/ih264_weighted_pred_av8.s
@@ -143,8 +143,8 @@ loop_4:                                 //each iteration processes four rows
     uxtl      v4.8h, v4.8b              //converting rows 1,2 to 16-bit
     uxtl      v6.8h, v6.8b              //converting rows 3,4 to 16-bit
 
-    mul       v4.8h, v4.8h , v2.4h[0]   //weight mult. for rows 1,2
-    mul       v6.8h, v6.8h , v2.4h[0]   //weight mult. for rows 3,4
+    mul       v4.8h, v4.8h , v2.h[0]    //weight mult. for rows 1,2
+    mul       v6.8h, v6.8h , v2.h[0]    //weight mult. for rows 3,4
 
     subs      w7, w7, #4                //decrement ht by 4
     srshl     v4.8h, v4.8h , v0.8h      //rounds off the weighted samples from rows 1,2
@@ -175,11 +175,11 @@ loop_8:                                 //each iteration processes four rows
     uxtl      v6.8h, v6.8b              //converting row 2 to 16-bit
 
     uxtl      v8.8h, v8.8b              //converting row 3 to 16-bit
-    mul       v4.8h, v4.8h , v2.4h[0]   //weight mult. for row 1
+    mul       v4.8h, v4.8h , v2.h[0]    //weight mult. for row 1
     uxtl      v10.8h, v10.8b            //converting row 4 to 16-bit
-    mul       v6.8h, v6.8h , v2.4h[0]   //weight mult. for row 2
-    mul       v8.8h, v8.8h , v2.4h[0]   //weight mult. for row 3
-    mul       v10.8h, v10.8h , v2.4h[0] //weight mult. for row 4
+    mul       v6.8h, v6.8h , v2.h[0]    //weight mult. for row 2
+    mul       v8.8h, v8.8h , v2.h[0]    //weight mult. for row 3
+    mul       v10.8h, v10.8h , v2.h[0]  //weight mult. for row 4
 
     srshl     v4.8h, v4.8h , v0.8h      //rounds off the weighted samples from row 1
     srshl     v6.8h, v6.8h , v0.8h      //rounds off the weighted samples from row 2
@@ -214,20 +214,20 @@ loop_16:                                //each iteration processes two rows
     uxtl      v14.8h, v5.8b             //converting row 1H to 16-bit
     ld1       {v10.8b, v11.8b}, [x0], x2 //load row 4 in source
     uxtl      v16.8h, v6.8b             //converting row 2L to 16-bit
-    mul       v12.8h, v12.8h , v2.4h[0] //weight mult. for row 1L
+    mul       v12.8h, v12.8h , v2.h[0]  //weight mult. for row 1L
     uxtl      v18.8h, v7.8b             //converting row 2H to 16-bit
-    mul       v14.8h, v14.8h , v2.4h[0] //weight mult. for row 1H
+    mul       v14.8h, v14.8h , v2.h[0]  //weight mult. for row 1H
     uxtl      v20.8h, v8.8b             //converting row 3L to 16-bit
-    mul       v16.8h, v16.8h , v2.4h[0] //weight mult. for row 2L
+    mul       v16.8h, v16.8h , v2.h[0]  //weight mult. for row 2L
     uxtl      v22.8h, v9.8b             //converting row 3H to 16-bit
-    mul       v18.8h, v18.8h , v2.4h[0] //weight mult. for row 2H
+    mul       v18.8h, v18.8h , v2.h[0]  //weight mult. for row 2H
     uxtl      v24.8h, v10.8b            //converting row 4L to 16-bit
-    mul       v20.8h, v20.8h , v2.4h[0] //weight mult. for row 3L
+    mul       v20.8h, v20.8h , v2.h[0]  //weight mult. for row 3L
     uxtl      v26.8h, v11.8b            //converting row 4H to 16-bit
-    mul       v22.8h, v22.8h , v2.4h[0] //weight mult. for row 3H
-    mul       v24.8h, v24.8h , v2.4h[0] //weight mult. for row 4L
+    mul       v22.8h, v22.8h , v2.h[0]  //weight mult. for row 3H
+    mul       v24.8h, v24.8h , v2.h[0]  //weight mult. for row 4L
     srshl     v12.8h, v12.8h , v0.8h    //rounds off the weighted samples from row 1L
-    mul       v26.8h, v26.8h , v2.4h[0] //weight mult. for row 4H
+    mul       v26.8h, v26.8h , v2.h[0]  //weight mult. for row 4H
     srshl     v14.8h, v14.8h , v0.8h    //rounds off the weighted samples from row 1H
     srshl     v16.8h, v16.8h , v0.8h    //rounds off the weighted samples from row 2L
     saddw     v12.8h, v12.8h , v3.8b    //adding offset for row 1L
diff --git a/common/ih264_chroma_intra_pred_filters.c b/common/ih264_chroma_intra_pred_filters.c
index ee145e5..1894bfc 100644
--- a/common/ih264_chroma_intra_pred_filters.c
+++ b/common/ih264_chroma_intra_pred_filters.c
@@ -117,7 +117,6 @@ void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
     WORD32 top_avail; /* availability of top predictors (only for DC) */
     UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
     UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
-    UNUSED(src_strd);
 
     /* temporary variables to store accumulated first left half,second left half,
      * first top half,second top half of U and  V values*/
@@ -127,6 +126,7 @@ void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
     WORD32 val_u1 = 0, val_u2 = 0, val_v1 = 0, val_v2 = 0;
 
     WORD32 col, row; /*loop variables*/
+    UNUSED(src_strd);
 
     left_avail = ngbr_avail & 0x11;
     left_avail1 = ngbr_avail & 1;
diff --git a/common/ih264_defs.h b/common/ih264_defs.h
index 6bf74d1..b26a5a4 100644
--- a/common/ih264_defs.h
+++ b/common/ih264_defs.h
@@ -270,6 +270,9 @@ typedef enum
     P8x8        = 6,
     PSKIP       = 7,
     IPCM        = 8,
+    B16x16      = 9,
+    BSKIP       = 10,
+    BDIRECT     = 11,
     MAX_MBTYPES,
 }MBTYPES_T;
 
diff --git a/common/ih264_itrans_recon.h b/common/ih264_itrans_recon.h
deleted file mode 100644
index fd1f239..0000000
--- a/common/ih264_itrans_recon.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/******************************************************************************
- *
- * Copyright (C) 2015 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- *****************************************************************************
- * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
-*/
-/**
-*******************************************************************************
-* @file
-*  ih264_itrans_recon.h
-*
-* @brief
-*  Contains function declarations for inverse transform  and reconstruction of
-*  the quantized macro blocks
-*
-* @author
-*  Ittiam
-*
-* @par List of Functions:
-*  - ih264_itrans_recon_ft
-*  - ih264_itrans_recon_4x4
-*  - ih264_itrans_recon_8x8
-*  - ih264_itrans_recon_4x4_a9
-*
-*
-* @remarks
-*  None
-*
-*******************************************************************************
-*/
-
-#ifndef IH264_ITRANS_RECON_H_
-#define IH264_ITRANS_RECON_H_
-
-/*****************************************************************************/
-/* Extern Function Declarations                                              */
-/*****************************************************************************/
-
-typedef void ih264_itrans_recon_ft(WORD16 *pi2_src,
-                                   UWORD8 *pu1_pred,
-                                   UWORD8 *pu1_recon,
-                                   WORD32 src_strd,
-                                   WORD32 pred_strd,
-                                   WORD32 dst_strd,
-                                   UWORD32 q_lev,
-                                   WORD32 *pi4_tmp);
-
-/*C declarations*/
-
-ih264_itrans_recon_ft ih264_itrans_recon_4x4;
-
-ih264_itrans_recon_ft ih264_itrans_recon_8x8;
-
-/*A9 declarations */
-
-ih264_itrans_recon_ft ih264_itrans_recon_4x4_a9;
-
-#endif /* IH264_ITRANS_RECON_H_ */
diff --git a/common/ih264_structs.h b/common/ih264_structs.h
index fa4e142..0a7c940 100644
--- a/common/ih264_structs.h
+++ b/common/ih264_structs.h
@@ -1353,6 +1353,11 @@ typedef struct
      */
     UWORD8  u1_ref_idx_reordering_flag_l0;
 
+    /*
+     * ref_pic_list_reordering_flag_l1
+     */
+    UWORD8  u1_ref_idx_reordering_flag_l1;
+
     /**
      *  Reference prediction list modification
      */
@@ -1369,11 +1374,6 @@ typedef struct
     ref_list_t as_ref_pic_list1[MAX_DPB_SIZE];
 
     /*
-     *  weighted_bipred_idc
-     */
-    WORD8   u1_weighted_bipred_idc;
-
-    /*
      * no_output_of_prior_pics_flag
      */
     UWORD8   u1_no_output_of_prior_pics_flag;
diff --git a/common/ithread.c b/common/ithread.c
index f7335d9..d19bdec 100644
--- a/common/ithread.c
+++ b/common/ithread.c
@@ -38,12 +38,6 @@
 #include <string.h>
 #include "ih264_typedefs.h"
 
-/*
- * If the end target is bare metal, then there shall be no OS.
- * In this case, the functions ithread_* used inside the h264 encoder library to assist multicore
- * will not longer be functional. To resolve link issues, the functions are re-defined with no body.
- */
-#ifndef BAREMETAL
 
 
 #include "ithread.h"
@@ -52,7 +46,6 @@
 
 #define UNUSED(x) ((void)(x))
 
-#ifndef X86_MSVC
 //#define PTHREAD_AFFINITY
 //#define SYSCALL_AFFINITY
 
@@ -69,270 +62,6 @@
 #include <sys/prctl.h>
 #endif
 
-#endif
-
-#if defined(X86_MSVC) || defined (X86_MINGW)
-
-#include <windows.h>
-#define SEM_MAX_COUNT       100
-#define SEM_INCREMENT_COUNT 1
-
-UWORD32 ithread_get_handle_size(void)
-{
-    return (sizeof(HANDLE));
-}
-
-UWORD32 ithread_get_mutex_lock_size(void)
-{
-    return (sizeof(HANDLE));
-}
-
-WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument)
-{
-    HANDLE *ppv_thread_handle;
-    HANDLE thread_handle_value;
-
-    UNUSED(attribute);
-
-    if(0 == thread_handle)
-        return -1;
-
-    ppv_thread_handle = (HANDLE *)thread_handle;
-    thread_handle_value = (void *)CreateThread
-            (NULL,                             /* Attributes      */
-            1024*128,                          /* Stack i4_size      */
-            (LPTHREAD_START_ROUTINE)strt,      /* Thread function */
-            argument,                          /* Parameters      */
-            0,                                 /* Creation flags  */
-            NULL);                             /* Thread ID       */
-    *ppv_thread_handle = (HANDLE)thread_handle_value;
-
-    return 0;
-}
-
-WORD32 ithread_join(void *thread_handle, void ** val_ptr)
-{
-    HANDLE *ppv_thread_handle;
-    HANDLE thread_handle_value;
-
-    UNUSED(val_ptr);
-
-    if(0 == thread_handle)
-        return -1;
-
-    ppv_thread_handle = (HANDLE *)thread_handle;
-    thread_handle_value = *ppv_thread_handle;
-
-    if(WAIT_OBJECT_0 == WaitForSingleObject(thread_handle_value, INFINITE))
-    {
-        CloseHandle(thread_handle_value);
-    }
-
-    return 0;
-}
-
-void ithread_exit(void *thread_handle)
-{
-    HANDLE *ppv_thread_handle;
-    HANDLE thread_handle_value;
-    DWORD thread_exit_code;
-
-    if(0 == thread_handle)
-        return;
-
-    ppv_thread_handle = (HANDLE *)thread_handle;
-    thread_handle_value = *ppv_thread_handle;
-    /* Get exit code for thread. If the return value is 0, means thread is busy */
-    if( 0 != GetExitCodeThread(thread_handle_value, &thread_exit_code))
-    {
-        TerminateThread(thread_handle_value, thread_exit_code);
-    }
-
-    return;
-}
-
-WORD32 ithread_get_mutex_struct_size(void)
-{
-    return (sizeof(HANDLE));
-}
-
-WORD32 ithread_mutex_init(void *mutex)
-{
-    HANDLE *ppv_mutex_handle;
-    HANDLE mutex_handle_value;
-
-    if(0 == mutex)
-        return -1;
-
-    ppv_mutex_handle = (HANDLE *)mutex;
-    mutex_handle_value = CreateSemaphore(NULL, 1, 1, NULL);
-    *ppv_mutex_handle = mutex_handle_value;
-    return 0;
-}
-
-WORD32 ithread_mutex_destroy(void *mutex)
-{
-    HANDLE *ppv_mutex_handle;
-    HANDLE mutex_handle_value;
-
-    if(0 == mutex)
-        return -1;
-
-    ppv_mutex_handle = (HANDLE *)mutex;
-    mutex_handle_value = *ppv_mutex_handle;
-    CloseHandle(mutex_handle_value);
-    return 0;
-}
-
-WORD32 ithread_mutex_lock(void *mutex)
-{
-    HANDLE *ppv_mutex_handle;
-    HANDLE mutex_handle_value;
-    DWORD  result = 0;
-
-    if(0 == mutex)
-        return -1;
-
-    ppv_mutex_handle = (HANDLE *)mutex;
-    mutex_handle_value = *ppv_mutex_handle;
-    result = WaitForSingleObject(mutex_handle_value, INFINITE);
-
-    if(WAIT_OBJECT_0 == result)
-        return 0;
-
-    return 1;
-
-}
-
-WORD32 ithread_mutex_unlock(void *mutex)
-{
-    HANDLE *ppv_mutex_handle;
-    HANDLE mutex_handle_value;
-    DWORD  result = 0;
-
-    if(0 == mutex)
-        return -1;
-
-    ppv_mutex_handle = (HANDLE *)mutex;
-    mutex_handle_value = *ppv_mutex_handle;
-    result = ReleaseSemaphore(mutex_handle_value, 1, NULL);
-
-    if(0 == result)
-        return -1;
-
-    return 0;
-}
-
-void ithread_yield(void) { }
-
-void ithread_usleep(UWORD32 u4_time_us)
-{
-    UWORD32 u4_time_ms = u4_time_us / 1000;
-    Sleep(u4_time_ms);
-}
-
-void ithread_msleep(UWORD32 u4_time_ms)
-{
-    Sleep(u4_time_ms);
-}
-
-void ithread_sleep(UWORD32 u4_time)
-{
-    UWORD32 u4_time_ms = u4_time * 1000;
-    Sleep(u4_time_ms);
-}
-
-UWORD32 ithread_get_sem_struct_size(void)
-{
-    return (sizeof(HANDLE));
-}
-
-WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value)
-{
-    HANDLE *sem_handle = (HANDLE *)sem;
-    HANDLE sem_handle_value;
-
-    if(0 == sem)
-        return -1;
-
-    sem_handle_value = CreateSemaphore(NULL,  /* Security Attribute*/
-                         value,  /* Initial count     */
-                        SEM_MAX_COUNT,/* Max value         */
-                        NULL);        /* Name, not used    */
-    *sem_handle = sem_handle_value;
-    return 0;
-}
-
-WORD32 ithread_sem_post(void *sem)
-{
-    HANDLE *sem_handle = (HANDLE *)sem;
-    HANDLE sem_handle_value;
-
-    if(0 == sem)
-        return -1;
-
-    sem_handle_value = *sem_handle;
-
-    /* Post on Semaphore by releasing the lock on mutex */
-    if(ReleaseSemaphore(sem_handle_value, SEM_INCREMENT_COUNT, NULL))
-        return 0;
-
-    return -1;
-}
-
-WORD32 ithread_sem_wait(void *sem)
-{
-    DWORD          result = 0;
-    HANDLE *sem_handle = (HANDLE *)sem;
-    HANDLE sem_handle_value;
-
-    if(0 == sem)
-        return -1;
-
-    sem_handle_value = *sem_handle;
-
-    /* Wait on Semaphore object infinitly */
-    result = WaitForSingleObject(sem_handle_value, INFINITE);
-
-    /* If lock on semaphore is acquired, return SUCCESS */
-    if(WAIT_OBJECT_0 == result)
-        return 0;
-
-    /* If call timeouts, return FAILURE */
-    if(WAIT_TIMEOUT == result)
-        return -1;
-
-    return 0;
-}
-
-WORD32 ithread_sem_destroy(void *sem)
-{
-    HANDLE *sem_handle = (HANDLE *)sem;
-    HANDLE sem_handle_value;
-
-    if(0 == sem)
-        return -1;
-
-    sem_handle_value = *sem_handle;
-
-    if(FALSE == CloseHandle(sem_handle_value) )
-    {
-        return -1;
-    }
-    return 0;
-}
-
-WORD32 ithread_set_affinity(WORD32 core_id)
-{
-        return 1;
-}
-
-void ithread_set_name(CHAR *pc_thread_name)
-{
-    return;
-}
-
-#else
 
 UWORD32 ithread_get_handle_size(void)
 {
@@ -358,11 +87,6 @@ WORD32 ithread_join(void *thread_handle, void ** val_ptr)
     return pthread_join(*pthread_handle, NULL);
 }
 
-void ithread_exit(void *val_ptr)
-{
-    return pthread_exit(val_ptr);
-}
-
 WORD32 ithread_get_mutex_struct_size(void)
 {
     return(sizeof(pthread_mutex_t));
@@ -485,125 +209,3 @@ WORD32 ithread_set_affinity(WORD32 core_id)
     return 1;
 
 }
-#endif
-
-#else
-
-UWORD32 ithread_get_handle_size(void)
-{
-    return sizeof(int);
-}
-
-UWORD32 ithread_get_mutex_lock_size(void)
-{
-    return sizeof(int);
-}
-
-UWORD32 ithread_get_cond_size(void)
-{
-    return(sizeof(int));
-}
-WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument)
-{
-    return 0;
-}
-
-WORD32 ithread_join(void *thread_handle, void ** val_ptr)
-{
-    return 0;
-}
-
-void ithread_exit(void *val_ptr)
-{
-    return;
-}
-
-WORD32 ithread_mutex_init(void *mutex)
-{
-    return 0;
-}
-
-WORD32 ithread_mutex_destroy(void *mutex)
-{
-    return 0;
-}
-
-WORD32 ithread_mutex_lock(void *mutex)
-{
-    return 0;
-}
-
-WORD32 ithread_mutex_unlock(void *mutex)
-{
-    return 0;
-}
-
-void ithread_yield(void)
-{
-    return;
-}
-
-void ithread_sleep(UWORD32 u4_time_in_us)
-{
-    return;
-}
-
-void ithread_usleep(UWORD32 u4_time_us)
-{
-    return;
-}
-
-UWORD32 ithread_get_sem_strcut_size(void)
-{
-    return(sizeof(int));
-}
-
-
-WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value)
-{
-    return 0;
-}
-
-WORD32 ithread_sem_post(void *sem)
-{
-    return 0;
-}
-
-
-WORD32 ithread_sem_wait(void *sem)
-{
-    return 0;
-}
-
-WORD32 ithread_sem_destroy(void *sem)
-{
-    return 0;
-}
-
-void ithread_set_name(UWORD8 *pu1_thread_name)
-{
-    return;
-}
-
-void ithread_condition_init(void *condition)
-{
-    return;
-}
-
-void ithread_condition_signal(void * condition)
-{
-    return;
-}
-
-
-
-void ithread_condition_wait(void *condition,void *mutex)
-{
-    return;
-}
-
-WORD32 ithread_set_affinity(WORD32 core_id)
-{
-    return 1;
-}
-#endif
diff --git a/common/ithread.h b/common/ithread.h
index f926f83..3e5aa9c 100644
--- a/common/ithread.h
+++ b/common/ithread.h
@@ -29,7 +29,6 @@
 /*  List of Functions :     ithread_get_handle_size                          */
 /*                          ithread_get_mutex_lock_size                      */
 /*                          ithread_create                                   */
-/*                          ithread_exit                                     */
 /*                          ithread_join                                     */
 /*                          ithread_get_mutex_struct_size                    */
 /*                          ithread_mutex_init                               */
@@ -65,8 +64,6 @@ UWORD32 ithread_get_mutex_lock_size(void);
 
 WORD32  ithread_create(void *thread_handle, void *attribute, void *strt, void *argument);
 
-void    ithread_exit(void *val_ptr);
-
 WORD32  ithread_join(void *thread_id, void ** val_ptr);
 
 WORD32  ithread_get_mutex_struct_size(void);
diff --git a/common/x86/ih264_chroma_intra_pred_filters_ssse3.c b/common/x86/ih264_chroma_intra_pred_filters_ssse3.c
index 45101a4..d43ce20 100644
--- a/common/x86/ih264_chroma_intra_pred_filters_ssse3.c
+++ b/common/x86/ih264_chroma_intra_pred_filters_ssse3.c
@@ -103,47 +103,35 @@ void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
     UWORD8 *pu1_left; /* Pointer to start of top predictors */
     WORD32 dst_strd2;
 
-    __m128i left_16x8b, left_sh_16x8b;
     __m128i row1_16x8b, row2_16x8b;
-    __m128i const_14_15_16x8b;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
     pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;
 
-    left_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 14));
-
-    const_14_15_16x8b = _mm_set1_epi16(0x0f0e);
 
     dst_strd2 = dst_strd << 1;
-    left_sh_16x8b = _mm_slli_si128(left_16x8b, 2);
-    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
-    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left)));
+    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 2)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
 
-    left_16x8b = _mm_slli_si128(left_16x8b, 4);
-    left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
     pu1_dst += dst_strd2;
-    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
-    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 4)));
+    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 6)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
 
-    left_16x8b = _mm_slli_si128(left_16x8b, 4);
-    left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
     pu1_dst += dst_strd2;
-    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
-    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 8)));
+    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 10)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
 
-    left_16x8b = _mm_slli_si128(left_16x8b, 4);
-    left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
     pu1_dst += dst_strd2;
-    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
-    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 12)));
+    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 14)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
 }
@@ -273,7 +261,6 @@ void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
     //calculating a, b and c
     {
         WORD32 h_u, h_v, v_u, v_v;
-        WORD32 temp1, temp2;
 
         __m128i h_val1_16x8b, h_val2_16x8b;
         __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
@@ -302,13 +289,10 @@ void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
         h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
         v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);
 
-        temp1 = _mm_extract_epi16(h_val1_16x8b, 3);
-        temp2 = _mm_extract_epi16(v_val1_16x8b, 3);
-
         hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);
 
-        a_u = ((temp1 & 0xff) + (temp2 & 0xff)) << 4;
-        a_v = ((temp1 >> 8) + (temp2 >> 8)) << 4;
+        a_u = (pu1_left[7 * (-2)] + pu1_top[14]) << 4;
+        a_v = (pu1_left[7 * (-2) + 1] + pu1_top[15]) << 4;
 
         h_u = _mm_extract_epi16(hv_val_4x32b, 0);
         h_v = _mm_extract_epi16(hv_val_4x32b, 2);
diff --git a/common/x86/ih264_inter_pred_filters_ssse3.c b/common/x86/ih264_inter_pred_filters_ssse3.c
index 6d318c9..480a8c7 100644
--- a/common/x86/ih264_inter_pred_filters_ssse3.c
+++ b/common/x86/ih264_inter_pred_filters_ssse3.c
@@ -111,23 +111,12 @@ void ih264_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
 
     if(wd == 4)
     {
-        __m128i mask_full_128b, mask_low_32b;
-
-        mask_full_128b = _mm_set1_epi8(0xff);
-        mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-        // mask for first four bytes
-
         do
         {
-            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
-            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
-            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2));
-            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3));
-
-            _mm_maskmoveu_si128(y_0_16x8b, mask_low_32b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y_1_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-            _mm_maskmoveu_si128(y_2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-            _mm_maskmoveu_si128(y_3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+            *((WORD32 *)(pu1_dst)) =  *((WORD32 *)(pu1_src));
+            *((WORD32 *)(pu1_dst + dst_strd)) = *((WORD32 *)(pu1_src + src_strd));
+            *((WORD32 *)(pu1_dst + dst_strd2)) = *((WORD32 *)(pu1_src + src_strd2));
+            *((WORD32 *)(pu1_dst + dst_strd3)) = *((WORD32 *)(pu1_src + src_strd3));
 
             ht -= 4;
             pu1_src += src_strd4;
@@ -255,11 +244,6 @@ void ih264_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
         __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
         __m128i res_r0r1_16x8b;
 
-        __m128i mask_full_16x8b, mask_low32b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_low32b = _mm_srli_si128(mask_full_16x8b, 12); // mask for first four bytes
-
         //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
         //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
 
@@ -307,9 +291,9 @@ void ih264_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
 
             res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);
 
-            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
             res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
-            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
 
             ht -= 2;
             pu1_src += src_strd << 1;
@@ -525,10 +509,6 @@ void ih264_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
 
     if(wd == 4)
     {
-        __m128i mask_low32b;
-
-        mask_low32b = _mm_set1_epi8(0xff);
-
         //Epilogue: Load all the pred rows except sixth and seventh row
         //          for the first and second row processing.
         src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
@@ -542,8 +522,6 @@ void ih264_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
         src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
         pu1_src += src_strd;
 
-        mask_low32b = _mm_srli_si128(mask_low32b, 12); // mask for first four bytes
-
         src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
         src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
         src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
@@ -572,9 +550,9 @@ void ih264_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
             res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
             res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
 
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
             res_16x8b = _mm_srli_si128(res_16x8b, 4);
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
 
             src_r0_16x8b = src_r2_16x8b;
             src_r1_16x8b = src_r3_16x8b;
@@ -893,15 +871,12 @@ void ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3(UWORD8 *pu1_src,
             __m128i res_8x16b, res_16x8b;
 
             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
-            __m128i const_val512_4x32b, mask_low32b;
-
-            mask_low32b = _mm_set1_epi8(0xff);
+            __m128i const_val512_4x32b;
 
             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
             coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
 
-            mask_low32b = _mm_srli_si128(mask_low32b, 12);
             const_val512_4x32b = _mm_set1_epi32(512);
 
             src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp));
@@ -947,9 +922,9 @@ void ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3(UWORD8 *pu1_src,
                 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
 
-                _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+                *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
                 res_16x8b = _mm_srli_si128(res_16x8b, 4);
-                _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+                *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
 
                 src_r0_8x16b = src_r2_8x16b;
                 src_r1_8x16b = src_r3_8x16b;
@@ -1196,8 +1171,6 @@ void ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3(UWORD8 *pu1_src,
 
         // Horizontal 6-tap filtering
         {
-            ht_tmp = ht + 5;
-
             __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
             __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
 
@@ -1206,6 +1179,8 @@ void ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3(UWORD8 *pu1_src,
 
             __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
 
+            ht_tmp = ht + 5;
+
             coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
             coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
             coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
@@ -1551,11 +1526,6 @@ void ih264_inter_pred_luma_horz_qpel_ssse3(UWORD8 *pu1_src,
         __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
         __m128i res_r0r1_16x8b;
 
-        __m128i mask_full_16x8b, mask_low32b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_low32b = _mm_srli_si128(mask_full_16x8b, 12); // mask for first four bytes
-
         //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
         //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
 
@@ -1607,9 +1577,9 @@ void ih264_inter_pred_luma_horz_qpel_ssse3(UWORD8 *pu1_src,
             res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);
             res_r0r1_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_r0r1_16x8b);              //computing q-pel
 
-            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
             res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
-            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
 
             ht -= 2;
             pu1_src += src_strd << 1;
@@ -1849,10 +1819,6 @@ void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
 
     if(wd == 4)
     {
-        __m128i mask_low32b;
-
-        mask_low32b = _mm_set1_epi8(0xff);
-
         //Epilogue: Load all the pred rows except sixth and seventh row
         //          for the first and second row processing.
         src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
@@ -1866,8 +1832,6 @@ void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
         src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
         pu1_src += src_strd;
 
-        mask_low32b = _mm_srli_si128(mask_low32b, 12); // mask for first four bytes
-
         src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
         src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
         src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
@@ -1904,9 +1868,9 @@ void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
 
             res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
 
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
             res_16x8b = _mm_srli_si128(res_16x8b, 4);
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
 
             src_r0_16x8b = src_r2_16x8b;
             src_r1_16x8b = src_r3_16x8b;
@@ -2157,6 +2121,9 @@ void ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3(UWORD8 *pu1_src,
     UWORD8 *pu1_tmp1, *pu1_tmp2;
     WORD32 x_offset, y_offset;
 
+    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+    __m128i const_val16_8x16b;
+
     pu1_tmp1 = pu1_tmp;
 
     dydx &= 0xf;
@@ -2169,9 +2136,6 @@ void ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3(UWORD8 *pu1_src,
     pu1_pred_horiz = pu1_src + (y_offset >> 1) * src_strd - 2;
     //the filter input starts from x[-2] (till x[3])
 
-    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
-    __m128i const_val16_8x16b;
-
     coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
     coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
     coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
@@ -2257,11 +2221,6 @@ void ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3(UWORD8 *pu1_src,
             __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
             __m128i res_r0r1_16x8b;
 
-            __m128i mask_low32b;
-
-            mask_low32b = _mm_set1_epi8(0xff);
-            mask_low32b = _mm_srli_si128(mask_low32b, 12);
-
             //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
             //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
 
@@ -2313,9 +2272,9 @@ void ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3(UWORD8 *pu1_src,
 
                 res_r0r1_16x8b = _mm_avg_epu8(res_r0r1_16x8b,src_r0r1_vpel_16x8b);
 
-                _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);
+                *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
                 res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
-                _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+                *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
 
                 ht -= 2;
                 pu1_pred_horiz += src_strd << 1;
@@ -2852,16 +2811,11 @@ void ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3(UWORD8 *pu1_src,
 
             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
             __m128i const_val512_4x32b, const_val16_8x16b;
-            __m128i mask_low32b;
-
-            mask_low32b = _mm_set1_epi8(0xff);
 
             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
             coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
 
-            mask_low32b = _mm_srli_si128(mask_low32b, 12);
-
             const_val512_4x32b = _mm_set1_epi32(512);
             const_val16_8x16b = _mm_set1_epi16(16);
 
@@ -2897,7 +2851,7 @@ void ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3(UWORD8 *pu1_src,
 
                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
 
-                _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+                *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
 
                 ht--;
                 pi2_temp2 = pi2_temp2 + 4 + 5;
@@ -3424,12 +3378,9 @@ void ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3(UWORD8 *pu1_src,
 
             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
             __m128i const_val512_4x32b, const_val16_8x16b;
-            __m128i mask_low32b;
 
-            mask_low32b = _mm_set1_epi8(0xff);
             const_val512_4x32b = _mm_set1_epi32(512);
             const_val16_8x16b = _mm_set1_epi16(16);
-            mask_low32b = _mm_srli_si128(mask_low32b, 12);
 
             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
@@ -3483,9 +3434,9 @@ void ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3(UWORD8 *pu1_src,
 
                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
 
-                _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst));
+                *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
                 res_16x8b = _mm_srli_si128(res_16x8b, 4);
-                _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst + dst_strd));
+                *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
 
                 src_r0_8x16b = src_r2_8x16b;
                 src_r1_8x16b = src_r3_8x16b;
@@ -4106,65 +4057,6 @@ void ih264_inter_pred_chroma_ssse3(UWORD8 *pu1_src,
         }
         while(ht > 0);
 
-        /*
-        WORD32 AB, CD;
-
-        __m128i src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
-        __m128i src_r1r2_16x8b, src_r2r3_16x8b;
-        __m128i res_AB_8x16b, res_CD_8x16b, res_8x16b, res_16x8b;
-        __m128i mask_low32b;
-
-        __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
-        __m128i const_shuff_16x8b;
-
-        AB = (B << 8) + A;
-        CD = (D << 8) + C;
-
-        coeffAB_16x8b = _mm_set1_epi16(AB);
-        coeffCD_16x8b = _mm_set1_epi16(CD);
-
-        round_add32_8x16b = _mm_set1_epi16(32);
-
-        mask_low32b = _mm_set1_epi8(0xff);
-        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);                       //u1[0] v1[0] u1[1] v1[1] u1[2] v1[2] u1[3] v1[3]
-        pu1_src += src_strd;
-
-        const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x0b090a08, 0x0d0b0c0a);
-        mask_low32b = _mm_srli_si128(mask_low32b, 12);
-
-        do
-        {
-            src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);                   //u2[0] v2[0] u2[1] v2[1] u1[2] v2[2] u2[3] v2[3]
-            src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));      //u3[0] v3[0] u3[1] v3[1] u3[2] v3[2] u3[3] v3[3]
-
-            src_r1r2_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
-            src_r2r3_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
-
-            src_r1r2_16x8b = _mm_shuffle_epi8(src_r1r2_16x8b, const_shuff_16x8b); //u1[0] u1[1] v1[0] v1[1] u1[1] u1[2] v1[1] v1[2]
-                                                                                  //u2[0] u2[1] v2[0] v2[1] u2[1] u2[2] v2[1] v2[2]
-            src_r2r3_16x8b = _mm_shuffle_epi8(src_r2r3_16x8b, const_shuff_16x8b); //u2[0] u2[1] v2[0] v2[1] u2[1] u2[2] v2[1] v2[2]
-                                                                                  //u3[0] u3[1] v3[0] v3[1] u3[1] u3[2] v3[1] v3[2]
-            res_AB_8x16b = _mm_maddubs_epi16(src_r1r2_16x8b, coeffAB_16x8b);
-            res_CD_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeffCD_16x8b);
-
-            res_8x16b = _mm_add_epi16(res_AB_8x16b, round_add32_8x16b);
-            res_8x16b = _mm_add_epi16(res_8x16b, res_CD_8x16b);
-            res_8x16b = _mm_srai_epi16(res_8x16b, 6);
-            res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
-
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)pu1_dst);
-
-            ht -= 2;
-            pu1_src += src_strd << 1;
-            res_16x8b = _mm_srli_si128(res_16x8b, 4);
-            src_r1_16x8b = src_r3_16x8b;
-
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst + dst_strd));
-
-            pu1_dst += dst_strd << 1;
-        }
-        while(ht > 0);
-        */
     }
     else if(wd == 4)
     {
diff --git a/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
index 565cc75..bcfe503 100644
--- a/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
+++ b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
@@ -30,8 +30,8 @@
  *  Mohit [100664]
  *
  * @par List of Functions:
- *  - ihevc_iquant_itrans_recon_4x4_dc_ssse3()
- *  - ihevc_iquant_itrans_recon_8x8_dc_ssse3()
+ *  - ih264_iquant_itrans_recon_4x4_dc_ssse3()
+ *  - ih264_iquant_itrans_recon_8x8_dc_ssse3()
  *
  * @remarks
  *  None
@@ -113,6 +113,13 @@ void ih264_iquant_itrans_recon_4x4_dc_ssse3(WORD16 *pi2_src,
     UWORD32 *pu4_out = (UWORD32 *)pu1_out;
     WORD32 q0 = pi2_src[0];
     WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
+
+    __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3;
+    __m128i sign_reg;
+    __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
+    __m128i temp4, temp5, temp6, temp7;
+    __m128i value_add;
+
     UNUSED (pi2_tmp);
 
     INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
@@ -122,11 +129,7 @@ void ih264_iquant_itrans_recon_4x4_dc_ssse3(WORD16 *pi2_src,
 
     i_macro = ((q0 + 32) >> 6);
 
-    __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3;
-    __m128i sign_reg;
-    __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
-    __m128i temp4, temp5, temp6, temp7;
-    __m128i value_add = _mm_set1_epi16(i_macro);
+    value_add = _mm_set1_epi16(i_macro);
 
     zero_8x16b = _mm_setzero_si128();                  // all bits reset to zero
     //Load pred buffer
@@ -235,6 +238,13 @@ void ih264_iquant_itrans_recon_8x8_dc_ssse3 (WORD16 *pi2_src,
 {
     WORD32 q0 = pi2_src[0];
     WORD16 i_macro, rnd_fact = (qp_div < 6) ? 1 << (5 - qp_div) : 0;
+
+    __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3,pred_r4,pred_r5,pred_r6,pred_r7;
+    __m128i sign_reg;
+    __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
+    __m128i temp1,temp2,temp3,temp4, temp5, temp6, temp7,temp8;
+    __m128i value_add;
+
     UNUSED (pi2_tmp);
     UNUSED (iq_start_idx);
     UNUSED (pi2_dc_ld_addr);
@@ -242,11 +252,7 @@ void ih264_iquant_itrans_recon_8x8_dc_ssse3 (WORD16 *pi2_src,
     INV_QUANT(q0, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);
     i_macro = ((q0 + 32) >> 6);
 
-    __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3,pred_r4,pred_r5,pred_r6,pred_r7;
-    __m128i sign_reg;
-    __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
-    __m128i temp1,temp2,temp3,temp4, temp5, temp6, temp7,temp8;
-    __m128i value_add = _mm_set1_epi16(i_macro);
+    value_add = _mm_set1_epi16(i_macro);
 
     //Load pred buffer row 0
     predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[0])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
@@ -397,6 +403,7 @@ void ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3(WORD16 *pi2_src,
     __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
     __m128i chroma_mask = _mm_set1_epi16 (0xFF);
     __m128i value_add = _mm_set1_epi16(i_macro);
+    __m128i out_r0, out_r1, out_r2, out_r3;
 
     UNUSED (pi2_src);
     UNUSED (pu2_iscal_mat);
@@ -438,12 +445,26 @@ void ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3(WORD16 *pi2_src,
     pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b); //p20 p21 p22 p23 -- all 16 bits
     pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b); //p30 p31 p32 p33 -- all 16 bits
 
-    chroma_mask = _mm_unpacklo_epi64(chroma_mask, zero_8x16b);  //1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0  -- 8 bits
-
-    _mm_maskmoveu_si128(pred_r0, chroma_mask, (char *)(&pu1_out[0]));
-    _mm_maskmoveu_si128(pred_r1, chroma_mask, (char *)(&pu1_out[out_strd]));
-    _mm_maskmoveu_si128(pred_r2, chroma_mask, (char *)(&pu1_out[2*out_strd]));
-    _mm_maskmoveu_si128(pred_r3, chroma_mask, (char *)(&pu1_out[3*out_strd]));
+    chroma_mask = _mm_set1_epi16 (0xFF00);
+    out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
+    out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[out_strd]));
+    out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * out_strd]));
+    out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * out_strd]));
+
+    out_r0 = _mm_and_si128(out_r0, chroma_mask);
+    out_r1 = _mm_and_si128(out_r1, chroma_mask);
+    out_r2 = _mm_and_si128(out_r2, chroma_mask);
+    out_r3 = _mm_and_si128(out_r3, chroma_mask);
+
+    out_r0 = _mm_add_epi8(out_r0, pred_r0);
+    out_r1 = _mm_add_epi8(out_r1, pred_r1);
+    out_r2 = _mm_add_epi8(out_r2, pred_r2);
+    out_r3 = _mm_add_epi8(out_r3, pred_r3);
+
+    _mm_storel_epi64((__m128i *)(&pu1_out[0]), out_r0);
+    _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), out_r1);
+    _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), out_r2);
+    _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), out_r3);
 }
 
 
diff --git a/common/x86/ih264_iquant_itrans_recon_sse42.c b/common/x86/ih264_iquant_itrans_recon_sse42.c
index 6399b65..f27111f 100644
--- a/common/x86/ih264_iquant_itrans_recon_sse42.c
+++ b/common/x86/ih264_iquant_itrans_recon_sse42.c
@@ -30,8 +30,8 @@
  *  Mohit [100664]
  *
  * @par List of Functions:
- *  - ihevc_iquant_itrans_recon_4x4_sse42()
- *  - ihevc_iquant_itrans_recon_chroma_4x4_sse42()
+ *  - ih264_iquant_itrans_recon_4x4_sse42()
+ *  - ih264_iquant_itrans_recon_chroma_4x4_sse42()
  *
  * @remarks
  *  None
@@ -370,6 +370,7 @@ void ih264_iquant_itrans_recon_chroma_4x4_sse42(WORD16 *pi2_src,
     __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
     __m128i value_32 = _mm_set1_epi32(32);
     __m128i chroma_mask = _mm_set1_epi16 (0xFF);
+    __m128i out_r0, out_r1, out_r2, out_r3;
     UNUSED (pi2_tmp);
 
     /*************************************************************/
@@ -548,10 +549,24 @@ void ih264_iquant_itrans_recon_chroma_4x4_sse42(WORD16 *pi2_src,
     resq_r2 = _mm_cvtepu8_epi16(resq_r2); //p20 p21 p22 p23 -- all 16 bits
     resq_r3 = _mm_cvtepu8_epi16(resq_r3); //p30 p31 p32 p33 -- all 16 bits
 
-    chroma_mask = _mm_unpacklo_epi64(chroma_mask, zero_8x16b);
-
-    _mm_maskmoveu_si128(resq_r0, chroma_mask, (char *)(&pu1_out[0]));
-    _mm_maskmoveu_si128(resq_r1, chroma_mask, (char *)(&pu1_out[out_strd]));
-    _mm_maskmoveu_si128(resq_r2, chroma_mask, (char *)(&pu1_out[2*out_strd]));
-    _mm_maskmoveu_si128(resq_r3, chroma_mask, (char *)(&pu1_out[3*out_strd]));
+    chroma_mask = _mm_set1_epi16 (0xFF00);
+    out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
+    out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[out_strd]));
+    out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * out_strd]));
+    out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * out_strd]));
+
+    out_r0 = _mm_and_si128(out_r0, chroma_mask);
+    out_r1 = _mm_and_si128(out_r1, chroma_mask);
+    out_r2 = _mm_and_si128(out_r2, chroma_mask);
+    out_r3 = _mm_and_si128(out_r3, chroma_mask);
+
+    out_r0 = _mm_add_epi8(out_r0, resq_r0);
+    out_r1 = _mm_add_epi8(out_r1, resq_r1);
+    out_r2 = _mm_add_epi8(out_r2, resq_r2);
+    out_r3 = _mm_add_epi8(out_r3, resq_r3);
+
+    _mm_storel_epi64((__m128i *)(&pu1_out[0]), out_r0);
+    _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), out_r1);
+    _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), out_r2);
+    _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), out_r3);
 }
diff --git a/common/x86/ih264_iquant_itrans_recon_ssse3.c b/common/x86/ih264_iquant_itrans_recon_ssse3.c
index 388cafe..30f7e59 100644
--- a/common/x86/ih264_iquant_itrans_recon_ssse3.c
+++ b/common/x86/ih264_iquant_itrans_recon_ssse3.c
@@ -30,8 +30,8 @@
  *  Mohit [100664]
  *
  * @par List of Functions:
- *  - ihevc_iquant_itrans_recon_4x4_ssse3()
- *  - ihevc_iquant_itrans_recon_8x8_ssse3()
+ *  - ih264_iquant_itrans_recon_4x4_ssse3()
+ *  - ih264_iquant_itrans_recon_8x8_ssse3()
  *
  * @remarks
  *  None
diff --git a/common/x86/ih264_luma_intra_pred_filters_ssse3.c b/common/x86/ih264_luma_intra_pred_filters_ssse3.c
index 5a35372..a1721d5 100644
--- a/common/x86/ih264_luma_intra_pred_filters_ssse3.c
+++ b/common/x86/ih264_luma_intra_pred_filters_ssse3.c
@@ -122,28 +122,22 @@ void ih264_intra_pred_luma_4x4_mode_vert_ssse3(UWORD8 *pu1_src,
 {
     UWORD8 *pu1_top;
     WORD32 dst_strd2, dst_strd3;
-
-    __m128i top_16x8b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 i4_top;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
-
     pu1_top = pu1_src + BLK_SIZE + 1;
 
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
-    top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top);
+    i4_top = *((WORD32 *)pu1_top);
 
     dst_strd2 = dst_strd << 1;
     dst_strd3 = dst_strd + dst_strd2;
 
-    _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-    _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd3));
+    *((WORD32 *)(pu1_dst)) = i4_top;
+    *((WORD32 *)(pu1_dst + dst_strd)) = i4_top;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = i4_top;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = i4_top;
 }
 
 /**
@@ -185,39 +179,31 @@ void ih264_intra_pred_luma_4x4_mode_horz_ssse3(UWORD8 *pu1_src,
                                                WORD32 dst_strd,
                                                WORD32 ngbr_avail)
 {
-    UWORD8 *pu1_left;
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    WORD32 row1,row2,row3,row4;
+    UWORD8 val;
     WORD32 dst_strd2, dst_strd3;
-    WORD32 val1, val2;
-
-    __m128i left_16x8b;
-    __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
-    __m128i mask_full_128b, mask_low_32b;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
-
-    mask_full_128b = _mm_set1_epi8(0xff);
-
     pu1_left = pu1_src + BLK_SIZE - 1;
 
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-    left_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3));
-
-    val1 = _mm_extract_epi16(left_16x8b, 1);
-    val2 = _mm_extract_epi16(left_16x8b, 0);
-
-    row1_16x8b = _mm_set1_epi8(val1 >> 8);
-    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
-    row3_16x8b = _mm_set1_epi8(val2 >> 8);
-    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+    val  = *pu1_left;
+    row1 = val + (val << 8) + (val << 16) + (val << 24);
+    val  = *(pu1_left - 1);
+    row2 = val + (val << 8) + (val << 16) + (val << 24);
+    val  = *(pu1_left - 2);
+    row3 = val + (val << 8) + (val << 16) + (val << 24);
+    val  = *(pu1_left - 3);
+    row4 = val + (val << 8) + (val << 16) + (val << 24);
 
     dst_strd2 = dst_strd << 1;
     dst_strd3 = dst_strd + dst_strd2;
 
-    _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-    _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd3));
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /**
@@ -259,72 +245,43 @@ void ih264_intra_pred_luma_4x4_mode_dc_ssse3(UWORD8 *pu1_src,
                                              WORD32 ngbr_avail)
 {
     UWORD8 u1_useleft; /* availability of left predictors (only for DC) */
-    UWORD8 u1_usetop;  /* availability of top predictors (only for DC) */
-    UWORD8 *pu1_left, *pu1_top;
-    WORD32 dc_val, flag;
+    UWORD8 u1_usetop; /* availability of top predictors (only for DC) */
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
     WORD32 dst_strd2, dst_strd3;
-
-    __m128i mask_full_128b, mask_low_32b;
-    __m128i dcval_16x8b;
-
+    WORD32 val = 0;
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
-
-    mask_full_128b = _mm_set1_epi8(0xff);
-
     u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
     u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
-
-    pu1_left = pu1_src + BLK_SIZE - 1;
     pu1_top = pu1_src + BLK_SIZE + 1;
+    pu1_left = pu1_src + BLK_SIZE - 1;
 
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
-    flag = u1_useleft + u1_usetop;
-
-    if(flag)
+    if(u1_useleft)
     {
-        WORD32 shft, ofst = 0;
-
-        __m128i left_16x8b, top_16x8b, val_16x8b, tmp_8x16b, zero_vector;
-
-        if(u1_useleft)
-        {
-            left_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3));
-            ofst += 2;
-        }
-        else
-            left_16x8b = _mm_setzero_si128();
-
-        zero_vector = _mm_setzero_si128();
-
-        if(u1_usetop)
-        {
-            top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top);
-            ofst += 2;
-        }
-        else
-            top_16x8b = _mm_setzero_si128();
-
-        shft = flag + 1;
-        val_16x8b = _mm_unpacklo_epi32(left_16x8b, top_16x8b);
-        tmp_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
-
-        dc_val = _mm_extract_epi16(tmp_8x16b, 0);
-        dc_val = (dc_val + ofst) >> shft;
+        val += *pu1_left--;
+        val += *pu1_left--;
+        val += *pu1_left--;
+        val += *pu1_left + 2;
     }
-    else
-        dc_val = 128;
+    if(u1_usetop)
+    {
+        val += *pu1_top + *(pu1_top + 1) + *(pu1_top + 2) + *(pu1_top + 3)
+                        + 2;
+    }
+    /* 2 is added to val for each available neighbour (left/top), so
+     val == 0 here means neither is available: use the default DC of 128 */
+    val = (val) ? (val >> (1 + u1_useleft + u1_usetop)) : 128;
+
+    val = val + (val << 8) + (val << 16) + (val << 24);
 
     dst_strd2 = dst_strd << 1;
     dst_strd3 = dst_strd + dst_strd2;
 
-    dcval_16x8b = _mm_set1_epi8(dc_val);
-
-    _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd2));
-    _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd3));
+    *((WORD32 *)(pu1_dst)) = val;
+    *((WORD32 *)(pu1_dst + dst_strd)) = val;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = val;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = val;
 }
 
 /**
@@ -371,7 +328,7 @@ void ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3(UWORD8 *pu1_src,
     __m128i top_16x8b, top_8x16b, top_sh_8x16b;
     __m128i res1_8x16b, res2_8x16b, res_16x8b;
     __m128i zero_vector, const_2_8x16b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
@@ -382,13 +339,11 @@ void ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3(UWORD8 *pu1_src,
     zero_vector = _mm_setzero_si128();
     top_8x16b = _mm_unpacklo_epi8(top_16x8b, zero_vector);    //t0 t1 t2 t3 t4 t5 t6 t7
 
-    mask_full_128b = _mm_set1_epi8(0xff);
     top_sh_8x16b = _mm_srli_si128(top_8x16b, 2);              //t1 t2 t3 t4 t5 t6 t7 0
     const_2_8x16b = _mm_set1_epi16(2);
 
     top_sh_8x16b = _mm_shufflehi_epi16(top_sh_8x16b, 0xa4);   //t1 t2 t3 t4 t5 t6 t7 t7
     res1_8x16b = _mm_add_epi16(top_8x16b, top_sh_8x16b);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
     res2_8x16b = _mm_srli_si128(res1_8x16b, 2);
 
     res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b);
@@ -399,13 +354,18 @@ void ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3(UWORD8 *pu1_src,
     dst_strd3 = dst_strd + dst_strd2;
 
     res_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
-    _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)pu1_dst);
+    row1 = _mm_cvtsi128_si32(res_16x8b);
     res_16x8b = _mm_srli_si128(res_16x8b, 1);
-    _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+    row2 = _mm_cvtsi128_si32(res_16x8b);
     res_16x8b = _mm_srli_si128(res_16x8b, 1);
-    _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd2));
+    row3 = _mm_cvtsi128_si32(res_16x8b);
     res_16x8b = _mm_srli_si128(res_16x8b, 1);
-    _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd3));
+    row4 = _mm_cvtsi128_si32(res_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /**
@@ -454,7 +414,7 @@ void ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3(UWORD8 *pu1_src,
     __m128i res1_8x16b, res2_8x16b;
     __m128i res1_16x8b, res2_16x8b;
     __m128i zero_vector, const_2_8x16b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
@@ -468,13 +428,11 @@ void ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3(UWORD8 *pu1_src,
     top_left_8x16b = _mm_unpacklo_epi8(top_left_16x8b, zero_vector);
     top_left_sh_8x16b = _mm_unpacklo_epi8(top_left_sh_16x8b, zero_vector);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
     res1_8x16b = _mm_add_epi16(top_left_8x16b, top_left_sh_8x16b);           //l3+l2 l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3...
     const_2_8x16b = _mm_set1_epi16(2);
     res2_8x16b = _mm_srli_si128(res1_8x16b, 2);                              //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3...
 
     res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
     res1_8x16b = _mm_add_epi16(res2_8x16b, res1_8x16b);                      //l3+2*l2+l1+2 l2+2*l1+l0+2...
     res1_8x16b = _mm_srai_epi16(res1_8x16b, 2);
     res1_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
@@ -483,12 +441,18 @@ void ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3(UWORD8 *pu1_src,
     dst_strd3 = dst_strd + dst_strd2;
 
     res2_16x8b = _mm_srli_si128(res1_16x8b, 3);
-    _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)pu1_dst);
+
+    row1 = _mm_cvtsi128_si32(res2_16x8b);
     res2_16x8b = _mm_srli_si128(res1_16x8b, 2);
-    _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+    row2 = _mm_cvtsi128_si32(res2_16x8b);
     res2_16x8b = _mm_srli_si128(res1_16x8b, 1);
-    _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd2));
-    _mm_maskmoveu_si128(res1_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd3));
+    row3 = _mm_cvtsi128_si32(res2_16x8b);
+    row4 = _mm_cvtsi128_si32(res1_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /**
@@ -537,14 +501,11 @@ void ih264_intra_pred_luma_4x4_mode_vert_r_ssse3(UWORD8 *pu1_src,
     __m128i w121_a1_8x16b, w121_a2_8x16b, w121_sh_8x16b;
     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
     __m128i zero_vector, const_2_8x16b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
     pu1_left = pu1_src + BLK_SIZE - 1;
 
     val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 2));
@@ -575,10 +536,15 @@ void ih264_intra_pred_luma_4x4_mode_vert_r_ssse3(UWORD8 *pu1_src,
     dst_strd2 = dst_strd << 1;
     dst_strd3 = dst_strd + dst_strd2;
 
-    _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd2));
-    _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd3));
+    row1 = _mm_cvtsi128_si32(row1_16x8b);
+    row2 = _mm_cvtsi128_si32(row2_16x8b);
+    row3 = _mm_cvtsi128_si32(row3_16x8b);
+    row4 = _mm_cvtsi128_si32(row4_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /*
@@ -629,14 +595,11 @@ void ih264_intra_pred_luma_4x4_mode_horz_d_ssse3(UWORD8 *pu1_src,
     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
 
     __m128i zero_vector, const_2_8x16b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
     pu1_left = pu1_src + BLK_SIZE - 1;
 
     val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3));
@@ -669,10 +632,15 @@ void ih264_intra_pred_luma_4x4_mode_horz_d_ssse3(UWORD8 *pu1_src,
     row2_16x8b = _mm_srli_si128(row4_16x8b, 4);
     row3_16x8b = _mm_srli_si128(row4_16x8b, 2);
 
-    _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd2));
-    _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd3));
+    row1 = _mm_cvtsi128_si32(row1_16x8b);
+    row2 = _mm_cvtsi128_si32(row2_16x8b);
+    row3 = _mm_cvtsi128_si32(row3_16x8b);
+    row4 = _mm_cvtsi128_si32(row4_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /**
@@ -721,14 +689,11 @@ void ih264_intra_pred_luma_4x4_mode_vert_l_ssse3(UWORD8 *pu1_src,
     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
 
     __m128i zero_vector, const_2_8x16b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
     pu1_top = pu1_src +BLK_SIZE + 1;
 
     val_16x8b = _mm_loadl_epi64((__m128i *)pu1_top);
@@ -756,10 +721,15 @@ void ih264_intra_pred_luma_4x4_mode_vert_l_ssse3(UWORD8 *pu1_src,
     row3_16x8b = _mm_srli_si128(row1_16x8b, 1);
     row4_16x8b = _mm_srli_si128(row2_16x8b, 1);
 
-    _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd2));
-    _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd3));
+    row1 = _mm_cvtsi128_si32(row1_16x8b);
+    row2 = _mm_cvtsi128_si32(row2_16x8b);
+    row3 = _mm_cvtsi128_si32(row3_16x8b);
+    row4 = _mm_cvtsi128_si32(row4_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /**
@@ -809,14 +779,11 @@ void ih264_intra_pred_luma_4x4_mode_horz_u_ssse3(UWORD8 *pu1_src,
     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
 
     __m128i zero_vector, const_2_8x16b, rev_16x8b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
     pu1_left = pu1_src + BLK_SIZE - 1;
 
     zero_vector = _mm_setzero_si128();
@@ -851,10 +818,15 @@ void ih264_intra_pred_luma_4x4_mode_horz_u_ssse3(UWORD8 *pu1_src,
     row3_16x8b = _mm_srli_si128(row1_16x8b, 4);
     row4_16x8b = _mm_srli_si128(row1_16x8b, 6);
 
-    _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd2));
-    _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst  + dst_strd3));
+    row1 = _mm_cvtsi128_si32(row1_16x8b);
+    row2 = _mm_cvtsi128_si32(row2_16x8b);
+    row3 = _mm_cvtsi128_si32(row3_16x8b);
+    row4 = _mm_cvtsi128_si32(row4_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /*******************    8x8 Modes    *******************/
@@ -1814,9 +1786,7 @@ void ih264_intra_pred_luma_16x16_mode_horz_ssse3(UWORD8 *pu1_src,
 {
     UWORD8 *pu1_left;
     WORD32 dst_strd2, dst_strd3, dst_strd4;
-    WORD32 val1, val2;
 
-    __m128i val_16x8b;
     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
 
     UNUSED(src_strd);
@@ -1826,60 +1796,46 @@ void ih264_intra_pred_luma_16x16_mode_horz_ssse3(UWORD8 *pu1_src,
 
     dst_strd4 = dst_strd << 2;
 
-    val_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 15));
-
     dst_strd2 = dst_strd << 1;
     dst_strd3 = dst_strd4 - dst_strd;
 
-    val1 =  _mm_extract_epi16(val_16x8b, 7);
-    val2 =  _mm_extract_epi16(val_16x8b, 6);
-
-    row1_16x8b = _mm_set1_epi8(val1 >> 8);
-    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
-    row3_16x8b = _mm_set1_epi8(val2 >> 8);
-    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+    row1_16x8b = _mm_set1_epi8(*(pu1_left));
+    row2_16x8b = _mm_set1_epi8(*(pu1_left - 1));
+    row3_16x8b = _mm_set1_epi8(*(pu1_left - 2));
+    row4_16x8b = _mm_set1_epi8(*(pu1_left - 3));
 
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
 
-    val1 =  _mm_extract_epi16(val_16x8b, 5);
-    val2 =  _mm_extract_epi16(val_16x8b, 4);
-
     pu1_dst += dst_strd4;
-    row1_16x8b = _mm_set1_epi8(val1 >> 8);
-    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
-    row3_16x8b = _mm_set1_epi8(val2 >> 8);
-    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+    row1_16x8b = _mm_set1_epi8(*(pu1_left - 4));
+    row2_16x8b = _mm_set1_epi8(*(pu1_left - 5));
+    row3_16x8b = _mm_set1_epi8(*(pu1_left - 6));
+    row4_16x8b = _mm_set1_epi8(*(pu1_left - 7));
 
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
 
-    val1 =  _mm_extract_epi16(val_16x8b, 3);
-    val2 =  _mm_extract_epi16(val_16x8b, 2);
-
     pu1_dst += dst_strd4;
-    row1_16x8b = _mm_set1_epi8(val1 >> 8);
-    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
-    row3_16x8b = _mm_set1_epi8(val2 >> 8);
-    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+    row1_16x8b = _mm_set1_epi8(*(pu1_left - 8));
+    row2_16x8b = _mm_set1_epi8(*(pu1_left - 9));
+    row3_16x8b = _mm_set1_epi8(*(pu1_left - 10));
+    row4_16x8b = _mm_set1_epi8(*(pu1_left - 11));
 
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
 
-    val1 =  _mm_extract_epi16(val_16x8b, 1);
-    val2 =  _mm_extract_epi16(val_16x8b, 0);
-
     pu1_dst += dst_strd4;
-    row1_16x8b = _mm_set1_epi8(val1 >> 8);
-    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
-    row3_16x8b = _mm_set1_epi8(val2 >> 8);
-    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+    row1_16x8b = _mm_set1_epi8(*(pu1_left - 12));
+    row2_16x8b = _mm_set1_epi8(*(pu1_left - 13));
+    row3_16x8b = _mm_set1_epi8(*(pu1_left - 14));
+    row4_16x8b = _mm_set1_epi8(*(pu1_left - 15));
 
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
diff --git a/common/x86/ih264_padding_ssse3.c b/common/x86/ih264_padding_ssse3.c
index 6dadd39..43ded8e 100644
--- a/common/x86/ih264_padding_ssse3.c
+++ b/common/x86/ih264_padding_ssse3.c
@@ -97,9 +97,6 @@ void ih264_pad_left_luma_ssse3(UWORD8 *pu1_src,
     WORD32 row;
     WORD32 i;
     UWORD8 *pu1_dst;
-    __m128i const0_16x8b;
-
-    const0_16x8b = _mm_setzero_si128();
 
     ASSERT(pad_size % 8 == 0);
 
@@ -107,9 +104,8 @@ void ih264_pad_left_luma_ssse3(UWORD8 *pu1_src,
     {
         __m128i src_temp0_16x8b;
 
-        src_temp0_16x8b =  _mm_loadu_si128((__m128i *)pu1_src);
         pu1_dst = pu1_src - pad_size;
-        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+        src_temp0_16x8b = _mm_set1_epi8(*pu1_src);
         for(i = 0; i < pad_size; i += 8)
         {
             _mm_storel_epi64((__m128i *)(pu1_dst + i), src_temp0_16x8b);
@@ -168,20 +164,14 @@ void ih264_pad_left_chroma_ssse3(UWORD8 *pu1_src,
     WORD32 row;
     WORD32 col;
     UWORD8 *pu1_dst;
-    __m128i const0_16x8b, const1_16x8b;
-    const0_16x8b = _mm_setzero_si128();
-    const1_16x8b = _mm_set1_epi8(1);
-    const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b);
 
     ASSERT(pad_size % 8 == 0);
     for(row = 0; row < ht; row++)
     {
         __m128i src_temp0_16x8b;
 
-        src_temp0_16x8b =  _mm_loadu_si128((__m128i *)pu1_src);
         pu1_dst = pu1_src - pad_size;
-        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
-
+        src_temp0_16x8b = _mm_set1_epi16(*((UWORD16 *)pu1_src));
         for(col = 0; col < pad_size; col += 8)
         {
             _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
@@ -240,7 +230,6 @@ void ih264_pad_right_luma_ssse3(UWORD8 *pu1_src,
     WORD32 row;
     WORD32 col;
     UWORD8 *pu1_dst;
-    __m128i const0_16x8b;
 
     ASSERT(pad_size % 8 == 0);
 
@@ -248,10 +237,8 @@ void ih264_pad_right_luma_ssse3(UWORD8 *pu1_src,
     {
         __m128i src_temp0_16x8b;
 
-        src_temp0_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src - 1));
-        const0_16x8b = _mm_setzero_si128();
         pu1_dst = pu1_src;
-        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+        src_temp0_16x8b = _mm_set1_epi8(*(pu1_src - 1));
         for(col = 0; col < pad_size; col += 8)
         {
             _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
@@ -310,10 +297,6 @@ void ih264_pad_right_chroma_ssse3(UWORD8 *pu1_src,
     WORD32 row;
     WORD32 col;
     UWORD8 *pu1_dst;
-    __m128i const0_16x8b, const1_16x8b;
-    const0_16x8b = _mm_setzero_si128();
-    const1_16x8b = _mm_set1_epi8(1);
-    const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b);
 
     ASSERT(pad_size % 8 == 0);
 
@@ -321,9 +304,8 @@ void ih264_pad_right_chroma_ssse3(UWORD8 *pu1_src,
     {
         __m128i src_temp0_16x8b;
 
-        src_temp0_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src - 2));
         pu1_dst = pu1_src;
-        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+        src_temp0_16x8b = _mm_set1_epi16(*((UWORD16 *)(pu1_src - 2)));
         for(col = 0; col < pad_size; col += 8)
         {
             _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
diff --git a/common/x86/ih264_weighted_pred_sse42.c b/common/x86/ih264_weighted_pred_sse42.c
index b1684b7..48f1f54 100644
--- a/common/x86/ih264_weighted_pred_sse42.c
+++ b/common/x86/ih264_weighted_pred_sse42.c
@@ -96,12 +96,6 @@ void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
 
     if(wd == 4)
     {
-        __m128i mask_full_16x8b, mask_ll4B_16x8b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
-        // mask for first four bytes
-
         do
         {
             y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
@@ -121,13 +115,10 @@ void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
             y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
             y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
 
-            _mm_maskmoveu_si128(y0_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y0_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
-            _mm_maskmoveu_si128(y0_2_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + (dst_strd << 1)));
-            _mm_maskmoveu_si128(y0_3_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd * 3));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b);
+            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b);
 
             ht -= 4;
             pu1_src1 += src_strd1 << 2;
@@ -268,12 +259,6 @@ void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
 
     if(wd == 2)
     {
-        __m128i mask_full_16x8b, mask_ll4B_16x8b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
-        // mask for first four bytes
-
         do
         {
             uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
@@ -285,9 +270,8 @@ void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
             uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
             uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
 
-            _mm_maskmoveu_si128(uv0_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(uv0_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b);
 
             ht -= 2;
             pu1_src1 += src_strd1 << 1;
@@ -419,12 +403,6 @@ void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
     {
         __m128i y_0_8x16b, y_2_8x16b;
 
-        __m128i mask_full_16x8b, mask_ll4B_16x8b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
-        // mask for first four bytes
-
         do
         {
             y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
@@ -455,13 +433,10 @@ void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
             y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
             y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);
 
-            _mm_maskmoveu_si128(y_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
-            _mm_maskmoveu_si128(y_2_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + (dst_strd << 1)));
-            _mm_maskmoveu_si128(y_3_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd * 3));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
+            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b);
 
             ht -= 4;
             pu1_src += src_strd << 2;
@@ -660,12 +635,6 @@ void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
     {
         __m128i y_0_8x16b;
 
-        __m128i mask_full_16x8b, mask_ll4B_16x8b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
-        // mask for first four bytes
-
         do
         {
             y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
@@ -686,9 +655,8 @@ void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
             y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
             y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
 
-            _mm_maskmoveu_si128(y_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
 
             ht -= 2;
             pu1_src += src_strd << 1;
@@ -890,12 +858,6 @@ void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
         __m128i y1_0_8x16b, y1_2_8x16b;
         __m128i y2_0_8x16b, y2_2_8x16b;
 
-        __m128i mask_ll4B_16x8b;
-
-        mask_ll4B_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_ll4B_16x8b, 12);
-        // mask for first four bytes
-
         do
         {
             y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
@@ -942,13 +904,11 @@ void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
             y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
             y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);
 
-            _mm_maskmoveu_si128(y1_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y1_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
-            _mm_maskmoveu_si128(y1_2_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + (dst_strd << 1)));
-            _mm_maskmoveu_si128(y1_3_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd * 3));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
+            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);
+
 
             ht -= 4;
             pu1_src1 += src_strd1 << 2;
@@ -1187,11 +1147,6 @@ void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
     {
         __m128i y1_0_8x16b, y2_0_8x16b;
 
-        __m128i mask_full_16x8b, mask_ll4B_16x8b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
-
         do
         {
             y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
@@ -1218,9 +1173,8 @@ void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
             y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
             y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
 
-            _mm_maskmoveu_si128(y1_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y1_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
 
             ht -= 2;
             pu1_src1 += src_strd1 << 1;
diff --git a/decoder.arm.mk b/decoder.arm.mk
index 556e838..e5ac2d4 100644
--- a/decoder.arm.mk
+++ b/decoder.arm.mk
@@ -2,7 +2,7 @@ libavcd_inc_dir_arm +=  $(LOCAL_PATH)/decoder/arm
 libavcd_inc_dir_arm +=  $(LOCAL_PATH)/common/arm
 
 libavcd_srcs_c_arm  += decoder/arm/ih264d_function_selector.c
-libavcd_cflags_arm  += -DDISABLE_NEONINTR  -DARM -DARMGCC
+libavcd_cflags_arm  += -DARM
 
 #LOCAL_ARM_MODE         := arm
 
@@ -43,7 +43,3 @@ libavcd_srcs_asm_arm    +=  common/arm/ih264_arm_memory_barrier.s
 LOCAL_SRC_FILES_arm += $(libavcd_srcs_c_arm) $(libavcd_srcs_asm_arm)
 LOCAL_C_INCLUDES_arm += $(libavcd_inc_dir_arm)
 LOCAL_CFLAGS_arm += $(libavcd_cflags_arm)
-
-# CLANG WORKAROUNDS
-LOCAL_CLANG_ASFLAGS_arm += -no-integrated-as
-LOCAL_CLANG_ASFLAGS_arm += $(addprefix -Wa$(comma)-I,$(libavcd_inc_dir_arm))
diff --git a/decoder.arm64.mk b/decoder.arm64.mk
index 423c34d..2140b94 100644
--- a/decoder.arm64.mk
+++ b/decoder.arm64.mk
@@ -1,5 +1,5 @@
 libavcd_cflags_arm64 += -DARMV8
-libavcd_cflags_arm64 += -DDISABLE_NEONINTR  -DARM -DARMGCC
+libavcd_cflags_arm64 += -DARM
 
 libavcd_inc_dir_arm64   +=  $(LOCAL_PATH)/decoder/arm
 libavcd_inc_dir_arm64   +=  $(LOCAL_PATH)/common/armv8
@@ -46,5 +46,4 @@ LOCAL_C_INCLUDES_arm64 += $(libavcd_inc_dir_arm64)
 LOCAL_CFLAGS_arm64 += $(libavcd_cflags_arm64)
 
 # CLANG WORKAROUNDS
-LOCAL_CLANG_ASFLAGS_arm64 += -no-integrated-as
 LOCAL_CLANG_ASFLAGS_arm64 += $(addprefix -Wa$(comma)-I,$(libavcd_inc_dir_arm64))
diff --git a/decoder.mk b/decoder.mk
index 7df8d17..8b9bd55 100644
--- a/decoder.mk
+++ b/decoder.mk
@@ -9,8 +9,8 @@ LOCAL_MODULE := libavcdec
 
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 
-LOCAL_CFLAGS += -D_LIB -DMULTICORE -fPIC -UAPPLY_CONCEALMENT -UINSERT_LOGO  -DTHREAD_QUAD_CORE
-LOCAL_CFLAGS += -O3 -DANDROID
+LOCAL_CFLAGS += -fPIC
+LOCAL_CFLAGS += -O3
 
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/decoder $(LOCAL_PATH)/common
 
diff --git a/decoder.x86.mk b/decoder.x86.mk
index 309bc23..e7a4686 100644
--- a/decoder.x86.mk
+++ b/decoder.x86.mk
@@ -1,4 +1,4 @@
-libavcd_cflags_x86 += -DX86 -DDISABLE_AVX2 -m32 -msse4.2 -mno-avx -DDEFAULT_ARCH=D_ARCH_X86_SSE42
+libavcd_cflags_x86 += -DX86 -msse4.2 -mno-avx -DDEFAULT_ARCH=D_ARCH_X86_SSE42
 
 libavcd_inc_dir_x86     +=  $(LOCAL_PATH)/decoder/x86
 libavcd_inc_dir_x86     +=  $(LOCAL_PATH)/common/x86
diff --git a/decoder.x86_64.mk b/decoder.x86_64.mk
index 1b018f7..b265f4f 100644
--- a/decoder.x86_64.mk
+++ b/decoder.x86_64.mk
@@ -1,5 +1,4 @@
-libavcd_cflags_x86_64 += -DX86 -DDISABLE_AVX2 -m64 -msse4.2 -mno-avx  -DDEFAULT_ARCH=D_ARCH_X86_SSE42
-libavcd_cflags_x86_64 += -UAPPLY_CONCEALMENT -ULOGO_EN  -DTHREAD_QUAD_CORE
+libavcd_cflags_x86_64 += -DX86 -msse4.2 -mno-avx  -DDEFAULT_ARCH=D_ARCH_X86_SSE42
 
 libavcd_inc_dir_x86_64   +=  $(LOCAL_PATH)/decoder/x86
 libavcd_inc_dir_x86_64   +=  $(LOCAL_PATH)/common/x86
diff --git a/decoder/ih264d_api.c b/decoder/ih264d_api.c
index 18e4c2e..6ea75c6 100644
--- a/decoder/ih264d_api.c
+++ b/decoder/ih264d_api.c
@@ -107,18 +107,9 @@
 #define CODEC_VENDOR            "ITTIAM"
 #define MAXVERSION_STRLEN       511
 #define VERSION(version_string, codec_name, codec_release_type, codec_release_ver, codec_vendor)    \
-    strncpy(version_string,"@(#)Id:", MAXVERSION_STRLEN);                                                               \
-    strncat(version_string,codec_name, MAXVERSION_STRLEN);                                                              \
-    strncat(version_string,"_", MAXVERSION_STRLEN);                                                                     \
-    strncat(version_string,codec_release_type, MAXVERSION_STRLEN);                                                      \
-    strncat(version_string," Ver:", MAXVERSION_STRLEN);                                                                 \
-    strncat(version_string,codec_release_ver, MAXVERSION_STRLEN);                                                       \
-    strncat(version_string," Released by ", MAXVERSION_STRLEN);                                                         \
-    strncat(version_string,codec_vendor, MAXVERSION_STRLEN);                                                            \
-    strncat(version_string," Build: ", MAXVERSION_STRLEN);                                                              \
-    strncat(version_string,__DATE__, MAXVERSION_STRLEN);                                                                \
-    strncat(version_string," @ ", MAXVERSION_STRLEN);                                                                       \
-    strncat(version_string,__TIME__, MAXVERSION_STRLEN);
+    snprintf(version_string, MAXVERSION_STRLEN,                                                     \
+             "@(#)Id:%s_%s Ver:%s Released by %s Build: %s @ %s",                                   \
+             codec_name, codec_release_type, codec_release_ver, codec_vendor, __DATE__, __TIME__)
 
 #define MAX_NAL_UNIT_SIZE       MAX((H264_MAX_FRAME_HEIGHT * H264_MAX_FRAME_HEIGHT),MIN_NALUNIT_SIZE)
 #define MIN_NALUNIT_SIZE        200000
@@ -1501,7 +1492,6 @@ void ih264d_init_decoder(void * ps_dec_params)
     ps_dec->u2_mbx = 0xffff;
     ps_dec->u2_mby = 0;
     ps_dec->u2_total_mbs_coded = 0;
-    ps_cur_slice->u1_end_of_frame_signal = 0;
 
     /* POC initializations */
     ps_prev_poc = &ps_dec->s_prev_pic_poc;
@@ -2441,9 +2431,9 @@ WORD32 ih264d_init(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op)
 {
     ih264d_init_ip_t *ps_init_ip;
     ih264d_init_op_t *ps_init_op;
+    WORD32 init_status = IV_SUCCESS;
     ps_init_ip = (ih264d_init_ip_t *)pv_api_ip;
     ps_init_op = (ih264d_init_op_t *)pv_api_op;
-    WORD32 init_status = IV_SUCCESS;
 
     init_status = ih264d_init_video_decoder(dec_hdl, ps_init_ip, ps_init_op);
 
@@ -2602,11 +2592,11 @@ WORD32 ih264d_video_decode(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op)
     WORD32 ret,api_ret_value = IV_SUCCESS;
     WORD32 header_data_left = 0,frame_data_left = 0;
     UWORD8 *pu1_bitstrm_buf;
-    ithread_set_name((void*)"Parse_thread");
-
-
     ivd_video_decode_ip_t *ps_dec_ip;
     ivd_video_decode_op_t *ps_dec_op;
+
+    ithread_set_name((void*)"Parse_thread");
+
     ps_dec_ip = (ivd_video_decode_ip_t *)pv_api_ip;
     ps_dec_op = (ivd_video_decode_op_t *)pv_api_op;
     ps_dec->pv_dec_out = ps_dec_op;
@@ -2859,8 +2849,9 @@ WORD32 ih264d_video_decode(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op)
     ps_dec->u2_cur_slice_num = 0;
     ps_dec->cur_dec_mb_num = 0;
     ps_dec->cur_recon_mb_num = 0;
-    ps_dec->u4_first_slice_in_pic = 1;
+    ps_dec->u4_first_slice_in_pic = 2;
     ps_dec->u1_slice_header_done = 0;
+    ps_dec->u1_dangling_field = 0;
 
     ps_dec->u4_dec_thread_created = 0;
     ps_dec->u4_bs_deblk_thread_created = 0;
@@ -2914,7 +2905,6 @@ WORD32 ih264d_video_decode(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op)
                 {
                     ps_dec->u2_total_mbs_coded =
                                     ps_dec->ps_cur_sps->u2_max_mb_addr + 1;
-                    ps_dec->ps_cur_slice->u1_end_of_frame_signal = 1;
                 }
 
                 /* close deblock thread if it is not closed yet*/
@@ -3029,16 +3019,39 @@ WORD32 ih264d_video_decode(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op)
             ps_dec_op->u4_error_code = error | ret;
             api_ret_value = IV_FAIL;
 
-            if((ret == IVD_RES_CHANGED)||(ret == IVD_STREAM_WIDTH_HEIGHT_NOT_SUPPORTED))
+            if((ret == IVD_RES_CHANGED) || (ret == IVD_STREAM_WIDTH_HEIGHT_NOT_SUPPORTED))
             {
                 /*dont consume the SPS*/
                 ps_dec_op->u4_num_bytes_consumed -= bytes_consumed;
                 return IV_FAIL;
             }
-            if(ret == ERROR_IN_LAST_SLICE_OF_PIC)
+
+            if((ret == IVD_RES_CHANGED) || (ret == IVD_STREAM_WIDTH_HEIGHT_NOT_SUPPORTED))
+            {
+                /*dont consume the SPS*/
+                ps_dec_op->u4_num_bytes_consumed -= bytes_consumed;
+                return IV_FAIL;
+            }
+
+            if((ret == ERROR_UNAVAIL_PICBUF_T) || (ret == ERROR_UNAVAIL_MVBUF_T))
+            {
+                ps_dec_op->u4_num_bytes_consumed -= bytes_consumed;
+                return IV_FAIL;
+            }
+
+            if((ret == ERROR_INCOMPLETE_FRAME) || (ret == ERROR_DANGLING_FIELD_IN_PIC))
             {
                 ps_dec_op->u4_num_bytes_consumed -= bytes_consumed;
+                api_ret_value = IV_FAIL;
+                break;
+            }
+
+            if(ret == ERROR_IN_LAST_SLICE_OF_PIC)
+            {
+                api_ret_value = IV_FAIL;
+                break;
             }
+
         }
 
         if(ps_dec->u4_return_to_app)
@@ -3079,11 +3092,24 @@ WORD32 ih264d_video_decode(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op)
     {
         // last slice - missing/corruption
         WORD32 num_mb_skipped;
+        WORD32 prev_slice_err;
         pocstruct_t temp_poc;
 
         num_mb_skipped = (ps_dec->u2_frm_ht_in_mbs * ps_dec->u2_frm_wd_in_mbs)
                             - ps_dec->u2_total_mbs_coded;
-        ih264d_mark_err_slice_skip(ps_dec, num_mb_skipped, ps_dec->u1_nal_unit_type == IDR_SLICE_NAL,&temp_poc,3);
+
+        if(ps_dec->u4_first_slice_in_pic)
+            prev_slice_err = 1;
+        else
+            prev_slice_err = 2;
+
+        ret = ih264d_mark_err_slice_skip(ps_dec, num_mb_skipped, ps_dec->u1_nal_unit_type == IDR_SLICE_NAL, ps_dec->ps_cur_slice->u2_frame_num,
+                                   &temp_poc, prev_slice_err);
+
+        if((ret == ERROR_UNAVAIL_PICBUF_T) || (ret == ERROR_UNAVAIL_MVBUF_T))
+        {
+            return IV_FAIL;
+        }
     }
 
 
@@ -3181,19 +3207,6 @@ WORD32 ih264d_video_decode(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op)
          * For field pictures, set the bottom and top picture decoded u4_flag correctly.
          */
 
-        if(ps_dec->u4_pic_buf_got == 0)
-        {
-            ih264d_fill_output_struct_from_context(ps_dec, ps_dec_op);
-
-            ps_dec_op->u4_frame_decoded_flag = 0;
-            /* close deblock thread if it is not closed yet*/
-            if(ps_dec->u4_num_cores == 3)
-            {
-                ih264d_signal_bs_deblk_thread(ps_dec);
-            }
-            return (IV_FAIL);
-        }
-
         if(ps_dec->ps_cur_slice->u1_field_pic_flag)
         {
             if(1 == ps_dec->ps_cur_slice->u1_bottom_field_flag)
@@ -3206,10 +3219,19 @@ WORD32 ih264d_video_decode(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op)
             }
         }
 
-        /* Calling Function to deblock Picture and Display */
-        ret = ih264d_deblock_display(ps_dec);
-        if(ret != 0)
-            return IV_FAIL;
+        /* If a new frame is not found (i.e. we are still getting slices from the previous frame),
+         * ih264d_deblock_display is not called. Such frames will not be added to reference/display.
+         */
+        if((ps_dec->ps_dec_err_status->u1_err_flag & REJECT_CUR_PIC) == 0)
+        {
+            /* Calling Function to deblock Picture and Display */
+            ret = ih264d_deblock_display(ps_dec);
+            if(ret != 0)
+            {
+                return IV_FAIL;
+            }
+        }
+
 
         /*set to complete ,as we dont support partial frame decode*/
         if(ps_dec->i4_header_decoded == 3)
@@ -3334,7 +3356,7 @@ WORD32 ih264d_get_version(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op)
         return (IV_FAIL);
     }
 
-    version_string_len = strnlen(version_string, MAXVERSION_STRLEN) + 1;
+    version_string_len = strlen(version_string) + 1;
 
     if(ps_ip->u4_version_buffer_size >= version_string_len) //(WORD32)sizeof(sizeof(version_string)))
     {
diff --git a/decoder/ih264d_error_handler.h b/decoder/ih264d_error_handler.h
index 1ff5c7d..5b1bc84 100644
--- a/decoder/ih264d_error_handler.h
+++ b/decoder/ih264d_error_handler.h
@@ -110,14 +110,18 @@ typedef enum
     ERROR_LEVEL_UNSUPPORTED = 0x90,
     ERROR_START_CODE_NOT_FOUND = 0x91,
     ERROR_PIC_NUM_IS_REPEATED = 0x92,
-    ERROR_IN_LAST_SLICE_OF_PIC = 0x93
+    ERROR_IN_LAST_SLICE_OF_PIC = 0x93,
+    ERROR_NEW_FRAME_EXPECTED = 0x94,
+    ERROR_INCOMPLETE_FRAME = 0x95
 
 } h264_decoder_error_code_t;
 
 WORD32 ih264d_mark_err_slice_skip(dec_struct_t * ps_dec,
                                   WORD32 num_mb_skip,
                                   UWORD8 u1_is_idr_slice,
+                                  UWORD16 u2_frame_num,
                                   pocstruct_t *ps_cur_poc,
                                   WORD32 prev_slice_err);
 
+void ih264d_err_pic_dispbuf_mgr(dec_struct_t *ps_dec);
 #endif /* _IH264D_ERROR_HANDLER_H_ */
diff --git a/decoder/ih264d_function_selector.h b/decoder/ih264d_function_selector.h
index 92ad959..22e2efe 100644
--- a/decoder/ih264d_function_selector.h
+++ b/decoder/ih264d_function_selector.h
@@ -65,10 +65,6 @@ void ih264d_init_function_ptr_generic(dec_struct_t *ps_codec);
 void ih264d_init_function_ptr_ssse3(dec_struct_t *ps_codec);
 void ih264d_init_function_ptr_sse42(dec_struct_t *ps_codec);
 
-#ifndef DISABLE_AVX2
-void ih264d_init_function_ptr_avx2(dec_struct_t *ps_codec);
-#endif
-
 void ih264d_init_function_ptr_a9q(dec_struct_t *ps_codec);
 void ih264d_init_function_ptr_av8(dec_struct_t *ps_codec);
 
diff --git a/decoder/ih264d_parse_headers.c b/decoder/ih264d_parse_headers.c
index f7ae612..743b573 100644
--- a/decoder/ih264d_parse_headers.c
+++ b/decoder/ih264d_parse_headers.c
@@ -545,10 +545,6 @@ WORD32 ih264d_parse_sps(dec_struct_t *ps_dec, dec_bit_stream_t *ps_bitstrm)
     u1_level_idc = ih264d_get_bits_h264(ps_bitstrm, 8);
 
 
-     if(ps_dec->u4_level_at_init < u1_level_idc)
-     {
-         return IH264D_UNSUPPORTED_LEVEL;
-     }
 
     COPYTHECONTEXT("SPS: u4_level_idc",u1_level_idc);
 
@@ -934,6 +930,10 @@ WORD32 ih264d_parse_sps(dec_struct_t *ps_dec, dec_bit_stream_t *ps_bitstrm)
         ps_dec->u2_disp_width = i4_cropped_wd;
 
     }
+     if(ps_dec->u4_level_at_init < u1_level_idc)
+     {
+         return IH264D_UNSUPPORTED_LEVEL;
+     }
 
     ps_seq->u1_is_valid = TRUE;
 
@@ -1096,8 +1096,17 @@ WORD32 ih264d_parse_nal_unit(iv_obj_t *dec_hdl,
                                                             == IDR_SLICE_NAL),
                                             u1_nal_ref_idc, ps_dec);
 
+                            if((ps_dec->u4_first_slice_in_pic != 0)&&
+                                ((ps_dec->ps_dec_err_status->u1_err_flag & REJECT_CUR_PIC) == 0))
+                            {
+                                /* if the first slice header was not valid, set to 1 */
+                                ps_dec->u4_first_slice_in_pic = 1;
+                            }
+
                             if(i_status != OK)
+                            {
                                 return i_status;
+                            }
                         }
                         else
                         {
diff --git a/decoder/ih264d_parse_islice.c b/decoder/ih264d_parse_islice.c
index 534c785..1e4fdfa 100644
--- a/decoder/ih264d_parse_islice.c
+++ b/decoder/ih264d_parse_islice.c
@@ -132,21 +132,23 @@ WORD32 ih264d_parse_imb_cavlc(dec_struct_t * ps_dec,
         /*--------------------------------------------------------------------*/
         if (!ps_cur_mb_info->u1_tran_form8x8)
         {
+            UWORD8 *pu1_temp;
             ih264d_read_intra_pred_modes(ps_dec,
                                           ((UWORD8 *)ps_dec->pv_parse_tu_coeff_data),
                                           ((UWORD8 *)ps_dec->pv_parse_tu_coeff_data+16),
                                           ps_cur_mb_info->u1_tran_form8x8);
-            UWORD8 *pu1_temp = (UWORD8 *)ps_dec->pv_parse_tu_coeff_data;
+            pu1_temp = (UWORD8 *)ps_dec->pv_parse_tu_coeff_data;
             pu1_temp += 32;
             ps_dec->pv_parse_tu_coeff_data = (void *)pu1_temp;
         }
         else
         {
+            UWORD8 *pu1_temp;
             ih264d_read_intra_pred_modes(ps_dec,
                                           ((UWORD8 *)ps_dec->pv_parse_tu_coeff_data),
                                           ((UWORD8 *)ps_dec->pv_parse_tu_coeff_data+4),
                                           ps_cur_mb_info->u1_tran_form8x8);
-            UWORD8 *pu1_temp = (UWORD8 *)ps_dec->pv_parse_tu_coeff_data;
+            pu1_temp = (UWORD8 *)ps_dec->pv_parse_tu_coeff_data;
             pu1_temp += 8;
             ps_dec->pv_parse_tu_coeff_data = (void *)pu1_temp;
         }
@@ -403,8 +405,8 @@ WORD32 ih264d_parse_imb_cavlc(dec_struct_t * ps_dec,
                                 (tu_sblk4x4_coeff_data_t *)ps_dec->pv_parse_tu_coeff_data;
                 WORD16 *pi2_coeff_block =
                                 (WORD16 *)ps_dec->pv_parse_tu_coeff_data;
-                ps_tu_4x4->u2_sig_coeff_map = 0;
                 UWORD32 u4_num_coeff;
+                ps_tu_4x4->u2_sig_coeff_map = 0;
 
                 ret = ps_dec->pf_cavlc_parse4x4coeff[(ui_N > 7)](pi2_dc_coef, 0, ui_N,
                                                                  ps_dec, &u4_num_coeff);
@@ -542,23 +544,25 @@ WORD32 ih264d_parse_imb_cabac(dec_struct_t * ps_dec,
         /*--------------------------------------------------------------------*/
         if (!ps_cur_mb_info->u1_tran_form8x8)
         {
+            UWORD8 *pu1_temp;
             ih264d_read_intra_pred_modes_cabac(
                             ps_dec,
                             ((UWORD8 *)ps_dec->pv_parse_tu_coeff_data),
                             ((UWORD8 *)ps_dec->pv_parse_tu_coeff_data+16),
                             ps_cur_mb_info->u1_tran_form8x8);
-            UWORD8 *pu1_temp = (UWORD8 *)ps_dec->pv_parse_tu_coeff_data;
+            pu1_temp = (UWORD8 *)ps_dec->pv_parse_tu_coeff_data;
             pu1_temp += 32;
             ps_dec->pv_parse_tu_coeff_data = (void *)pu1_temp;
         }
         else
         {
+            UWORD8 *pu1_temp;
             ih264d_read_intra_pred_modes_cabac(
                             ps_dec,
                             ((UWORD8 *)ps_dec->pv_parse_tu_coeff_data),
                             ((UWORD8 *)ps_dec->pv_parse_tu_coeff_data+4),
                             ps_cur_mb_info->u1_tran_form8x8);
-            UWORD8 *pu1_temp = (UWORD8 *)ps_dec->pv_parse_tu_coeff_data;
+            pu1_temp = (UWORD8 *)ps_dec->pv_parse_tu_coeff_data;
             pu1_temp += 8;
             ps_dec->pv_parse_tu_coeff_data = (void *)pu1_temp;
         }
diff --git a/decoder/ih264d_parse_pslice.c b/decoder/ih264d_parse_pslice.c
index 02110eb..d56f44e 100644
--- a/decoder/ih264d_parse_pslice.c
+++ b/decoder/ih264d_parse_pslice.c
@@ -1432,6 +1432,7 @@ WORD32 ih264d_parse_inter_slice_data_cavlc(dec_struct_t * ps_dec,
 WORD32 ih264d_mark_err_slice_skip(dec_struct_t * ps_dec,
                                 WORD32 num_mb_skip,
                                 UWORD8 u1_is_idr_slice,
+                                UWORD16 u2_frame_num,
                                 pocstruct_t *ps_cur_poc,
                                 WORD32 prev_slice_err)
 {
@@ -1457,14 +1458,20 @@ WORD32 ih264d_mark_err_slice_skip(dec_struct_t * ps_dec,
     UWORD16 u2_total_mbs_coded;
     UWORD32 u1_mbaff = ps_slice->u1_mbaff_frame_flag;
     parse_part_params_t *ps_part_info;
+    WORD32 ret;
+
+
+    if(ps_dec->ps_dec_err_status->u1_err_flag & REJECT_CUR_PIC)
+    {
+        ih264d_err_pic_dispbuf_mgr(ps_dec);
+        return 0;
+    }
 
     if(prev_slice_err == 1)
     {
-        // first slice - missing/header corruption
-        if(u1_is_idr_slice)
-            ps_dec->ps_cur_slice->u2_frame_num = 0;
-        else
-            ps_dec->ps_cur_slice->u2_frame_num++;
+        /* first slice - missing/header corruption */
+        ps_dec->ps_cur_slice->u2_frame_num = u2_frame_num;
+
 
         if(!ps_dec->u1_first_slice_in_stream)
         {
@@ -1482,7 +1489,6 @@ WORD32 ih264d_mark_err_slice_skip(dec_struct_t * ps_dec,
             ps_dec->pf_mvpred = ih264d_mvpred_nonmbaff;
             ps_dec->p_form_mb_part_info = ih264d_form_mb_part_info_bp;
             ps_dec->p_motion_compensate = ih264d_motion_compensate_bp;
-            ps_dec->ps_pps->ps_sps = ps_dec->ps_cur_sps;
 
             if(ps_dec->ps_cur_pic != NULL)
                 poc = ps_dec->ps_cur_pic->i4_poc + 2;
@@ -1491,10 +1497,16 @@ WORD32 ih264d_mark_err_slice_skip(dec_struct_t * ps_dec,
             for(i = 0; i < MAX_NUM_PIC_PARAMS; i++)
                    if(ps_dec->ps_pps[i].u1_is_valid == TRUE)
                        j = i;
+            {
+                ret = ih264d_start_of_pic(ps_dec, poc, ps_cur_poc,
+                        ps_dec->ps_cur_slice->u2_frame_num,
+                        &ps_dec->ps_pps[j]);
 
-            ih264d_start_of_pic(ps_dec, poc, ps_cur_poc,
-                    ps_dec->ps_cur_slice->u2_frame_num,
-                    &ps_dec->ps_pps[j]);
+                if(ret != OK)
+                {
+                    return ret;
+                }
+            }
 
             ps_dec->ps_ref_pic_buf_lx[0][0]->u1_pic_buf_id = 0;
 
@@ -1617,7 +1629,7 @@ WORD32 ih264d_mark_err_slice_skip(dec_struct_t * ps_dec,
                     >= ps_dec->u2_frm_ht_in_mbs * ps_dec->u2_frm_wd_in_mbs)
             {
                 ps_dec->u1_pic_decode_done = 1;
-                return 1;
+                return 0;
             }
 
             // Inserting new slice
@@ -1685,8 +1697,6 @@ WORD32 ih264d_mark_err_slice_skip(dec_struct_t * ps_dec,
     /******************************************************/
     /* Parsing / decoding the slice                       */
     /******************************************************/
-    ps_dec->u4_first_slice_in_pic = 0;
-    ps_dec->u1_first_slice_in_stream = 0;
     ps_dec->u1_slice_header_done = 2;
     ps_dec->u1_qp = ps_slice->u1_slice_qp;
     ih264d_update_qp(ps_dec, 0);
@@ -1823,6 +1833,11 @@ WORD32 ih264d_mark_err_slice_skip(dec_struct_t * ps_dec,
     H264_DEC_DEBUG_PRINT("Mbs in slice: %d\n", ps_dec->ps_cur_slice->u4_mbs_in_slice);
 
     ps_dec->u2_cur_slice_num++;
+
+    /* incremented here only if first slice is inserted */
+    if(ps_dec->u4_first_slice_in_pic != 0)
+        ps_dec->ps_parse_cur_slice++;
+
     ps_dec->i2_prev_slice_mbx = ps_dec->u2_mbx;
     ps_dec->i2_prev_slice_mby = ps_dec->u2_mby;
 
@@ -1830,7 +1845,6 @@ WORD32 ih264d_mark_err_slice_skip(dec_struct_t * ps_dec,
             >= ps_dec->u2_frm_ht_in_mbs * ps_dec->u2_frm_wd_in_mbs)
     {
         ps_dec->u1_pic_decode_done = 1;
-        return 1;
     }
 
     return 0;
diff --git a/decoder/ih264d_parse_slice.c b/decoder/ih264d_parse_slice.c
index b3a7632..eef9db5 100644
--- a/decoder/ih264d_parse_slice.c
+++ b/decoder/ih264d_parse_slice.c
@@ -447,8 +447,8 @@ WORD32 ih264d_start_of_pic(dec_struct_t *ps_dec,
 
         if(!ps_dec->ps_cur_pic)
         {
-            H264_DEC_DEBUG_PRINT("------- Display Buffers Reset --------\n");
             WORD32 j;
+            H264_DEC_DEBUG_PRINT("------- Display Buffers Reset --------\n");
             for(j = 0; j < MAX_DISP_BUFS_NEW; j++)
             {
 
@@ -849,13 +849,6 @@ WORD32 ih264d_end_of_pic_dispbuf_mgr(dec_struct_t * ps_dec)
                                  ps_cur_slice->u1_field_pic_flag,
                                  ps_dec->u1_second_field);
         }
-        {
-
-            if(!ps_cur_slice->u1_end_of_frame_signal)
-            {
-                ps_cur_slice->u1_end_of_frame_signal = 1;
-            }
-        }
 
         if(!ps_cur_slice->u1_field_pic_flag
                         || ((TOP_FIELD_ONLY | BOT_FIELD_ONLY)
@@ -961,7 +954,6 @@ WORD32 ih264d_end_of_pic(dec_struct_t *ps_dec,
     dec_slice_params_t *ps_cur_slice = ps_dec->ps_cur_slice;
     WORD32 ret;
 
-    ps_dec->u4_first_slice_in_pic = 1;
     ps_dec->u1_first_pb_nal_in_pic = 1;
     ps_dec->u2_mbx = 0xffff;
     ps_dec->u2_mby = 0;
@@ -969,9 +961,8 @@ WORD32 ih264d_end_of_pic(dec_struct_t *ps_dec,
         dec_err_status_t * ps_err = ps_dec->ps_dec_err_status;
         if(ps_err->u1_err_flag & REJECT_CUR_PIC)
         {
-            ps_err->u1_err_flag ^= REJECT_CUR_PIC;
             ih264d_err_pic_dispbuf_mgr(ps_dec);
-            return OK;
+            return ERROR_NEW_FRAME_EXPECTED;
         }
     }
 
@@ -1016,10 +1007,8 @@ WORD32 ih264d_end_of_pic(dec_struct_t *ps_dec,
             ps_prev_poc->u1_bot_field = ps_cur_poc->u1_bot_field;
         }
     }
-    if(!ps_cur_slice->u1_end_of_frame_signal)
-    {
-        return ERROR_END_OF_FRAME_EXPECTED_T;
-    } H264_MUTEX_UNLOCK(&ps_dec->process_disp_mutex);
+
+    H264_MUTEX_UNLOCK(&ps_dec->process_disp_mutex);
 
     return OK;
 }
@@ -1294,6 +1283,22 @@ WORD32 ih264d_parse_decode_slice(UWORD8 u1_is_idr_slice,
                                             u1_field_pic_flag,
                                             u1_bottom_field_flag);
 
+        /* since we support only Full frame decode, every new process should
+         * process a new pic
+         */
+        if((ps_dec->u4_first_slice_in_pic == 2) && (i1_is_end_of_poc == 0))
+        {
+            /* if it is the first slice in the process call, it should be a new frame. If it is not,
+             * reject the current pic and don't add it to the dpb
+             */
+            ps_dec->ps_dec_err_status->u1_err_flag |= REJECT_CUR_PIC;
+            i1_is_end_of_poc = 1;
+        }
+        else
+        {
+            /* reset REJECT_CUR_PIC */
+            ps_dec->ps_dec_err_status->u1_err_flag &= MASK_REJECT_CUR_PIC;
+        }
     }
 
     /*--------------------------------------------------------------------*/
@@ -1310,6 +1315,7 @@ WORD32 ih264d_parse_decode_slice(UWORD8 u1_is_idr_slice,
                    && ps_dec->u1_top_bottom_decoded
                        != (TOP_FIELD_ONLY | BOT_FIELD_ONLY))
         {
+            ps_dec->u1_dangling_field = 1;
             if(ps_dec->u4_first_slice_in_pic)
             {
                 // first slice - dangling field
@@ -1332,7 +1338,7 @@ WORD32 ih264d_parse_decode_slice(UWORD8 u1_is_idr_slice,
 
             u1_is_idr_slice = ps_cur_slice->u1_nal_unit_type == IDR_SLICE_NAL;
         }
-        else if(ps_dec->u4_first_slice_in_pic)
+        else if(ps_dec->u4_first_slice_in_pic == 2)
         {
             if(u2_first_mb_in_slice > 0)
             {
@@ -1355,10 +1361,25 @@ WORD32 ih264d_parse_decode_slice(UWORD8 u1_is_idr_slice,
         }
         else
         {
-            // last slice - missing/corruption
-            prev_slice_err = 2;
-            num_mb_skipped = (ps_dec->u2_frm_ht_in_mbs * ps_dec->u2_frm_wd_in_mbs)
-                    - ps_dec->u2_total_mbs_coded;
+
+            if(ps_dec->u4_first_slice_in_pic)
+            {
+                /* if valid slice header is not decoded, do start of pic processing.
+                 * since in the current process call, frame num is not updated in the slice structure yet,
+                 * ih264d_is_end_of_pic is checked with valid frame num of previous process call.
+                 * although i1_is_end_of_poc is set, there could be more slices in the frame,
+                 * so conceal only till cur slice */
+                prev_slice_err = 1;
+                num_mb_skipped = u2_first_mb_in_slice << u1_mbaff;
+            }
+            else
+            {
+                /* i1_is_end_of_poc being set means a new frame num is encountered, so conceal the current frame
+                 * completely */
+                prev_slice_err = 2;
+                num_mb_skipped = (ps_dec->u2_frm_ht_in_mbs * ps_dec->u2_frm_wd_in_mbs)
+                        - ps_dec->u2_total_mbs_coded;
+            }
             ps_cur_poc = &s_tmp_poc;
         }
     }
@@ -1380,13 +1401,40 @@ WORD32 ih264d_parse_decode_slice(UWORD8 u1_is_idr_slice,
 
     if(prev_slice_err)
     {
-        end_of_frame = ih264d_mark_err_slice_skip(ps_dec,num_mb_skipped,u1_is_idr_slice,ps_cur_poc,prev_slice_err);
+        ret = ih264d_mark_err_slice_skip(ps_dec, num_mb_skipped, u1_is_idr_slice, u2_frame_num, ps_cur_poc, prev_slice_err);
+
+        if(ps_dec->u1_dangling_field == 1)
+        {
+            ps_dec->u1_second_field = 1 - ps_dec->u1_second_field;
+            ps_cur_slice->u1_bottom_field_flag = u1_bottom_field_flag;
+            ps_dec->u2_prv_frame_num = u2_frame_num;
+            ps_dec->u1_first_slice_in_stream = 0;
+            return ERROR_DANGLING_FIELD_IN_PIC;
+        }
 
-        if(end_of_frame)
+        if(prev_slice_err == 2)
         {
-            // return if all MBs in frame are parsed
+            ps_dec->u1_first_slice_in_stream = 0;
+            return ERROR_INCOMPLETE_FRAME;
+        }
+
+        if(ps_dec->u2_total_mbs_coded
+                >= ps_dec->u2_frm_ht_in_mbs * ps_dec->u2_frm_wd_in_mbs)
+        {
+            /* return if all MBs in frame are parsed*/
+            ps_dec->u1_first_slice_in_stream = 0;
             return ERROR_IN_LAST_SLICE_OF_PIC;
         }
+
+        if(ps_dec->ps_dec_err_status->u1_err_flag & REJECT_CUR_PIC)
+        {
+            ih264d_err_pic_dispbuf_mgr(ps_dec);
+            return ERROR_NEW_FRAME_EXPECTED;
+        }
+
+        if(ret != OK)
+            return ret;
+
         i1_is_end_of_poc = 0;
     }
 
@@ -1401,13 +1449,6 @@ WORD32 ih264d_parse_decode_slice(UWORD8 u1_is_idr_slice,
     if(!ps_dec->u1_first_slice_in_stream)
     {
         UWORD8 uc_mbs_exceed = 0;
-        /*since we support only Full frame decode, every new process should
-         * process a new pic
-         */
-        if(ps_dec->u4_first_slice_in_pic == 1)
-        {
-            i1_is_end_of_poc = 1;
-        }
 
         if(ps_dec->u2_total_mbs_coded
                         == (ps_dec->ps_cur_sps->u2_max_mb_addr + 1))
@@ -1446,45 +1487,8 @@ WORD32 ih264d_parse_decode_slice(UWORD8 u1_is_idr_slice,
         }
     }
 
-    ps_cur_slice->u1_end_of_frame_signal = 0;
     if(u1_field_pic_flag)
     {
-        /*
-         * Check if the frame number has changed.
-         */
-        H264_DEC_DEBUG_PRINT(
-                        "u2_frame_num: %d ps_dec->u2_prv_frame_num: %d ps_dec->u1_top_bottom_decoded: %d\n",
-                        u2_frame_num, ps_dec->u2_prv_frame_num,
-                        ps_dec->u1_top_bottom_decoded);
-        if((u2_frame_num != ps_dec->u2_prv_frame_num)
-                        && (0 != ps_dec->u1_top_bottom_decoded))
-        {
-            if((TOP_FIELD_ONLY | BOT_FIELD_ONLY)
-                            != ps_dec->u1_top_bottom_decoded)
-            {
-                H264_DEC_DEBUG_PRINT("Dangling Field, toggling second field\n");
-                ps_dec->u1_second_field = 1 - ps_dec->u1_second_field;
-                ps_dec->u1_dangling_field = 1;
-                /*
-                 * Updating the u1_bottom_field_flag since its used in the concealment function.
-                 */
-                ps_cur_slice->u1_bottom_field_flag = u1_bottom_field_flag;
-                ps_dec->u2_prv_frame_num = u2_frame_num;
-
-                ret = ih264d_deblock_display(ps_dec);
-                if(ret != OK)
-                    return ret;
-
-                /*
-                 * The bytes consumed will be handled by the
-                 * video_decode function after the error is handled.
-                 */
-                return ERROR_DANGLING_FIELD_IN_PIC;
-
-            }
-
-        }
-
         ps_dec->u2_prv_frame_num = u2_frame_num;
     }
 
@@ -1513,7 +1517,7 @@ WORD32 ih264d_parse_decode_slice(UWORD8 u1_is_idr_slice,
         ps_dec->ps_cur_pic->i4_poc = i4_temp_poc;
         ps_dec->ps_cur_pic->i4_avg_poc = i4_temp_poc;
     }
-    if(ps_dec->u4_first_slice_in_pic)
+    if(ps_dec->u4_first_slice_in_pic == 2)
     {
         ret = ih264d_decode_pic_order_cnt(u1_is_idr_slice, u2_frame_num,
                                           &ps_dec->s_prev_pic_poc,
@@ -1581,11 +1585,14 @@ WORD32 ih264d_parse_decode_slice(UWORD8 u1_is_idr_slice,
             ps_dec->pf_mvpred = ih264d_mvpred_nonmbaff;
     }
 
-    if(ps_dec->u4_first_slice_in_pic)
+    if(ps_dec->u4_first_slice_in_pic == 2)
     {
-        ret = ih264d_start_of_pic(ps_dec, i4_poc, &s_tmp_poc, u2_frame_num, ps_pps);
-        if(ret != OK)
-            return ret;
+        if(u2_first_mb_in_slice == 0)
+        {
+            ret = ih264d_start_of_pic(ps_dec, i4_poc, &s_tmp_poc, u2_frame_num, ps_pps);
+            if(ret != OK)
+                return ret;
+        }
 
         ps_dec->u4_output_present = 0;
 
@@ -1898,7 +1905,8 @@ WORD32 ih264d_parse_decode_slice(UWORD8 u1_is_idr_slice,
 
     if(ps_dec->u1_slice_header_done)
     {
-        /*set to zero to indicate a valid slice has been decoded*/
+        /* set to zero to indicate a valid slice has been decoded */
+        /* first slice header successfully decoded */
         ps_dec->u4_first_slice_in_pic = 0;
         ps_dec->u1_first_slice_in_stream = 0;
     }
diff --git a/decoder/ih264d_process_intra_mb.c b/decoder/ih264d_process_intra_mb.c
index d2da005..dde2a7e 100644
--- a/decoder/ih264d_process_intra_mb.c
+++ b/decoder/ih264d_process_intra_mb.c
@@ -924,7 +924,10 @@ WORD32 ih264d_process_intra_mb(dec_struct_t * ps_dec,
                                             (u1_intrapred_mode ^ 2);
 
             if((u1_err_code & u1_packed_modes) ^ u1_err_code)
+            {
+                u1_intrapred_mode = 0;
                 ps_dec->i4_error_code = ERROR_INTRAPRED;
+            }
         }
         {
             UWORD8 au1_ngbr_pels[33];
@@ -1242,9 +1245,11 @@ WORD32 ih264d_process_intra_mb(dec_struct_t * ps_dec,
                 {
                     UWORD8 u1_err_code = pu1_intra_err_codes[i1_intra_pred];
 
-                    /*if((u1_err_code & u1_packed_modes) ^ u1_err_code)
+                    if((u1_err_code & u1_packed_modes) ^ u1_err_code)
                      {
-                     }*/
+                        i1_intra_pred = 0;
+                        ps_dec->i4_error_code = ERROR_INTRAPRED;
+                     }
 
                 }
             }
@@ -1649,7 +1654,10 @@ WORD32 ih264d_process_intra_mb(dec_struct_t * ps_dec,
                     UWORD8 u1_err_code = pu1_intra_err_codes[i1_intra_pred];
 
                     if((u1_err_code & u1_packed_modes) ^ u1_err_code)
+                    {
+                        i1_intra_pred = 0;
                         ps_dec->i4_error_code = ERROR_INTRAPRED;
+                    }
                 }
             }
 
@@ -1761,7 +1769,10 @@ WORD32 ih264d_process_intra_mb(dec_struct_t * ps_dec,
                                             u1_intra_chrom_pred_mode :
                                             (u1_intra_chrom_pred_mode ^ 2);
             if((u1_err_code & u1_packed_modes) ^ u1_err_code)
+            {
+                u1_intra_chrom_pred_mode = 0;
                 ps_dec->i4_error_code = ERROR_INTRAPRED;
+            }
         }
 
         /* CHANGED CODE */
@@ -1933,11 +1944,12 @@ WORD32 ih264d_process_intra_mb(dec_struct_t * ps_dec,
                 UWORD8 *pu1_ngbr_pels = (UWORD8 *)au2_ngbr_pels;
                 UWORD16 *pu2_left_uv;
                 UWORD16 *pu2_topleft_uv;
-                pu2_topleft_uv = (UWORD16 *)pu1_u_top_left;
-                pu2_left_uv = (UWORD16 *)pu1_uleft;
                 WORD32 use_left1 = (u2_use_left_mb_pack & 0x0ff);
                 WORD32 use_left2 = (u2_use_left_mb_pack & 0xff00) >> 8;
 
+                pu2_topleft_uv = (UWORD16 *)pu1_u_top_left;
+                pu2_left_uv = (UWORD16 *)pu1_uleft;
+
                 /* Get neighbour pixels */
                 /* left pels */
                 if(u2_use_left_mb_pack)
diff --git a/decoder/ih264d_structs.h b/decoder/ih264d_structs.h
index 4e3f0bb..062747b 100644
--- a/decoder/ih264d_structs.h
+++ b/decoder/ih264d_structs.h
@@ -524,7 +524,6 @@ typedef struct
      unsigned. LSB byte : weight and MSB byte: u4_ofst */
     UWORD32 u4_wt_ofst_lx[2][MAX_REF_BUFS][3];
     void * pv_codec_handle; /* For Error Handling */
-    UWORD8 u1_end_of_frame_signal;
 
     /*  This is used when reordering is done in Forward or    */
     /*  backward lists. This is because reordering can point  */
@@ -607,6 +606,9 @@ typedef struct code_overlay_ctxt
 #define REJECT_CUR_PIC    (0x01)
 #define REJECT_PB_PICS    (0x02)
 
+#define MASK_REJECT_CUR_PIC (0xFE)
+#define MASK_REJECT_PB_PICS (0xFD)
+
 #define PIC_TYPE_UNKNOWN  (0xFF)
 #define PIC_TYPE_I        (0x00)
 #define SYNC_FRM_DEFAULT  (0xFFFFFFFF)
@@ -1351,6 +1353,7 @@ typedef struct _DecStruct
     UWORD32 u4_cur_slice_decode_done;
     UWORD32 u4_extra_mem_used;
 
+    /* 2: first slice not parsed, 1: first slice parsed, 0: first valid slice header parsed */
     UWORD32 u4_first_slice_in_pic;
     UWORD32 u4_num_cores;
     IVD_ARCH_T e_processor_arch;
diff --git a/decoder/ih264d_thread_parse_decode.c b/decoder/ih264d_thread_parse_decode.c
index 910183c..f3da270 100644
--- a/decoder/ih264d_thread_parse_decode.c
+++ b/decoder/ih264d_thread_parse_decode.c
@@ -633,8 +633,6 @@ void ih264d_decode_picture_thread(dec_struct_t *ps_dec )
                               ps_dec->u4_fmt_conv_num_rows);
         ps_dec->u4_fmt_conv_cur_row += ps_dec->u4_fmt_conv_num_rows;
     }
-
-    ithread_exit(0);
 }
 
 void ih264d_signal_decode_thread(dec_struct_t *ps_dec)
diff --git a/decoder/ih264d_utils.c b/decoder/ih264d_utils.c
index 31e9532..1581bd6 100644
--- a/decoder/ih264d_utils.c
+++ b/decoder/ih264d_utils.c
@@ -646,8 +646,11 @@ WORD32 ih264d_get_dpb_size(dec_seq_params_t *ps_seq, dec_struct_t *ps_dec)
         case 51:
             i4_size = 70778880;
             break;
+        case 52:
+            i4_size = 70778880;
+            break;
         default:
-            i4_size = 6912000;
+            i4_size = 70778880;
             break;
     }
 
@@ -712,9 +715,12 @@ WORD32 ih264d_get_dpb_size_new(UWORD32 u4_level_idc,
         case 51:
             i4_size = 70778880;
             break;
+        case 52:
+            i4_size = 70778880;
+            break;
         default:
         {
-            return -1;
+            i4_size = 70778880;
         }
             break;
     }
diff --git a/encoder.arm.mk b/encoder.arm.mk
index 874c81c..f06a6d5 100644
--- a/encoder.arm.mk
+++ b/encoder.arm.mk
@@ -1,7 +1,7 @@
 libavce_inc_dir_arm +=  $(LOCAL_PATH)/encoder/arm
 libavce_inc_dir_arm +=  $(LOCAL_PATH)/common/arm
 
-libavce_cflags_arm  += -DDISABLE_NEONINTR -DARM -DARMGCC
+libavce_cflags_arm  += -DARM
 
 libavce_srcs_c_arm  += encoder/arm/ih264e_function_selector.c
 
@@ -35,10 +35,8 @@ libavce_srcs_asm_arm    +=  encoder/arm/ih264e_fmt_conv.s
 #ME
 libavce_srcs_asm_arm    +=  encoder/arm/ime_distortion_metrics_a9q.s
 
-libavce_cflags_arm += -DDEFAULT_ARCH=D_ARCH_ARM_A9Q
-
 else #No Neon
-libavce_cflags_arm += -DDISABLE_NEON -DDEFAULT_ARCH=D_ARCH_ARM_NONEON
+libavce_cflags_arm += -DDISABLE_NEON
 endif #Neon check
 
 libavce_srcs_asm_arm    +=  common/arm/ih264_arm_memory_barrier.s
@@ -46,7 +44,3 @@ libavce_srcs_asm_arm    +=  common/arm/ih264_arm_memory_barrier.s
 LOCAL_SRC_FILES_arm += $(libavce_srcs_c_arm) $(libavce_srcs_asm_arm)
 LOCAL_C_INCLUDES_arm += $(libavce_inc_dir_arm)
 LOCAL_CFLAGS_arm += $(libavce_cflags_arm)
-
-# CLANG WORKAROUNDS
-LOCAL_CLANG_ASFLAGS_arm += -no-integrated-as
-LOCAL_CLANG_ASFLAGS_arm += $(addprefix -Wa$(comma)-I,$(libavce_inc_dir_arm))
diff --git a/encoder.arm64.mk b/encoder.arm64.mk
index 5d2d045..f95a29f 100644
--- a/encoder.arm64.mk
+++ b/encoder.arm64.mk
@@ -1,5 +1,5 @@
 libavce_cflags_arm64 += -DARMV8
-libavce_cflags_arm64 += -DDISABLE_NEONINTR -DARM -DARMGCC
+libavce_cflags_arm64 += -DARM
 
 libavce_inc_dir_arm64   +=  $(LOCAL_PATH)/encoder/arm
 libavce_inc_dir_arm64   +=  $(LOCAL_PATH)/encoder/armv8
@@ -35,9 +35,8 @@ libavce_srcs_asm_arm64    +=  encoder/armv8/ih264e_half_pel_av8.s
 #ME
 libavce_srcs_asm_arm64    +=  encoder/armv8/ime_distortion_metrics_av8.s
 
-libavce_cflags_arm64 += -DDEFAULT_ARCH=D_ARCH_ARMV8_GENERIC
 else
-libavce_cflags_arm64 += -DDISABLE_NEON -DDEFAULT_ARCH=D_ARCH_ARM_NONEON
+libavce_cflags_arm64 += -DDISABLE_NEON
 endif
 
 
@@ -48,5 +47,4 @@ LOCAL_C_INCLUDES_arm64 += $(libavce_inc_dir_arm64)
 LOCAL_CFLAGS_arm64 += $(libavce_cflags_arm64)
 
 # CLANG WORKAROUNDS
-LOCAL_CLANG_ASFLAGS_arm64 += -no-integrated-as
 LOCAL_CLANG_ASFLAGS_arm64 += $(addprefix -Wa$(comma)-I,$(libavce_inc_dir_arm64))
diff --git a/encoder.mk b/encoder.mk
index 5829118..7efcda2 100644
--- a/encoder.mk
+++ b/encoder.mk
@@ -9,8 +9,8 @@ LOCAL_MODULE := libavcenc
 
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 
-LOCAL_CFLAGS += -D_LIB -DMULTICORE -DANDROID -DNDEBUG -UHP_PL -DN_MB_ENABLE -URC_FIXED_POINT -fPIC
-LOCAL_CFLAGS += -O3 -DANDROID
+LOCAL_CFLAGS += -DNDEBUG -UHP_PL -DN_MB_ENABLE -fPIC
+LOCAL_CFLAGS += -O3
 
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/encoder $(LOCAL_PATH)/common
 
@@ -53,6 +53,9 @@ libavce_srcs_c  += encoder/ih264e_utils.c
 libavce_srcs_c  += encoder/ih264e_version.c
 libavce_srcs_c  += encoder/ih264e_bitstream.c
 libavce_srcs_c  += encoder/ih264e_cavlc.c
+libavce_srcs_c  += encoder/ih264e_cabac_init.c
+libavce_srcs_c  += encoder/ih264e_cabac.c
+libavce_srcs_c  += encoder/ih264e_cabac_encode.c
 libavce_srcs_c  += encoder/ih264e_encode_header.c
 libavce_srcs_c  += encoder/ih264e_function_selector_generic.c
 libavce_srcs_c  += encoder/ih264e_fmt_conv.c
diff --git a/encoder.x86.mk b/encoder.x86.mk
index e9b6a5f..f1e2ffa 100644
--- a/encoder.x86.mk
+++ b/encoder.x86.mk
@@ -1,4 +1,4 @@
-libavce_cflags_x86 += -DX86 -DDISABLE_AVX2 -m32 -msse4.2 -mno-avx -DDEFAULT_ARCH=D_ARCH_X86_SSE42
+libavce_cflags_x86 += -DX86 -msse4.2 -mno-avx
 
 libavce_inc_dir_x86     +=  $(LOCAL_PATH)/encoder/x86
 libavce_inc_dir_x86     +=  $(LOCAL_PATH)/common/x86
diff --git a/encoder.x86_64.mk b/encoder.x86_64.mk
index deb004b..14205a3 100644
--- a/encoder.x86_64.mk
+++ b/encoder.x86_64.mk
@@ -1,4 +1,4 @@
-libavce_cflags_x86_64   += -DX86 -DDISABLE_AVX2 -m64 -msse4.2 -mno-avx  -DDEFAULT_ARCH=D_ARCH_X86_SSE42
+libavce_cflags_x86_64   += -DX86 -msse4.2 -mno-avx
 
 libavce_inc_dir_x86_64  +=  $(LOCAL_PATH)/encoder/x86
 libavce_inc_dir_x86_64  +=  $(LOCAL_PATH)/common/x86
diff --git a/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s b/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s
index fe0ce17..9f5bfa9 100644
--- a/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s
+++ b/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s
@@ -17,7 +17,6 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
 
 @/**
 @******************************************************************************
@@ -102,11 +101,11 @@ ih264e_evaluate_intra16x16_modes_a9q:
     vld1.32       {q5}, [r1]!
     mov           r11, #0
     mov           r4, #0
-    @/* Left available ????
+    @/* Left available ???? */
     ands          r7, r5, #01
     movne         r10, #1
 
-    @/* Top  available ????
+    @/* Top  available ???? */
     ands          r8, r5, #04
     lsl           r9, r10, #3
     movne         r11, #1
@@ -114,7 +113,7 @@ ih264e_evaluate_intra16x16_modes_a9q:
     adds          r8, r9, r12
 
 
-    @/* None available :(
+    @/* None available :( */
     moveq         r4, #128
 
 
diff --git a/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s b/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s
index 568e623..6137054 100644
--- a/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s
+++ b/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s
@@ -17,9 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
 
-@/**
 
 .data
 .p2align 2
@@ -39,7 +37,6 @@ scratch_intrapred_luma_4x4_prediction_addr1:
 
 
 @/**
-@/**
 @******************************************************************************
 @*
 @* @brief :Evaluate best intra 4x4 mode
diff --git a/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s b/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s
index e4dfca8..bdbaa02 100644
--- a/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s
+++ b/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s
@@ -17,7 +17,6 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
 
 @/**
 @******************************************************************************
diff --git a/encoder/arm/ih264e_fmt_conv.s b/encoder/arm/ih264e_fmt_conv.s
index 2c04141..f8f5e42 100644
--- a/encoder/arm/ih264e_fmt_conv.s
+++ b/encoder/arm/ih264e_fmt_conv.s
@@ -17,11 +17,9 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
 
 .text
 .p2align 2
-@/**
 
 @/*****************************************************************************
 @*                                                                            *
@@ -268,7 +266,6 @@ ih264e_fmt_conv_422i_to_420sp_a9q:
 @   SUB         r10,r10,r7,ASR #1           ;// u2_offset3          = u4_stride_v - u4_width >> 1
     mov           r14, r14, lsl #1      @// u2_offset_yuv422i   = u2_offset_yuv422i * 2
 
-    mov           r7, r7, asr #4        @// u4_width = u4_width / 16 (u4_width >> 4)
     mov           r11, r11, asr #1      @// u4_width = u4_width / 2 (u4_width >> 1)
 
     add           r4, r12, r4           @// u2_offset1 = u2_offset1 + u4_stride_y
@@ -288,14 +285,14 @@ ih264e_fmt_conv_422i_to_420sp_a9q:
 @// u4_width / 16       - r7
 @// u4_height / 2       - r11
 @// inner loop count    - r12
-yuv420_to_yuv422i_hight_loop:
+yuv422i_to_420sp_height_loop:
 
     mov           r12, r7               @// Inner loop count = u4_width / 16
 
-yuv420_to_yuv422i_width_loop:
+yuv422i_to_420sp_width_loop:
     vld4.8        {d0, d1, d2, d3}, [r3]! @// Load the 16 elements of row 1
     vld4.8        {d4, d5, d6, d7}, [r8]! @// Load the 16 elements of row 2
-    subs          r12, r12, #1
+    sub           r12, r12, #16
 
     vrhadd.u8     d0, d0, d4
     vrhadd.u8     d2, d2, d6
@@ -305,8 +302,34 @@ yuv420_to_yuv422i_width_loop:
 
     vst2.8        {d0, d2}, [r1]!       @// Store the 8 elements of row1/2 U
 
-    bgt           yuv420_to_yuv422i_width_loop
+    cmp           r12, #15
+    bgt           yuv422i_to_420sp_width_loop
+    cmp           r12, #0
+    beq           yuv422i_to_420sp_row_loop_end
 
+    @//If width is not a multiple of 16, step all pointers back so that the last
+    @//16 pixels of the row are processed as one full vector, overlapping pixels already done.
+    @//Ex: if width is 162, the loop above processes 160 pixels; source and destination
+    @//pointers are moved back to the 146th pixel and 16 pixels are redone using VLD4 and VST2
+    rsb           r12, r12, #16
+    sub           r3, r3, r12, lsl #1
+    sub           r8, r8, r12, lsl #1
+    sub           r0, r0, r12
+    sub           r6, r6, r12
+    sub           r1, r1, r12
+
+    vld4.8        {d0, d1, d2, d3}, [r3]! @// Load the 16 elements of row 1
+    vld4.8        {d4, d5, d6, d7}, [r8]! @// Load the 16 elements of row 2
+
+    vrhadd.u8     d0, d0, d4
+    vrhadd.u8     d2, d2, d6
+
+    vst2.8        {d1, d3}, [r0]!       @// Store the 16 elements of row1 Y
+    vst2.8        {d5, d7}, [r6]!       @// Store the 16 elements of row2 Y
+
+    vst2.8        {d0, d2}, [r1]!       @// Store the 8 elements of row1/2 U
+
+yuv422i_to_420sp_row_loop_end:
     @// Update the buffer pointer so that they will refer to next pair of rows
     add           r0, r0, r4            @// pu1_y               = pu1_y                 + u2_offset1
     add           r6, r6, r4            @// pu1_y_nxt_row       = pu1_y_nxt_row         + u2_offset1
@@ -317,7 +340,7 @@ yuv420_to_yuv422i_width_loop:
     add           r3, r3, r5            @// pu2_yuv422i         = pu2_yuv422i           + u2_offset_yuv422i
 
     add           r8, r8, r5            @// pu2_yuv422i_nxt_row = pu2_yuv422i_nxt_row   + u2_offset_yuv422i
-    bgt           yuv420_to_yuv422i_hight_loop
+    bgt           yuv422i_to_420sp_height_loop
     ldmfd         sp!, {r4-r12, pc}     @// Restore the register which are used
 
 
diff --git a/encoder/arm/ih264e_function_selector.c b/encoder/arm/ih264e_function_selector.c
index e4f67a0..0486200 100644
--- a/encoder/arm/ih264e_function_selector.c
+++ b/encoder/arm/ih264e_function_selector.c
@@ -58,8 +58,8 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -68,14 +68,15 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
-
+#include "ih264_cabac_tables.h"
 #include "ih264_macros.h"
 #include "ih264_platform_macros.h"
-#include "ih264e_defs.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
+#include "ih264e_cabac.h"
 #include "ih264e_platform_macros.h"
 
 /**
diff --git a/encoder/arm/ih264e_function_selector_a9q.c b/encoder/arm/ih264e_function_selector_a9q.c
index 8b2879b..30d7795 100644
--- a/encoder/arm/ih264e_function_selector_a9q.c
+++ b/encoder/arm/ih264e_function_selector_a9q.c
@@ -58,8 +58,8 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -68,23 +68,18 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
-
+#include "ih264_cabac_tables.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_platform_macros.h"
-#include "ih264_intra_pred_filters.h"
-#include "ih264_trans_quant_itrans_iquant.h"
-#include "ih264e_defs.h"
-#include "ih264e_structs.h"
-#include "ih264_deblk_edge_filters.h"
+#include "ih264e_cabac.h"
 #include "ih264e_core_coding.h"
 #include "ih264_cavlc_tables.h"
 #include "ih264e_cavlc.h"
-#include "ih264_padding.h"
 #include "ih264e_intra_modes_eval.h"
-#include "ih264_mem_fns.h"
 #include "ih264e_fmt_conv.h"
 #include "ih264e_half_pel.h"
 
@@ -109,144 +104,144 @@
 void ih264e_init_function_ptr_neon_a9q(codec_t *ps_codec)
 {
     WORD32 i= 0;
-
-        /* curr proc ctxt */
-        process_ctxt_t *ps_proc = NULL;
-        me_ctxt_t *ps_me_ctxt = NULL;
-
-        /* Init function pointers for intra pred leaf level functions luma
-         * Intra 16x16 */
-        ps_codec->apf_intra_pred_16_l[0] = ih264_intra_pred_luma_16x16_mode_vert_a9q;
-        ps_codec->apf_intra_pred_16_l[1] = ih264_intra_pred_luma_16x16_mode_horz_a9q;
-        ps_codec->apf_intra_pred_16_l[2] = ih264_intra_pred_luma_16x16_mode_dc_a9q;
-        ps_codec->apf_intra_pred_16_l[3] = ih264_intra_pred_luma_16x16_mode_plane_a9q;
-
-        /* Init function pointers for intra pred leaf level functions luma
-         * Intra 4x4 */
-        ps_codec->apf_intra_pred_4_l[0] = ih264_intra_pred_luma_4x4_mode_vert_a9q;
-        ps_codec->apf_intra_pred_4_l[1] = ih264_intra_pred_luma_4x4_mode_horz_a9q;
-        ps_codec->apf_intra_pred_4_l[2] = ih264_intra_pred_luma_4x4_mode_dc_a9q;
-        ps_codec->apf_intra_pred_4_l[3] = ih264_intra_pred_luma_4x4_mode_diag_dl_a9q;
-        ps_codec->apf_intra_pred_4_l[4] = ih264_intra_pred_luma_4x4_mode_diag_dr_a9q;
-        ps_codec->apf_intra_pred_4_l[5] = ih264_intra_pred_luma_4x4_mode_vert_r_a9q;
-        ps_codec->apf_intra_pred_4_l[6] = ih264_intra_pred_luma_4x4_mode_horz_d_a9q;
-        ps_codec->apf_intra_pred_4_l[7] = ih264_intra_pred_luma_4x4_mode_vert_l_a9q;
-        ps_codec->apf_intra_pred_4_l[8] = ih264_intra_pred_luma_4x4_mode_horz_u_a9q;
-
-        /* Init function pointers for intra pred leaf level functions luma
-         * Intra 8x8 */
-        ps_codec->apf_intra_pred_8_l[0] = ih264_intra_pred_luma_8x8_mode_vert_a9q;
-        ps_codec->apf_intra_pred_8_l[2] = ih264_intra_pred_luma_8x8_mode_dc_a9q;
-        ps_codec->apf_intra_pred_8_l[3] = ih264_intra_pred_luma_8x8_mode_diag_dl_a9q;
-        ps_codec->apf_intra_pred_8_l[4] = ih264_intra_pred_luma_8x8_mode_diag_dr_a9q;
-        ps_codec->apf_intra_pred_8_l[5] = ih264_intra_pred_luma_8x8_mode_vert_r_a9q;
-        ps_codec->apf_intra_pred_8_l[6] = ih264_intra_pred_luma_8x8_mode_horz_d_a9q;
-        ps_codec->apf_intra_pred_8_l[7] = ih264_intra_pred_luma_8x8_mode_vert_l_a9q;
-        ps_codec->apf_intra_pred_8_l[8] = ih264_intra_pred_luma_8x8_mode_horz_u_a9q;
-
-        /* Init function pointers for intra pred leaf level functions chroma
-         * Intra 8x8 */
-        ps_codec->apf_intra_pred_c[0] = ih264_intra_pred_chroma_8x8_mode_dc_a9q;
-        ps_codec->apf_intra_pred_c[1] = ih264_intra_pred_chroma_8x8_mode_horz_a9q;
-        ps_codec->apf_intra_pred_c[2] = ih264_intra_pred_chroma_8x8_mode_vert_a9q;
-        ps_codec->apf_intra_pred_c[3] = ih264_intra_pred_chroma_8x8_mode_plane_a9q;
-
-        /* Init forward transform fn ptr */
-        ps_codec->pf_resi_trans_quant_8x8           = ih264_resi_trans_quant_8x8;
-        ps_codec->pf_resi_trans_quant_4x4           = ih264_resi_trans_quant_4x4_a9;
-        ps_codec->pf_resi_trans_quant_chroma_4x4    = ih264_resi_trans_quant_chroma_4x4_a9;
-        ps_codec->pf_hadamard_quant_4x4             = ih264_hadamard_quant_4x4_a9;
-        ps_codec->pf_hadamard_quant_2x2_uv          = ih264_hadamard_quant_2x2_uv_a9;
-
-        /* Init inverse transform fn ptr */
-        ps_codec->pf_iquant_itrans_recon_8x8            = ih264_iquant_itrans_recon_8x8;
-        ps_codec->pf_iquant_itrans_recon_4x4            = ih264_iquant_itrans_recon_4x4_a9;
-        ps_codec->pf_iquant_itrans_recon_4x4_dc         = ih264_iquant_itrans_recon_4x4_dc_a9;
-        ps_codec->pf_iquant_itrans_recon_chroma_4x4     = ih264_iquant_itrans_recon_chroma_4x4_a9;
-        ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc  = ih264_iquant_itrans_recon_chroma_4x4_dc_a9;
-        ps_codec->pf_ihadamard_scaling_4x4              = ih264_ihadamard_scaling_4x4_a9;
-        ps_codec->pf_ihadamard_scaling_2x2_uv           = ih264_ihadamard_scaling_2x2_uv_a9;
-        ps_codec->pf_interleave_copy                    = ih264_interleave_copy_a9;
-
-        /* Init fn ptr luma core coding */
-        ps_codec->luma_energy_compaction[0] = ih264e_code_luma_intra_macroblock_16x16;
-        ps_codec->luma_energy_compaction[1] = ih264e_code_luma_intra_macroblock_4x4;
-        ps_codec->luma_energy_compaction[3] = ih264e_code_luma_inter_macroblock_16x16;
-
-        /* Init fn ptr chroma core coding */
-        ps_codec->chroma_energy_compaction[0] = ih264e_code_chroma_intra_macroblock_8x8;
-        ps_codec->chroma_energy_compaction[1] = ih264e_code_chroma_inter_macroblock_8x8;
-
-        /* Init fn ptr luma deblocking */
-        ps_codec->pf_deblk_luma_vert_bs4 = ih264_deblk_luma_vert_bs4_a9;
-        ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_a9;
-        ps_codec->pf_deblk_luma_horz_bs4 = ih264_deblk_luma_horz_bs4_a9;
-        ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_a9;
-
-        /* Init fn ptr chroma deblocking */
-        ps_codec->pf_deblk_chroma_vert_bs4 = ih264_deblk_chroma_vert_bs4_a9;
-        ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_a9;
-        ps_codec->pf_deblk_chroma_horz_bs4 = ih264_deblk_chroma_horz_bs4_a9;
-        ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_a9;
-
-        /* write mb syntax layer */
-        ps_codec->pf_write_mb_syntax_layer[ISLICE] = ih264e_write_islice_mb;
-        ps_codec->pf_write_mb_syntax_layer[PSLICE] = ih264e_write_pslice_mb;
-
-        /* Padding Functions */
-        ps_codec->pf_pad_top = ih264_pad_top_a9q;
-        ps_codec->pf_pad_bottom = ih264_pad_bottom;
-        ps_codec->pf_pad_left_luma = ih264_pad_left_luma_a9q;
-        ps_codec->pf_pad_left_chroma = ih264_pad_left_chroma_a9q;
-        ps_codec->pf_pad_right_luma = ih264_pad_right_luma_a9q;
-        ps_codec->pf_pad_right_chroma = ih264_pad_right_chroma_a9q;
-
-        /* Inter pred leaf level functions */
-        ps_codec->pf_inter_pred_luma_copy = ih264_inter_pred_luma_copy_a9q;
-        ps_codec->pf_inter_pred_luma_horz = ih264_inter_pred_luma_horz_a9q;
-        ps_codec->pf_inter_pred_luma_vert = ih264_inter_pred_luma_vert_a9q;
-        ps_codec->pf_inter_pred_luma_bilinear = ih264_inter_pred_luma_bilinear_a9q;
-        ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_a9q;
-
-        /* sad me level functions */
-        ps_codec->apf_compute_sad_16x16[0] = ime_compute_sad_16x16_a9q;
-        ps_codec->apf_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_a9q;
-        ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8_a9q;
-
-        /* memor handling operations */
-        ps_codec->pf_mem_cpy = ih264_memcpy_a9q;
-        ps_codec->pf_mem_cpy_mul8 = ih264_memcpy_mul_8_a9q;
-        ps_codec->pf_mem_set = ih264_memset_a9q;
-        ps_codec->pf_mem_set_mul8 = ih264_memset_mul_8_a9q;
-
-        /* sad me level functions */
-        for(i = 0; i < (MAX_PROCESS_CTXT); i++)
-        {
-            ps_proc = &ps_codec->as_process[i];
-            ps_me_ctxt = &ps_proc->s_me_ctxt;
-            ps_me_ctxt->pf_ime_compute_sad_16x16[0] = ime_compute_sad_16x16_a9q;
-            ps_me_ctxt->pf_ime_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_a9q;
-            ps_me_ctxt->pf_ime_compute_sad_16x8 = ime_compute_sad_16x8_a9q;
-            ps_me_ctxt->pf_ime_compute_sad4_diamond = ime_calculate_sad4_prog_a9q;
-            ps_me_ctxt->pf_ime_compute_sad3_diamond = ime_calculate_sad3_prog_a9q;
-            ps_me_ctxt->pf_ime_compute_sad2_diamond = ime_calculate_sad2_prog_a9q;
-            ps_me_ctxt->pf_ime_sub_pel_compute_sad_16x16 = ime_sub_pel_compute_sad_16x16_a9q;
-            ps_me_ctxt->pf_ime_compute_sad_stat_luma_16x16 = ime_compute_satqd_16x16_lumainter_a9q;
-        }
-
-        /* intra mode eval -encoder level function */
-        ps_codec->pf_ih264e_evaluate_intra16x16_modes = ih264e_evaluate_intra16x16_modes_a9q;
-        ps_codec->pf_ih264e_evaluate_intra_chroma_modes = ih264e_evaluate_intra_chroma_modes_a9q;
-        ps_codec->pf_ih264e_evaluate_intra_4x4_modes = ih264e_evaluate_intra_4x4_modes_a9q;
-
-        /* csc */
-        ps_codec->pf_ih264e_conv_420p_to_420sp = ih264e_fmt_conv_420p_to_420sp_a9q;
-        ps_codec->pf_ih264e_fmt_conv_422i_to_420sp = ih264e_fmt_conv_422i_to_420sp_a9q;
-
-        /* Halp pel generation function - encoder level*/
-        ps_codec->pf_ih264e_sixtapfilter_horz = ih264e_sixtapfilter_horz_a9q;
-        ps_codec->pf_ih264e_sixtap_filter_2dvh_vert = ih264e_sixtap_filter_2dvh_vert_a9q;
-
-        return ;
+    /* curr proc ctxt */
+    process_ctxt_t *ps_proc = NULL;
+    me_ctxt_t *ps_me_ctxt = NULL;
+
+    /* Init function pointers for intra pred leaf level functions luma
+     * Intra 16x16 */
+    ps_codec->apf_intra_pred_16_l[0] = ih264_intra_pred_luma_16x16_mode_vert_a9q;
+    ps_codec->apf_intra_pred_16_l[1] = ih264_intra_pred_luma_16x16_mode_horz_a9q;
+    ps_codec->apf_intra_pred_16_l[2] = ih264_intra_pred_luma_16x16_mode_dc_a9q;
+    ps_codec->apf_intra_pred_16_l[3] = ih264_intra_pred_luma_16x16_mode_plane_a9q;
+
+    /* Init function pointers for intra pred leaf level functions luma
+     * Intra 4x4 */
+    ps_codec->apf_intra_pred_4_l[0] = ih264_intra_pred_luma_4x4_mode_vert_a9q;
+    ps_codec->apf_intra_pred_4_l[1] = ih264_intra_pred_luma_4x4_mode_horz_a9q;
+    ps_codec->apf_intra_pred_4_l[2] = ih264_intra_pred_luma_4x4_mode_dc_a9q;
+    ps_codec->apf_intra_pred_4_l[3] = ih264_intra_pred_luma_4x4_mode_diag_dl_a9q;
+    ps_codec->apf_intra_pred_4_l[4] = ih264_intra_pred_luma_4x4_mode_diag_dr_a9q;
+    ps_codec->apf_intra_pred_4_l[5] = ih264_intra_pred_luma_4x4_mode_vert_r_a9q;
+    ps_codec->apf_intra_pred_4_l[6] = ih264_intra_pred_luma_4x4_mode_horz_d_a9q;
+    ps_codec->apf_intra_pred_4_l[7] = ih264_intra_pred_luma_4x4_mode_vert_l_a9q;
+    ps_codec->apf_intra_pred_4_l[8] = ih264_intra_pred_luma_4x4_mode_horz_u_a9q;
+
+    /* Init function pointers for intra pred leaf level functions luma
+     * Intra 8x8 */
+    ps_codec->apf_intra_pred_8_l[0] = ih264_intra_pred_luma_8x8_mode_vert_a9q;
+    ps_codec->apf_intra_pred_8_l[2] = ih264_intra_pred_luma_8x8_mode_dc_a9q;
+    ps_codec->apf_intra_pred_8_l[3] = ih264_intra_pred_luma_8x8_mode_diag_dl_a9q;
+    ps_codec->apf_intra_pred_8_l[4] = ih264_intra_pred_luma_8x8_mode_diag_dr_a9q;
+    ps_codec->apf_intra_pred_8_l[5] = ih264_intra_pred_luma_8x8_mode_vert_r_a9q;
+    ps_codec->apf_intra_pred_8_l[6] = ih264_intra_pred_luma_8x8_mode_horz_d_a9q;
+    ps_codec->apf_intra_pred_8_l[7] = ih264_intra_pred_luma_8x8_mode_vert_l_a9q;
+    ps_codec->apf_intra_pred_8_l[8] = ih264_intra_pred_luma_8x8_mode_horz_u_a9q;
+
+    /* Init function pointers for intra pred leaf level functions chroma
+     * Intra 8x8 */
+    ps_codec->apf_intra_pred_c[0] = ih264_intra_pred_chroma_8x8_mode_dc_a9q;
+    ps_codec->apf_intra_pred_c[1] = ih264_intra_pred_chroma_8x8_mode_horz_a9q;
+    ps_codec->apf_intra_pred_c[2] = ih264_intra_pred_chroma_8x8_mode_vert_a9q;
+    ps_codec->apf_intra_pred_c[3] = ih264_intra_pred_chroma_8x8_mode_plane_a9q;
+
+    /* Init forward transform fn ptr */
+    ps_codec->pf_resi_trans_quant_8x8           = ih264_resi_trans_quant_8x8;
+    ps_codec->pf_resi_trans_quant_4x4           = ih264_resi_trans_quant_4x4_a9;
+    ps_codec->pf_resi_trans_quant_chroma_4x4    = ih264_resi_trans_quant_chroma_4x4_a9;
+    ps_codec->pf_hadamard_quant_4x4             = ih264_hadamard_quant_4x4_a9;
+    ps_codec->pf_hadamard_quant_2x2_uv          = ih264_hadamard_quant_2x2_uv_a9;
+
+    /* Init inverse transform fn ptr */
+    ps_codec->pf_iquant_itrans_recon_8x8            = ih264_iquant_itrans_recon_8x8;
+    ps_codec->pf_iquant_itrans_recon_4x4            = ih264_iquant_itrans_recon_4x4_a9;
+    ps_codec->pf_iquant_itrans_recon_4x4_dc         = ih264_iquant_itrans_recon_4x4_dc_a9;
+    ps_codec->pf_iquant_itrans_recon_chroma_4x4     = ih264_iquant_itrans_recon_chroma_4x4_a9;
+    ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc  = ih264_iquant_itrans_recon_chroma_4x4_dc_a9;
+    ps_codec->pf_ihadamard_scaling_4x4              = ih264_ihadamard_scaling_4x4_a9;
+    ps_codec->pf_ihadamard_scaling_2x2_uv           = ih264_ihadamard_scaling_2x2_uv_a9;
+    ps_codec->pf_interleave_copy                    = ih264_interleave_copy_a9;
+
+    /* Init fn ptr luma core coding */
+    ps_codec->luma_energy_compaction[0] = ih264e_code_luma_intra_macroblock_16x16;
+    ps_codec->luma_energy_compaction[1] = ih264e_code_luma_intra_macroblock_4x4;
+    ps_codec->luma_energy_compaction[3] = ih264e_code_luma_inter_macroblock_16x16;
+
+    /* Init fn ptr chroma core coding */
+    ps_codec->chroma_energy_compaction[0] = ih264e_code_chroma_intra_macroblock_8x8;
+    ps_codec->chroma_energy_compaction[1] = ih264e_code_chroma_inter_macroblock_8x8;
+
+    /* Init fn ptr luma deblocking */
+    ps_codec->pf_deblk_luma_vert_bs4 = ih264_deblk_luma_vert_bs4_a9;
+    ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_a9;
+    ps_codec->pf_deblk_luma_horz_bs4 = ih264_deblk_luma_horz_bs4_a9;
+    ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_a9;
+
+    /* Init fn ptr chroma deblocking */
+    ps_codec->pf_deblk_chroma_vert_bs4 = ih264_deblk_chroma_vert_bs4_a9;
+    ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_a9;
+    ps_codec->pf_deblk_chroma_horz_bs4 = ih264_deblk_chroma_horz_bs4_a9;
+    ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_a9;
+
+    /* write mb syntax layer */
+    ps_codec->pf_write_mb_syntax_layer[CAVLC][ISLICE] = ih264e_write_islice_mb_cavlc;
+    ps_codec->pf_write_mb_syntax_layer[CAVLC][PSLICE] = ih264e_write_pslice_mb_cavlc;
+    ps_codec->pf_write_mb_syntax_layer[CABAC][ISLICE] = ih264e_write_islice_mb_cabac;
+    ps_codec->pf_write_mb_syntax_layer[CABAC][PSLICE] = ih264e_write_pslice_mb_cabac;
+
+    /* Padding Functions */
+    ps_codec->pf_pad_top = ih264_pad_top_a9q;
+    ps_codec->pf_pad_bottom = ih264_pad_bottom;
+    ps_codec->pf_pad_left_luma = ih264_pad_left_luma_a9q;
+    ps_codec->pf_pad_left_chroma = ih264_pad_left_chroma_a9q;
+    ps_codec->pf_pad_right_luma = ih264_pad_right_luma_a9q;
+    ps_codec->pf_pad_right_chroma = ih264_pad_right_chroma_a9q;
+
+    /* Inter pred leaf level functions */
+    ps_codec->pf_inter_pred_luma_copy = ih264_inter_pred_luma_copy_a9q;
+    ps_codec->pf_inter_pred_luma_horz = ih264_inter_pred_luma_horz_a9q;
+    ps_codec->pf_inter_pred_luma_vert = ih264_inter_pred_luma_vert_a9q;
+    ps_codec->pf_inter_pred_luma_bilinear = ih264_inter_pred_luma_bilinear_a9q;
+    ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_a9q;
+
+    /* sad me level functions */
+    ps_codec->apf_compute_sad_16x16[0] = ime_compute_sad_16x16_a9q;
+    ps_codec->apf_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_a9q;
+    ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8_a9q;
+
+    /* memory handling operations */
+    ps_codec->pf_mem_cpy = ih264_memcpy_a9q;
+    ps_codec->pf_mem_cpy_mul8 = ih264_memcpy_mul_8_a9q;
+    ps_codec->pf_mem_set = ih264_memset_a9q;
+    ps_codec->pf_mem_set_mul8 = ih264_memset_mul_8_a9q;
+
+    /* sad me level functions */
+    for (i = 0; i < (MAX_PROCESS_CTXT); i++)
+    {
+        ps_proc = &ps_codec->as_process[i];
+        ps_me_ctxt = &ps_proc->s_me_ctxt;
+        ps_me_ctxt->pf_ime_compute_sad_16x16[0] = ime_compute_sad_16x16_a9q;
+        ps_me_ctxt->pf_ime_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_a9q;
+        ps_me_ctxt->pf_ime_compute_sad_16x8 = ime_compute_sad_16x8_a9q;
+        ps_me_ctxt->pf_ime_compute_sad4_diamond = ime_calculate_sad4_prog_a9q;
+        ps_me_ctxt->pf_ime_compute_sad3_diamond = ime_calculate_sad3_prog_a9q;
+        ps_me_ctxt->pf_ime_compute_sad2_diamond = ime_calculate_sad2_prog_a9q;
+        ps_me_ctxt->pf_ime_sub_pel_compute_sad_16x16 = ime_sub_pel_compute_sad_16x16_a9q;
+        ps_me_ctxt->pf_ime_compute_sad_stat_luma_16x16 = ime_compute_satqd_16x16_lumainter_a9q;
     }
 
+    /* intra mode eval -encoder level function */
+    ps_codec->pf_ih264e_evaluate_intra16x16_modes = ih264e_evaluate_intra16x16_modes_a9q;
+    ps_codec->pf_ih264e_evaluate_intra_chroma_modes = ih264e_evaluate_intra_chroma_modes_a9q;
+    ps_codec->pf_ih264e_evaluate_intra_4x4_modes = ih264e_evaluate_intra_4x4_modes_a9q;
+
+    /* csc */
+    ps_codec->pf_ih264e_conv_420p_to_420sp = ih264e_fmt_conv_420p_to_420sp_a9q;
+    ps_codec->pf_ih264e_fmt_conv_422i_to_420sp = ih264e_fmt_conv_422i_to_420sp_a9q;
+
+    /* Half pel generation function - encoder level */
+    ps_codec->pf_ih264e_sixtapfilter_horz = ih264e_sixtapfilter_horz_a9q;
+    ps_codec->pf_ih264e_sixtap_filter_2dvh_vert = ih264e_sixtap_filter_2dvh_vert_a9q;
+
+}
+
diff --git a/encoder/arm/ih264e_function_selector_av8.c b/encoder/arm/ih264e_function_selector_av8.c
index 173c2d5..1679af3 100644
--- a/encoder/arm/ih264e_function_selector_av8.c
+++ b/encoder/arm/ih264e_function_selector_av8.c
@@ -62,8 +62,8 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -72,23 +72,18 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
-
+#include "ih264_cabac_tables.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_platform_macros.h"
-#include "ih264_intra_pred_filters.h"
-#include "ih264_trans_quant_itrans_iquant.h"
-#include "ih264e_defs.h"
-#include "ih264e_structs.h"
-#include "ih264_deblk_edge_filters.h"
+#include "ih264e_cabac.h"
 #include "ih264e_core_coding.h"
 #include "ih264_cavlc_tables.h"
 #include "ih264e_cavlc.h"
-#include "ih264_padding.h"
 #include "ih264e_intra_modes_eval.h"
-#include "ih264_mem_fns.h"
 #include "ih264e_fmt_conv.h"
 #include "ih264e_half_pel.h"
 
@@ -197,8 +192,12 @@ void ih264e_init_function_ptr_neon_av8(codec_t *ps_codec)
         ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_av8;
 
         /* write mb syntax layer */
-        ps_codec->pf_write_mb_syntax_layer[ISLICE] = ih264e_write_islice_mb;
-        ps_codec->pf_write_mb_syntax_layer[PSLICE] = ih264e_write_pslice_mb;
+        /* CAVLC writers for I, P and B slices; CABAC writers for I and P slices */
+        ps_codec->pf_write_mb_syntax_layer[CAVLC][ISLICE] = ih264e_write_islice_mb_cavlc;
+        ps_codec->pf_write_mb_syntax_layer[CAVLC][PSLICE] = ih264e_write_pslice_mb_cavlc;
+        ps_codec->pf_write_mb_syntax_layer[CAVLC][BSLICE] = ih264e_write_bslice_mb_cavlc;
+        ps_codec->pf_write_mb_syntax_layer[CABAC][ISLICE] = ih264e_write_islice_mb_cabac;
+        ps_codec->pf_write_mb_syntax_layer[CABAC][PSLICE] = ih264e_write_pslice_mb_cabac;
 
         /* Padding Functions */
         ps_codec->pf_pad_top = ih264_pad_top_av8;
diff --git a/encoder/arm/ih264e_half_pel.s b/encoder/arm/ih264e_half_pel.s
index 1b9a87a..3ae6130 100644
--- a/encoder/arm/ih264e_half_pel.s
+++ b/encoder/arm/ih264e_half_pel.s
@@ -43,7 +43,6 @@
 .text
 .p2align 2
 
-@ /**
 @/*******************************************************************************
 @*
 @* @brief
diff --git a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
index e768c21..df06d41 100644
--- a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
+++ b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
@@ -17,7 +17,6 @@
 //*****************************************************************************
 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 //*/
-///**
 
 ///**
 //******************************************************************************
@@ -97,7 +96,6 @@ ih264e_evaluate_intra16x16_modes_av8:
 
     ldr       x16, [sp, #80]
     mov       x17, x4
-    mov       x18, x5
     mov       x14, x6
     mov       x15, x7
 
@@ -503,9 +501,9 @@ sad_comp:
 
     ///----------------------
     //DO VERTICAL PREDICTION
-    str       x8 , [x7]                 //MIN SAD
-    mov       x8, #0
-    str       x8 , [x6]                 // MODE
+    str       w8 , [x7]                 //MIN SAD
+    mov       w8, #0
+    str       w8 , [x6]                 // MODE
     add       x6, x1, #17
     ld1       {v30.16b}, [x6]
     b         do_dc_vert
@@ -515,9 +513,9 @@ not_vert: cmp x9, x10
 
     ///----------------------
     //DO HORIZONTAL
-    str       x9 , [x7]                 //MIN SAD
-    mov       x9, #1
-    str       x9 , [x6]                 // MODE
+    str       w9 , [x7]                 //MIN SAD
+    mov       w9, #1
+    str       w9 , [x6]                 // MODE
 
     ld1       {v0.16b}, [x1]
     dup       v10.16b, v0.b[15]
@@ -562,9 +560,9 @@ not_vert: cmp x9, x10
 
 do_dc: ///---------------------------------
     //DO DC
-    str       x10 , [x7]                //MIN SAD
-    mov       x10, #2
-    str       x10 , [x6]                // MODE
+    str       w10 , [x7]                //MIN SAD
+    mov       w10, #2
+    str       w10 , [x6]                // MODE
 do_dc_vert:
     st1       {v30.4s}, [x2], x4        //0
     st1       {v30.4s}, [x2], x4        //1
diff --git a/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s
index b02afd1..bb2526d 100644
--- a/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s
+++ b/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s
@@ -17,7 +17,6 @@
 //*****************************************************************************
 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 //*/
-///**
 
 ///**
 //******************************************************************************
@@ -401,10 +400,10 @@ sad_comp:
 
     ///----------------------
     //DO DC PREDICTION
-    str       x10 , [x7]                //MIN SAD
+    str       w10 , [x7]                //MIN SAD
 
-    mov       x10, #0
-    str       x10 , [x6]                // MODE
+    mov       w10, #0
+    str       w10 , [x6]                // MODE
 
     b         do_dc_vert
     //-----------------------------
@@ -414,10 +413,10 @@ not_dc:
     bgt       do_vert
     ///----------------------
     //DO HORIZONTAL
-    str       x9 , [x7]                 //MIN SAD
+    str       w9 , [x7]                 //MIN SAD
 
-    mov       x10, #1
-    str       x10 , [x6]                // MODE
+    mov       w10, #1
+    str       w10 , [x6]                // MODE
     ld1       {v0.8h}, [x1]
 
     dup       v10.8h, v0.h[7]
@@ -441,9 +440,9 @@ not_dc:
 
 do_vert:
     //DO VERTICAL PREDICTION
-    str       x8 , [x7]                 //MIN SAD
-    mov       x8, #2
-    str       x8 , [x6]                 // MODE
+    str       w8 , [x7]                 //MIN SAD
+    mov       w8, #2
+    str       w8 , [x6]                 // MODE
     add       x6, x1, #18
     ld1       {v28.8b, v29.8b}, [x6]    // vertical values
     ld1       {v30.8b, v31.8b}, [x6]    // vertical values
diff --git a/encoder/armv8/ih264e_half_pel_av8.s b/encoder/armv8/ih264e_half_pel_av8.s
index 817faa6..8f27104 100644
--- a/encoder/armv8/ih264e_half_pel_av8.s
+++ b/encoder/armv8/ih264e_half_pel_av8.s
@@ -44,7 +44,6 @@
 .p2align 2
 .include "ih264_neon_macros.s"
 
-// /**
 ///*******************************************************************************
 //*
 //* @brief
@@ -280,8 +279,8 @@ ih264e_sixtap_filter_2dvh_vert_av8:
     ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3
     mov       x14, #20
     ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3
-    mov       v0.4h[0], w12
-    mov       v0.4h[1], w14
+    mov       v0.h[0], w12
+    mov       v0.h[1], w14
     ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3
     movi      v1.8b, #20
 
@@ -333,10 +332,10 @@ filter_2dvh_loop:
 
     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
     ext       v31.8b, v22.8b , v23.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set1)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set1)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
-    smlsl     v26.4s, v21.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
+    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
     ext       v30.8b, v21.8b , v22.8b , #4
 
     sqrshrun  v4.8b, v24.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
@@ -344,10 +343,10 @@ filter_2dvh_loop:
 
     ext       v28.8b, v21.8b , v22.8b , #2
     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
-    smlal     v20.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set2)
-    smlal     v20.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set2)
-    smlsl     v20.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
-    smlsl     v20.4s, v22.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
+    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
+    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
+    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
+    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
     ext       v31.8b, v23.8b , v24.8b , #2
     mov       v21.d[0], v20.d[1]
     ext       v2.8b, v2.8b , v3.8b , #2
@@ -362,10 +361,10 @@ filter_2dvh_loop:
 
     saddl     v2.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
     ext       v28.8b, v22.8b , v23.8b , #2
-    smlal     v2.4s, v30.4h, v0.4h[1]   //// a0 + a5 + 20a2                         (set3)
-    smlal     v2.4s, v29.4h, v0.4h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
-    smlsl     v2.4s, v28.4h, v0.4h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
-    smlsl     v2.4s, v23.4h, v0.4h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
+    smlal     v2.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
+    smlal     v2.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
+    smlsl     v2.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
+    smlsl     v2.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
     ext       v31.8b, v24.8b , v25.8b , #2
 
     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
@@ -376,10 +375,10 @@ filter_2dvh_loop:
     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
     ext       v28.8b, v23.8b , v24.8b , #2
     ext       v31.8b, v25.8b , v25.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set4)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set4)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
-    smlsl     v26.4s, v24.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
+    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
     ext       v30.8b, v24.8b , v25.8b , #4
 
     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
@@ -389,10 +388,10 @@ filter_2dvh_loop:
     shrn      v28.4h, v2.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
 
     ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3 //// Load next Row data
-    smlal     v22.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set5)
-    smlal     v22.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set5)
-    smlsl     v22.4s, v31.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
-    smlsl     v22.4s, v25.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
+    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
+    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
+    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
+    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
     mov       v20.d[1], v21.d[0]
     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
@@ -449,10 +448,10 @@ filter_2dvh_loop:
 
     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
     ext       v31.8b, v22.8b , v23.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set1)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set1)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
-    smlsl     v26.4s, v21.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
+    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
     ext       v30.8b, v21.8b , v22.8b , #4
 
     sqrshrun  v7.8b, v24.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
@@ -460,10 +459,10 @@ filter_2dvh_loop:
 
     ext       v28.8b, v21.8b , v22.8b , #2
     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
-    smlal     v20.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set2)
-    smlal     v20.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set2)
-    smlsl     v20.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
-    smlsl     v20.4s, v22.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
+    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
+    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
+    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
+    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
     ext       v31.8b, v23.8b , v24.8b , #2
 
     ext       v5.8b, v5.8b , v6.8b , #2
@@ -478,10 +477,10 @@ filter_2dvh_loop:
 
     saddl     v6.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
     ext       v28.8b, v22.8b , v23.8b , #2
-    smlal     v6.4s, v30.4h, v0.4h[1]   //// a0 + a5 + 20a2                         (set3)
-    smlal     v6.4s, v29.4h, v0.4h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
-    smlsl     v6.4s, v28.4h, v0.4h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
-    smlsl     v6.4s, v23.4h, v0.4h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
+    smlal     v6.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
+    smlal     v6.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
+    smlsl     v6.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
+    smlsl     v6.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
     ext       v31.8b, v24.8b , v25.8b , #2
 
     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
@@ -492,10 +491,10 @@ filter_2dvh_loop:
     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
     ext       v28.8b, v23.8b , v24.8b , #2
     ext       v31.8b, v25.8b , v25.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set4)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set4)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
-    smlsl     v26.4s, v24.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
+    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
     ext       v30.8b, v24.8b , v25.8b , #4
 
     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
@@ -505,10 +504,10 @@ filter_2dvh_loop:
     shrn      v28.4h, v6.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
 
     ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3 //// Load next Row data
-    smlal     v22.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set5)
-    smlal     v22.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set5)
-    smlsl     v22.4s, v31.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
-    smlsl     v22.4s, v25.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
+    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
+    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
+    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
+    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
     mov       v20.d[1], v21.d[0]
     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
@@ -564,10 +563,10 @@ filter_2dvh_loop:
 
     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
     ext       v31.8b, v22.8b , v23.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set1)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set1)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
-    smlsl     v26.4s, v21.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
+    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
     ext       v30.8b, v21.8b , v22.8b , #4
 
     sqrshrun  v10.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
@@ -575,10 +574,10 @@ filter_2dvh_loop:
 
     ext       v28.8b, v21.8b , v22.8b , #2
     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
-    smlal     v20.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set2)
-    smlal     v20.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set2)
-    smlsl     v20.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
-    smlsl     v20.4s, v22.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
+    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
+    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
+    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
+    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
     ext       v31.8b, v23.8b , v24.8b , #2
 
     ext       v8.8b, v8.8b , v9.8b , #2
@@ -593,10 +592,10 @@ filter_2dvh_loop:
 
     saddl     v8.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
     ext       v28.8b, v22.8b , v23.8b , #2
-    smlal     v8.4s, v30.4h, v0.4h[1]   //// a0 + a5 + 20a2                         (set3)
-    smlal     v8.4s, v29.4h, v0.4h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
-    smlsl     v8.4s, v28.4h, v0.4h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
-    smlsl     v8.4s, v23.4h, v0.4h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
+    smlal     v8.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
+    smlal     v8.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
+    smlsl     v8.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
+    smlsl     v8.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
     ext       v31.8b, v24.8b , v25.8b , #2
 
     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
@@ -607,10 +606,10 @@ filter_2dvh_loop:
     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
     ext       v28.8b, v23.8b , v24.8b , #2
     ext       v31.8b, v25.8b , v25.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set4)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set4)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
-    smlsl     v26.4s, v24.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
+    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
     ext       v30.8b, v24.8b , v25.8b , #4
 
     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
@@ -620,10 +619,10 @@ filter_2dvh_loop:
     shrn      v28.4h, v8.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
 
     ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3 //// Load next Row data
-    smlal     v22.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set5)
-    smlal     v22.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set5)
-    smlsl     v22.4s, v31.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
-    smlsl     v22.4s, v25.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
+    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
+    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
+    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
+    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
     mov       v20.d[1], v21.d[0]
     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
@@ -678,10 +677,10 @@ filter_2dvh_loop:
 
     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
     ext       v31.8b, v22.8b , v23.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set1)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set1)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
-    smlsl     v26.4s, v21.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
+    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
     ext       v30.8b, v21.8b , v22.8b , #4
 
     sqrshrun  v13.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
@@ -689,10 +688,10 @@ filter_2dvh_loop:
 
     ext       v28.8b, v21.8b , v22.8b , #2
     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
-    smlal     v20.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set2)
-    smlal     v20.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set2)
-    smlsl     v20.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
-    smlsl     v20.4s, v22.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
+    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
+    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
+    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
+    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
     ext       v31.8b, v23.8b , v24.8b , #2
 
     ext       v11.8b, v11.8b , v12.8b , #2
@@ -707,10 +706,10 @@ filter_2dvh_loop:
 
     saddl     v12.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
     ext       v28.8b, v22.8b , v23.8b , #2
-    smlal     v12.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set3)
-    smlal     v12.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set3)
-    smlsl     v12.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
-    smlsl     v12.4s, v23.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
+    smlal     v12.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
+    smlal     v12.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
+    smlsl     v12.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
+    smlsl     v12.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
     ext       v31.8b, v24.8b , v25.8b , #2
 
     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
@@ -721,10 +720,10 @@ filter_2dvh_loop:
     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
     ext       v28.8b, v23.8b , v24.8b , #2
     ext       v31.8b, v25.8b , v25.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set4)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set4)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
-    smlsl     v26.4s, v24.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
+    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
     ext       v30.8b, v24.8b , v25.8b , #4
 
     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
@@ -734,10 +733,10 @@ filter_2dvh_loop:
     shrn      v28.4h, v12.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
 
     ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3 //// Load next Row data
-    smlal     v22.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set5)
-    smlal     v22.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set5)
-    smlsl     v22.4s, v31.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
-    smlsl     v22.4s, v25.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
+    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
+    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
+    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
+    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
     mov       v20.d[1], v21.d[0]
     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
@@ -792,10 +791,10 @@ filter_2dvh_loop:
 
     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
     ext       v31.8b, v22.8b , v23.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set1)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set1)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
-    smlsl     v26.4s, v21.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
+    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
     ext       v30.8b, v21.8b , v22.8b , #4
 
     sqrshrun  v16.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
@@ -803,10 +802,10 @@ filter_2dvh_loop:
 
     ext       v28.8b, v21.8b , v22.8b , #2
     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
-    smlal     v20.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set2)
-    smlal     v20.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set2)
-    smlsl     v20.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
-    smlsl     v20.4s, v22.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
+    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
+    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
+    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
+    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
     ext       v31.8b, v23.8b , v24.8b , #2
 
     ext       v14.8b, v14.8b , v15.8b , #2
@@ -821,10 +820,10 @@ filter_2dvh_loop:
 
     saddl     v14.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
     ext       v28.8b, v22.8b , v23.8b , #2
-    smlal     v14.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set3)
-    smlal     v14.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set3)
-    smlsl     v14.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
-    smlsl     v14.4s, v23.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
+    smlal     v14.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
+    smlal     v14.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
+    smlsl     v14.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
+    smlsl     v14.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
     ext       v31.8b, v24.8b , v25.8b , #2
 
     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
@@ -835,10 +834,10 @@ filter_2dvh_loop:
     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
     ext       v28.8b, v23.8b , v24.8b , #2
     ext       v31.8b, v25.8b , v25.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set4)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set4)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
-    smlsl     v26.4s, v24.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
+    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
     ext       v30.8b, v24.8b , v25.8b , #4
 
     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
@@ -848,10 +847,10 @@ filter_2dvh_loop:
     shrn      v28.4h, v14.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
 
     ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3 //// Load next Row data
-    smlal     v22.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set5)
-    smlal     v22.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set5)
-    smlsl     v22.4s, v31.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
-    smlsl     v22.4s, v25.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
+    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
+    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
+    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
+    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
     mov       v20.d[1], v21.d[0]
     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
@@ -909,10 +908,10 @@ filter_2dvh_loop:
 
     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
     ext       v31.8b, v22.8b , v23.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set1)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set1)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
-    smlsl     v26.4s, v21.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
+    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
     ext       v30.8b, v21.8b , v22.8b , #4
 
     sqrshrun  v19.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
@@ -920,10 +919,10 @@ filter_2dvh_loop:
 
     ext       v28.8b, v21.8b , v22.8b , #2
     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
-    smlal     v20.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set2)
-    smlal     v20.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set2)
-    smlsl     v20.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
-    smlsl     v20.4s, v22.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
+    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
+    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
+    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
+    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
     ext       v31.8b, v23.8b , v24.8b , #2
 
     ext       v17.8b, v17.8b , v18.8b , #2
@@ -938,10 +937,10 @@ filter_2dvh_loop:
 
     saddl     v18.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
     ext       v28.8b, v22.8b , v23.8b , #2
-    smlal     v18.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set3)
-    smlal     v18.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set3)
-    smlsl     v18.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
-    smlsl     v18.4s, v23.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
+    smlal     v18.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
+    smlal     v18.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
+    smlsl     v18.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
+    smlsl     v18.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
     ext       v31.8b, v24.8b , v25.8b , #2
 
     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
@@ -952,10 +951,10 @@ filter_2dvh_loop:
     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
     ext       v28.8b, v23.8b , v24.8b , #2
     ext       v31.8b, v25.8b , v25.8b , #2
-    smlal     v26.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set4)
-    smlal     v26.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set4)
-    smlsl     v26.4s, v28.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
-    smlsl     v26.4s, v24.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
+    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
+    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
+    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
+    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
     ext       v30.8b, v24.8b , v25.8b , #4
 
     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
@@ -965,10 +964,10 @@ filter_2dvh_loop:
     shrn      v28.4h, v18.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
 
     ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3 //// Load next Row data
-    smlal     v22.4s, v30.4h, v0.4h[1]  //// a0 + a5 + 20a2                         (set5)
-    smlal     v22.4s, v29.4h, v0.4h[1]  //// a0 + a5 + 20a2 + 20a3                  (set5)
-    smlsl     v22.4s, v31.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
-    smlsl     v22.4s, v25.4h, v0.4h[0]  //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
+    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
+    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
+    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
+    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
     mov       v20.d[1], v21.d[0]
     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
diff --git a/encoder/ih264e_api.c b/encoder/ih264e_api.c
index 8a478bb..96122de 100644
--- a/encoder/ih264e_api.c
+++ b/encoder/ih264e_api.c
@@ -93,6 +93,7 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "ih264_macros.h"
 #include "ih264e_defs.h"
 #include "ih264e_globals.h"
@@ -109,10 +110,10 @@
 #include "ime_defs.h"
 #include "ime_distortion_metrics.h"
 #include "ime_structs.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_utils.h"
 #include "ih264e_core_coding.h"
-#include "ih264_buf_mgr.h"
 #include "ih264_platform_macros.h"
 #include "ih264e_platform_macros.h"
 #include "ih264_list.h"
@@ -399,7 +400,8 @@ static IV_STATUS_T api_check_struct_sanity(iv_obj_t *ps_handle,
                 return (IV_FAIL);
             }
 
-            if (ps_ip->s_ive_ip.u4_max_ref_cnt != 1)
+            if (ps_ip->s_ive_ip.u4_max_ref_cnt > MAX_REF_PIC_CNT ||
+                           ps_ip->s_ive_ip.u4_max_ref_cnt < MIN_REF_PIC_CNT)
             {
                 ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM;
                 ps_op->s_ive_op.u4_error_code |= IH264E_NUM_REF_UNSUPPORTED;
@@ -482,7 +484,15 @@ static IV_STATUS_T api_check_struct_sanity(iv_obj_t *ps_handle,
                 return (IV_FAIL);
             }
 
-            if (ps_ip->s_ive_ip.u4_max_num_bframes != 0)
+            if (ps_ip->s_ive_ip.u4_num_bframes > MAX_NUM_BFRAMES)
+            {
+                ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM;
+                ps_op->s_ive_op.u4_error_code |= IH264E_BFRAMES_NOT_SUPPORTED;
+                return (IV_FAIL);
+            }
+
+            if (ps_ip->s_ive_ip.u4_num_bframes
+                            && (ps_ip->s_ive_ip.u4_max_ref_cnt < 2))
             {
                 ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM;
                 ps_op->s_ive_op.u4_error_code |= IH264E_BFRAMES_NOT_SUPPORTED;
@@ -1472,15 +1482,6 @@ static IV_STATUS_T api_check_struct_sanity(iv_obj_t *ps_handle,
                         return IV_FAIL;
                     }
 
-                    if (ps_ip->s_ive_ip.u4_num_b_frames != 0)
-                    {
-                        ps_op->s_ive_op.u4_error_code |= 1
-                                        << IVE_UNSUPPORTEDPARAM;
-                        ps_op->s_ive_op.u4_error_code |=
-                                        IH264E_BFRAMES_NOT_SUPPORTED;
-                        return IV_FAIL;
-                    }
-
                     break;
                 }
 
@@ -1560,7 +1561,11 @@ static IV_STATUS_T api_check_struct_sanity(iv_obj_t *ps_handle,
                         return IV_FAIL;
                     }
 
-                    if ((ps_ip->s_ive_ip.u4_i_qp_min > ps_ip->s_ive_ip.u4_i_qp_max)
+                    /* We do not support QP < 4 */
+                    if ((ps_ip->s_ive_ip.u4_i_qp_min < 4)
+                                    || (ps_ip->s_ive_ip.u4_p_qp_min < 4)
+                                    || (ps_ip->s_ive_ip.u4_b_qp_min < 4)
+                                    || (ps_ip->s_ive_ip.u4_i_qp_min > ps_ip->s_ive_ip.u4_i_qp_max)
                                     || (ps_ip->s_ive_ip.u4_p_qp_min > ps_ip->s_ive_ip.u4_p_qp_max)
                                     || (ps_ip->s_ive_ip.u4_b_qp_min > ps_ip->s_ive_ip.u4_b_qp_max))
                     {
@@ -1743,7 +1748,8 @@ static IV_STATUS_T api_check_struct_sanity(iv_obj_t *ps_handle,
                         return IV_FAIL;
                     }
 
-                    if (ps_ip->s_ive_ip.e_profile != IV_PROFILE_BASE)
+                    if (ps_ip->s_ive_ip.e_profile != IV_PROFILE_BASE &&
+                        ps_ip->s_ive_ip.e_profile != IV_PROFILE_MAIN)
                     {
                         ps_op->s_ive_op.u4_error_code |= 1
                                         << IVE_UNSUPPORTEDPARAM;
@@ -1832,7 +1838,6 @@ IH264E_ERROR_T ih264e_codec_update_config(codec_t *ps_codec,
             ps_curr_cfg->i4_wd_mbs = ps_curr_cfg->u4_wd >> 4;
             ps_curr_cfg->i4_ht_mbs = ps_curr_cfg->u4_ht >> 4;
 
-            ps_codec->i4_src_strd = ps_codec->s_cfg.u4_strd;
             ps_codec->i4_rec_strd = ALIGN16(ps_cfg->u4_wd) + PAD_WD;
 
             /* If number of MBs in a frame changes the air map also changes.
@@ -1864,7 +1869,7 @@ IH264E_ERROR_T ih264e_codec_update_config(codec_t *ps_codec,
             u4_init_rc = 1;
 
             /* when the dimension changes, the header needs to be regenerated */
-            ps_codec->i4_header_mode = 1;
+            ps_codec->i4_gen_header = 1;
         }
     }
     else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_FRAMERATE)
@@ -2091,7 +2096,6 @@ IH264E_ERROR_T ih264e_codec_update_config(codec_t *ps_codec,
 
         ps_curr_cfg->u4_idr_frm_interval = ps_cfg->u4_idr_frm_interval;
 
-        ps_curr_cfg->u4_num_b_frames = ps_cfg->u4_num_b_frames;
     }
     else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_DEBLOCK_PARAMS)
     {
@@ -2188,6 +2192,7 @@ IH264E_ERROR_T ih264e_codec_update_config(codec_t *ps_codec,
     else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_PROFILE_PARAMS)
     {
         ps_codec->s_cfg.e_profile = ps_cfg->e_profile;
+        ps_codec->s_cfg.u4_entropy_coding_mode = ps_cfg->u4_entropy_coding_mode;
     }
     else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_NUM_CORES)
     {
@@ -2259,8 +2264,9 @@ IH264E_ERROR_T ih264e_codec_update_config(codec_t *ps_codec,
                        ps_codec->s_cfg.u4_target_bitrate,
                        ps_codec->s_cfg.u4_max_bitrate,
                        ps_codec->s_cfg.u4_vbv_buffer_delay,
-                       ps_codec->s_cfg.u4_i_frm_interval, au1_init_qp,
-                       H264_ALLOC_INTER_FRM_INTV, au1_min_max_qp,
+                       ps_codec->s_cfg.u4_i_frm_interval,
+                       ps_codec->s_cfg.u4_num_bframes + 1, au1_init_qp,
+                       ps_codec->s_cfg.u4_num_bframes + 2, au1_min_max_qp,
                        ps_codec->s_cfg.u4_max_level);
     }
 
@@ -2302,7 +2308,7 @@ static WORD32 ih264e_set_default_params(cfg_params_t *ps_cfg)
     ps_cfg->e_rc_mode = DEFAULT_RC;
     ps_cfg->u4_max_framerate = DEFAULT_MAX_FRAMERATE;
     ps_cfg->u4_max_bitrate = DEFAULT_MAX_BITRATE;
-    ps_cfg->u4_max_num_bframes = 0;
+    ps_cfg->u4_num_bframes = DEFAULT_MAX_NUM_BFRAMES;
     ps_cfg->e_content_type = IV_PROGRESSIVE;
     ps_cfg->u4_max_srch_rng_x = DEFAULT_MAX_SRCH_RANGE_X;
     ps_cfg->u4_max_srch_rng_y = DEFAULT_MAX_SRCH_RANGE_Y;
@@ -2350,7 +2356,6 @@ static WORD32 ih264e_set_default_params(cfg_params_t *ps_cfg)
     ps_cfg->u4_srch_rng_y = DEFAULT_SRCH_RNG_Y;
     ps_cfg->u4_i_frm_interval = DEFAULT_I_INTERVAL;
     ps_cfg->u4_idr_frm_interval = DEFAULT_IDR_INTERVAL;
-    ps_cfg->u4_num_b_frames = DEFAULT_B_FRAMES;
     ps_cfg->u4_disable_deblock_level = DEFAULT_DISABLE_DEBLK_LEVEL;
     ps_cfg->e_profile = DEFAULT_PROFILE;
     ps_cfg->u4_timestamp_low = 0;
@@ -2396,7 +2401,7 @@ static WORD32 ih264e_init(codec_t *ps_codec)
     WORD32 i;
 
     /* coded pic count */
-    ps_codec->i4_coded_pic_cnt = 0;
+    ps_codec->i4_poc = 0;
 
     /* Number of API calls to encode are made */
     ps_codec->i4_encode_api_call_cnt = -1;
@@ -2422,7 +2427,7 @@ static WORD32 ih264e_init(codec_t *ps_codec)
     ps_codec->i4_disable_deblk_pic_cnt = 0;
 
     /* frame num */
-    ps_codec->i4_frame_num = -1;
+    ps_codec->i4_frame_num = 0;
 
     /* set the current frame type to I frame, since we are going to start  encoding*/
     ps_codec->force_curr_frame_type = IV_NA_FRAME;
@@ -2502,7 +2507,7 @@ static WORD32 ih264e_init(codec_t *ps_codec)
     {
         WORD32 max_mb_rows = ps_cfg->i4_ht_mbs;
 
-        WORD32 num_jobs = max_mb_rows * 2;
+        WORD32 num_jobs = max_mb_rows * MAX_CTXT_SETS;
         WORD32 clz;
 
         /* Use next power of two number of entries*/
@@ -2619,10 +2624,11 @@ static WORD32 ih264e_init(codec_t *ps_codec)
 */
 static WORD32 ih264e_get_num_rec(void *pv_api_ip, void *pv_api_op)
 {
-    UNUSED(pv_api_ip);
     /* api call I/O structures */
     ih264e_num_mem_rec_op_t *ps_op = pv_api_op;
 
+    UNUSED(pv_api_ip);
+
     ps_op->s_ive_op.u4_num_mem_rec = MEM_REC_CNT;
 
     return IV_SUCCESS;
@@ -2674,8 +2680,6 @@ static WORD32 ih264e_fill_num_mem_rec(void *pv_api_ip, void *pv_api_op)
     /* error status */
     IV_STATUS_T status = IV_SUCCESS;
 
-    /* profile / level info */
-    level = ps_ip->s_ive_ip.u4_max_level;
     num_reorder_frames = ps_ip->s_ive_ip.u4_max_reorder_cnt;
     num_ref_frames = ps_ip->s_ive_ip.u4_max_ref_cnt;
 
@@ -2692,6 +2696,9 @@ static WORD32 ih264e_fill_num_mem_rec(void *pv_api_ip, void *pv_api_op)
     max_mb_cols = max_wd_luma / MB_SIZE;
     max_mb_cnt = max_mb_rows * max_mb_cols;
 
+    /* profile / level info */
+    level = ih264e_get_min_level(max_ht_luma, max_wd_luma);
+
     /* validate params */
     if ((level < MIN_LEVEL) || (level > MAX_LEVEL))
     {
@@ -2739,10 +2746,30 @@ static WORD32 ih264e_fill_num_mem_rec(void *pv_api_ip, void *pv_api_op)
     DEBUG("\nMemory record Id %d = %d \n", MEM_REC_CODEC, ps_mem_rec->u4_mem_size);
 
     /************************************************************************
+     * Request memory for CABAC context                                     *
+     ***********************************************************************/
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_CABAC];
+    {
+        ps_mem_rec->u4_mem_size = sizeof(cabac_ctxt_t);
+    }
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_CABAC, ps_mem_rec->u4_mem_size);
+
+    /************************************************************************
+     * Request memory for CABAC MB info                                     *
+     ***********************************************************************/
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_CABAC_MB_INFO];
+    {
+        ps_mem_rec->u4_mem_size = ((max_mb_cols + 1) + 1)
+                        * sizeof(mb_info_ctxt_t);
+    }
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_CABAC_MB_INFO, ps_mem_rec->u4_mem_size);
+
+
+    /************************************************************************
      *  Request memory for entropy context                                  *
      *  In multi core encoding, each row is assumed to be launched on a     *
      *  thread. The rows below can only start after its neighbors are coded *
-     *  The status of an mb coded/uncoded is signaled via entropy map.     *
+     *  The status of an mb coded/uncoded is signaled via entropy map.      *
      *         1. One word32 to store skip run cnt                          *
      *         2. mb entropy map (mb status entropy coded/uncoded). The size*
      *            of the entropy map is max mb cols. Further allocate one   *
@@ -3042,7 +3069,7 @@ static WORD32 ih264e_fill_num_mem_rec(void *pv_api_ip, void *pv_api_op)
     {
         /* One process job per row of MBs */
         /* Allocate for two pictures, so that wrap around can be handled easily */
-        WORD32 num_jobs = max_mb_rows * 2;
+        WORD32 num_jobs = max_mb_rows * MAX_CTXT_SETS;
 
         WORD32 job_queue_size = ih264_list_size(num_jobs, sizeof(job_t));
 
@@ -3057,7 +3084,7 @@ static WORD32 ih264e_fill_num_mem_rec(void *pv_api_ip, void *pv_api_op)
     {
         /* One process job per row of MBs */
         /* Allocate for two pictures, so that wrap around can be handled easily */
-        WORD32 num_jobs = max_mb_rows * 2;
+        WORD32 num_jobs = max_mb_rows * MAX_CTXT_SETS;
 
         WORD32 job_queue_size = ih264_list_size(num_jobs, sizeof(job_t));
 
@@ -3177,6 +3204,7 @@ static WORD32 ih264e_fill_num_mem_rec(void *pv_api_ip, void *pv_api_op)
     ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_SCRATCH];
     {
         WORD32 total_size = 0;
+        WORD32 i4_tmp_size;
 
         /* size to hold prediction buffer */
         total_size += sizeof(UWORD8) * 16 * 16;
@@ -3215,14 +3243,8 @@ static WORD32 ih264e_fill_num_mem_rec(void *pv_api_ip, void *pv_api_op)
         total_size = ALIGN64(total_size);
 
         /* Buffers for holding half_x , half_y and half_xy planes */
-        total_size += sizeof(UWORD8) * (HP_BUFF_WD * HP_BUFF_HT);
-        total_size = ALIGN64(total_size);
-
-        total_size += sizeof(UWORD8) * (HP_BUFF_WD * HP_BUFF_HT);
-        total_size = ALIGN64(total_size);
-
-        total_size += sizeof(UWORD8) * (HP_BUFF_WD * HP_BUFF_HT);
-        total_size = ALIGN64(total_size);
+        i4_tmp_size = sizeof(UWORD8) * (HP_BUFF_WD * HP_BUFF_HT);
+        total_size += (ALIGN64(i4_tmp_size) * SUBPEL_BUFF_CNT);
 
         /* Allocate for each process thread */
         total_size *= MAX_PROCESS_CTXT;
@@ -3449,9 +3471,9 @@ static WORD32 ih264e_fill_num_mem_rec(void *pv_api_ip, void *pv_api_op)
      ************************************************************************/
     ps_mem_rec = &ps_mem_rec_base[MEM_REC_MB_INFO_NMB];
     {
-        ps_mem_rec->u4_mem_size = MAX_PROCESS_CTXT * MAX_NMB
-                        * (sizeof(mb_info_nmb_t)
-                                        + MB_SIZE * MB_SIZE * sizeof(UWORD8));
+        ps_mem_rec->u4_mem_size = MAX_PROCESS_CTXT * max_mb_cols *
+                                 (sizeof(mb_info_nmb_t) + MB_SIZE * MB_SIZE 
+                                  * sizeof(UWORD8));
     }
     DEBUG("\nMemory record Id %d = %d \n", MEM_REC_MB_INFO_NMB, ps_mem_rec->u4_mem_size);
 
@@ -3517,6 +3539,9 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
     /* codec variables */
     codec_t * ps_codec;
+    cabac_ctxt_t *ps_cabac;
+    mb_info_ctxt_t *ps_mb_map_ctxt_inc;
+
     cfg_params_t *ps_cfg;
 
     /* frame dimensions */
@@ -3524,7 +3549,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
     WORD32 max_mb_rows, max_mb_cols, max_mb_cnt;
 
     /* temp var */
-    WORD32 i;
+    WORD32 i, j;
     WORD32 status = IV_SUCCESS;
 
     /* frame dimensions */
@@ -3543,11 +3568,23 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
         ps_codec_obj->pv_codec_handle = ps_mem_rec->pv_base;
         ps_codec = (codec_t *) (ps_codec_obj->pv_codec_handle);
     }
+    /* Init mem record for the CABAC context */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_CABAC];
+    {
+        ps_cabac = (cabac_ctxt_t *)(ps_mem_rec->pv_base);
+    }
+
+    /* Init mem records mb info array for CABAC */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_CABAC_MB_INFO];
+    {
+        ps_mb_map_ctxt_inc = (mb_info_ctxt_t *)(ps_mem_rec->pv_base);
+    }
 
     /* Note this memset can not be done in init() call, since init will called
      during reset as well. And calling this during reset will mean all pointers
      need to reinitialized */
     memset(ps_codec, 0, sizeof(codec_t));
+    memset(ps_cabac, 0, sizeof(cabac_ctxt_t));
 
     /* Set default Config Params */
     ps_cfg = &ps_codec->s_cfg;
@@ -3565,7 +3602,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
     ps_cfg->e_recon_color_fmt = ps_ip->s_ive_ip.e_recon_color_fmt;
     ps_cfg->u4_max_framerate = ps_ip->s_ive_ip.u4_max_framerate;
     ps_cfg->u4_max_bitrate = ps_ip->s_ive_ip.u4_max_bitrate;
-    ps_cfg->u4_max_num_bframes = ps_ip->s_ive_ip.u4_max_num_bframes;
+    ps_cfg->u4_num_bframes = ps_ip->s_ive_ip.u4_num_bframes;
     ps_cfg->e_content_type = ps_ip->s_ive_ip.e_content_type;
     ps_cfg->u4_max_srch_rng_x = ps_ip->s_ive_ip.u4_max_srch_rng_x;
     ps_cfg->u4_max_srch_rng_y = ps_ip->s_ive_ip.u4_max_srch_rng_y;
@@ -3611,7 +3648,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
         {
-            if (i < MAX_PROCESS_CTXT / 2)
+            if (i < MAX_PROCESS_CTXT / MAX_CTXT_SETS)
             {
                 /* base ptr */
                 UWORD8 *pu1_buf = ps_mem_rec->pv_base;
@@ -3652,6 +3689,8 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
                 size += (max_mb_cols * 4 * sizeof(UWORD8));
                 size = ALIGN128(size);
                 offset = size;
+                /* cabac Context */
+                ps_codec->as_process[i].s_entropy.ps_cabac = ps_cabac;
             }
             else
             {
@@ -3693,8 +3732,12 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
                                 (void *) (pu1_buf + size);
                 size += (max_mb_cols * 4 * sizeof(UWORD8));
                 size = ALIGN128(size);
+                /* cabac Context */
+                ps_codec->as_process[i].s_entropy.ps_cabac = ps_cabac;
            }
         }
+        ps_codec->as_process[0].s_entropy.ps_cabac->ps_mb_map_ctxt_inc_base =
+                        ps_mb_map_ctxt_inc;
     }
 
     ps_mem_rec = &ps_mem_rec_base[MEM_REC_MB_COEFF_DATA];
@@ -3720,7 +3763,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
         {
-            if (i < MAX_PROCESS_CTXT / 2)
+            if (i < MAX_PROCESS_CTXT / MAX_CTXT_SETS)
             {
                 ps_codec->as_process[i].pv_pic_mb_coeff_data = pu1_buf;
                 ps_codec->as_process[i].s_entropy.pv_pic_mb_coeff_data =
@@ -3758,7 +3801,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
         {
-            if (i < MAX_PROCESS_CTXT / 2)
+            if (i < MAX_PROCESS_CTXT / MAX_CTXT_SETS)
             {
                 ps_codec->as_process[i].pv_pic_mb_header_data = pu1_buf;
                 ps_codec->as_process[i].s_entropy.pv_pic_mb_header_data =
@@ -3838,7 +3881,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
         {
-            if (i < MAX_PROCESS_CTXT / 2)
+            if (i < MAX_PROCESS_CTXT / MAX_CTXT_SETS)
             {
                 ps_codec->as_process[i].ps_slice_hdr_base = ps_mem_rec->pv_base;
             }
@@ -3860,7 +3903,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
         {
-            if (i < MAX_PROCESS_CTXT / 2)
+            if (i < MAX_PROCESS_CTXT / MAX_CTXT_SETS)
             {
                 ps_codec->as_process[i].pu1_is_intra_coded = pu1_buf;
             }
@@ -3885,7 +3928,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
         {
-            if (i < MAX_PROCESS_CTXT / 2)
+            if (i < MAX_PROCESS_CTXT / MAX_CTXT_SETS)
             {
                 ps_codec->as_process[i].pu1_slice_idx = pu1_buf_ping;
             }
@@ -3945,7 +3988,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
         {
-            if (i < MAX_PROCESS_CTXT / 2)
+            if (i < MAX_PROCESS_CTXT / MAX_CTXT_SETS)
             {
                 ps_codec->as_process[i].pu1_proc_map = pu1_buf + max_mb_cols;
             }
@@ -3976,7 +4019,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
         {
-            if (i < MAX_PROCESS_CTXT / 2)
+            if (i < MAX_PROCESS_CTXT / MAX_CTXT_SETS)
             {
                 ps_codec->as_process[i].pu1_deblk_map = pu1_buf + max_mb_cols;
 
@@ -4006,7 +4049,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
         {
-            if (i < MAX_PROCESS_CTXT / 2)
+            if (i < MAX_PROCESS_CTXT / MAX_CTXT_SETS)
             {
                 ps_codec->as_process[i].pu1_me_map = pu1_buf + max_mb_cols;
             }
@@ -4102,18 +4145,11 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
             size += size_inv;
             size = ALIGN64(size);
 
-            /* Buffers for holding half_x , half_y and half_xy values */
-            ps_codec->as_process[i].pu1_half_x = (void *) (pu1_buf + size);
-            size += size_hp;
-            size = ALIGN64(size);
-
-            ps_codec->as_process[i].pu1_half_y = (void *) (pu1_buf + size);
-            size += size_hp;
-            size = ALIGN64(size);
-
-            ps_codec->as_process[i].pu1_half_xy = (void *) (pu1_buf + size);
-            size += size_hp;
-            size = ALIGN64(size);
+            for (j = 0; j < SUBPEL_BUFF_CNT; j++)
+            {
+                ps_codec->as_process[i].apu1_subpel_buffs[j] = (pu1_buf + size);
+                size += ALIGN64(size_hp);
+            }
         }
     }
 
@@ -4209,7 +4245,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
         {
-            if (i < MAX_PROCESS_CTXT / 2)
+            if (i < MAX_PROCESS_CTXT / MAX_CTXT_SETS)
             {
                 ps_codec->as_process[i].ps_top_row_mb_syntax_ele_base =
                                 (mb_info_t *) pu1_buf;
@@ -4260,7 +4296,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
 
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
         {
-            if (i < MAX_PROCESS_CTXT / 2)
+            if (i < MAX_PROCESS_CTXT / MAX_CTXT_SETS)
             {
                 pu1_buf_ping = (UWORD8 *) ps_mem_rec->pv_base;
 
@@ -4341,9 +4377,9 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
         UWORD8 *pu1_buf = ps_mem_rec->pv_base;
 
         /* size of nmb ctxt */
-        WORD32 size = MAX_NMB * sizeof(mb_info_nmb_t);
+        WORD32 size = max_mb_cols * sizeof(mb_info_nmb_t);
 
-        UWORD32 nmb_cntr, subpel_buf_size;
+        WORD32 nmb_cntr, subpel_buf_size;
 
         /* init nmb info structure pointer in all proc ctxts */
         for (i = 0; i < MAX_PROCESS_CTXT; i++)
@@ -4361,7 +4397,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj,
             mb_info_nmb_t* ps_mb_info_nmb =
                             &ps_codec->as_process[i].ps_nmb_info[0];
 
-            for (nmb_cntr = 0; nmb_cntr < MAX_NMB; nmb_cntr++)
+            for (nmb_cntr = 0; nmb_cntr < max_mb_cols; nmb_cntr++)
             {
                 ps_mb_info_nmb[nmb_cntr].pu1_best_sub_pel_buf = pu1_buf;
 
@@ -4477,13 +4513,14 @@ static WORD32 ih264e_set_flush_mode(iv_obj_t *ps_codec_obj,
                                     void *pv_api_ip,
                                     void *pv_api_op)
 {
-    UNUSED(pv_api_ip);
     /* codec ctxt */
     codec_t *ps_codec = (codec_t *) ps_codec_obj->pv_codec_handle;
 
     /* ctrl call I/O structures */
     ih264e_ctl_flush_op_t *ps_ctl_op = pv_api_op;
 
+    UNUSED(pv_api_ip);
+
     ps_ctl_op->s_ive_op.u4_error_code = 0;
 
     /* signal flush frame control call */
@@ -4522,7 +4559,6 @@ static WORD32 ih264e_get_buf_info(iv_obj_t *ps_codec_obj,
                                   void *pv_api_ip,
                                   void *pv_api_op)
 {
-    UNUSED(ps_codec_obj);
     /* ctrl call I/O structures */
     ih264e_ctl_getbufinfo_ip_t *ps_ip = pv_api_ip;
     ih264e_ctl_getbufinfo_op_t *ps_op = pv_api_op;
@@ -4532,6 +4568,8 @@ static WORD32 ih264e_get_buf_info(iv_obj_t *ps_codec_obj,
     WORD32 ht = ALIGN16(ps_ip->s_ive_ip.u4_max_ht);
     WORD32 i;
 
+    UNUSED(ps_codec_obj);
+
     ps_op->s_ive_op.u4_error_code = 0;
 
     /* Number of components in input buffers required for codec  &
@@ -4584,7 +4622,7 @@ static WORD32 ih264e_get_buf_info(iv_obj_t *ps_codec_obj,
 
     for (i = 0; i < (WORD32) ps_op->s_ive_op.u4_out_comp_cnt; i++)
     {
-        ps_op->s_ive_op.au4_min_out_buf_size[i] = (wd * ht * 3) >> 1;
+        ps_op->s_ive_op.au4_min_out_buf_size[i] = MAX(((wd * ht * 3) >> 1), MIN_STREAM_SIZE);
     }
 
     ps_op->s_ive_op.u4_min_inp_bufs = MIN_INP_BUFS;
@@ -5073,7 +5111,6 @@ static IV_STATUS_T ih264_set_gop_params(void *pv_api_ip,
 
     ps_cfg->u4_i_frm_interval = ps_ip->s_ive_ip.u4_i_frm_interval;
     ps_cfg->u4_idr_frm_interval = ps_ip->s_ive_ip.u4_idr_frm_interval;
-    ps_cfg->u4_num_b_frames = ps_ip->s_ive_ip.u4_num_b_frames;
 
     ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high;
     ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low;
@@ -5117,6 +5154,8 @@ static IV_STATUS_T ih264_set_profile_params(void *pv_api_ip,
 
     ps_cfg->e_profile = ps_ip->s_ive_ip.e_profile;
 
+    ps_cfg->u4_entropy_coding_mode = ps_ip->s_ive_ip.u4_entropy_coding_mode;
+
     ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high;
     ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low;
 
@@ -5237,13 +5276,14 @@ static WORD32 ih264e_reset(iv_obj_t *ps_codec_obj,
                            void *pv_api_ip,
                            void *pv_api_op)
 {
-    UNUSED(pv_api_ip);
     /* codec ctxt */
     codec_t * ps_codec = (codec_t *) (ps_codec_obj->pv_codec_handle);
 
     /* ctrl call I/O structures */
     ih264e_ctl_reset_op_t *ps_op = pv_api_op;
 
+    UNUSED(pv_api_ip);
+
     ps_op->s_ive_op.u4_error_code = 0;
 
     if (ps_codec != NULL)
@@ -5297,7 +5337,7 @@ static WORD32 ih264e_ctl(iv_obj_t *ps_codec_obj,
     IVE_CONTROL_API_COMMAND_TYPE_T sub_cmd = ps_ctl_ip->s_ive_ip.e_sub_cmd;
 
     /* error status */
-    IV_STATUS_T ret = 0;
+    IV_STATUS_T ret = IV_SUCCESS;
 
     /* temp var */
     WORD32 i;
diff --git a/encoder/ih264e_bitstream.c b/encoder/ih264e_bitstream.c
index e5bfbe4..d79f637 100644
--- a/encoder/ih264e_bitstream.c
+++ b/encoder/ih264e_bitstream.c
@@ -151,7 +151,6 @@ IH264E_ERROR_T ih264e_put_bits(bitstrm_t *ps_bitstrm,
     if(code_len < WORD_SIZE)
         ASSERT((u4_code_val >> code_len) == 0);
 
-
     /* sanity check on the bitstream engine state */
     ASSERT(bits_left_in_cw > 0 && bits_left_in_cw <= WORD_SIZE);
 
diff --git a/encoder/ih264e_bitstream.h b/encoder/ih264e_bitstream.h
index 21360cc..9cd2b81 100644
--- a/encoder/ih264e_bitstream.h
+++ b/encoder/ih264e_bitstream.h
@@ -65,6 +65,14 @@
 #define EPB_BYTE            0x03
 
 
+/**
+******************************************************************************
+ *  @brief  Stream buffer allocated per frame should be at least MIN_STREAM_SIZE
+******************************************************************************
+ */
+#define MIN_STREAM_SIZE            0x800
+
+
 /*****************************************************************************/
 /* Function Macros                                                           */
 /*****************************************************************************/
@@ -106,12 +114,12 @@
  *  @brief   returns bits required to code a value
 ******************************************************************************
  */
-#define UE_LENGTH(bits,x)       \
-{                           \
-    UWORD32 r_bit;              \
-    GETRANGE(r_bit,x+1)         \
-    bits =(((r_bit - 1) << 1)+1);     \
-}                           \
+#define UE_LENGTH(bits,x)        \
+{                                \
+    UWORD32 r_bit;               \
+    GETRANGE(r_bit,x+1)          \
+    bits =(((r_bit - 1) << 1)+1);\
+}                                \
 
 /**
 ******************************************************************************
@@ -140,6 +148,51 @@
  */
 #define BYTE_ALIGNMENT(ps_bitstrm) ih264e_put_rbsp_trailing_bits(ps_bitstrm)
 
+/**
+******************************************************************************
+ *  @brief  Gets number of bits coded: (u4_strm_buf_offset << 3) plus the bits
+ *          used in the current word. Plain expression, no trailing ';'.
+******************************************************************************
+ */
+#define GET_NUM_BITS(ps_bitstream) ((((ps_bitstream)->u4_strm_buf_offset) << 3) \
+                                    + 32 - ((ps_bitstream)->i4_bits_left_in_cw))
+
+
+
+/**
+******************************************************************************
+ *  @macro Align bitstream to byte - Remaining bits are filled with '1'
+******************************************************************************
+*/
+#define BITSTREAM_BYTE_ALIGN(ps_bitstrm)                                    \
+   if (ps_bitstrm->i4_bits_left_in_cw & 0x07)                               \
+   {                                                                        \
+       const WORD32 len = (WORD32)((ps_bitstrm->i4_bits_left_in_cw) & 0x07);\
+       ih264e_put_bits(ps_bitstrm, (UWORD32)((1 << len) - 1), len);         \
+   }
+
+
+/**
+******************************************************************************
+* flush the bits in cur word byte by byte  and copy to stream                *
+* (current word is assumed to be byte aligned)                               *
+******************************************************************************
+*/
+#define  BITSTREAM_FLUSH(ps_bitstrm)                                           \
+{                                                                              \
+    WORD32 i;                                                                  \
+    for (i = WORD_SIZE; i > ps_bitstrm->i4_bits_left_in_cw; i -= 8)            \
+    {                                                                          \
+       UWORD8 u1_next_byte = (ps_bitstrm->u4_cur_word >> (i - 8)) & 0xFF;      \
+       PUTBYTE_EPB(ps_bitstrm->pu1_strm_buffer, ps_bitstrm->u4_strm_buf_offset,\
+                   u1_next_byte, ps_bitstrm->i4_zero_bytes_run);               \
+    }                                                                          \
+    ps_bitstrm->u4_cur_word = 0;                                               \
+    ps_bitstrm->i4_bits_left_in_cw = WORD_SIZE;                                \
+}                                                                              \
+
+
+
 
 /*****************************************************************************/
 /* Structures                                                                */
diff --git a/encoder/ih264e_cabac.c b/encoder/ih264e_cabac.c
new file mode 100644
index 0000000..64ff7cd
--- /dev/null
+++ b/encoder/ih264e_cabac.c
@@ -0,0 +1,819 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+*******************************************************************************
+* @file
+*  ih264e_cabac.c
+*
+* @brief
+*  Contains all leaf level functions for CABAC entropy coding.
+*
+*
+* @author
+* Doney Alex
+*
+* @par List of Functions:
+*
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+#include <assert.h>
+#include <limits.h>
+#include <string.h>
+
+/* User include files */
+#include "ih264e_config.h"
+#include "ih264_typedefs.h"
+#include "iv2.h"
+#include "ive2.h"
+#include "ih264_debug.h"
+#include "ih264_defs.h"
+#include "ih264e_defs.h"
+#include "ih264_macros.h"
+#include "ih264e_error.h"
+#include "ih264e_bitstream.h"
+#include "ime_distortion_metrics.h"
+#include "ime_defs.h"
+#include "ime_structs.h"
+#include "ih264_error.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include "ih264_inter_pred_filters.h"
+#include "ih264_mem_fns.h"
+#include "ih264_padding.h"
+#include "ih264_platform_macros.h"
+#include "ih264_intra_pred_filters.h"
+#include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
+#include "irc_cntrl_param.h"
+#include "irc_frame_info_collector.h"
+#include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
+#include "ih264e_structs.h"
+#include "ih264e_cabac.h"
+#include "ih264e_encode_header.h"
+#include "ih264_cavlc_tables.h"
+#include "ih264e_statistics.h"
+#include "ih264e_trace.h"
+
+
+/*****************************************************************************/
+/* Function Definitions                                                      */
+/*****************************************************************************/
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  k-th order Exp-Golomb (UEGk) binarization process: Implements the Exp-Golomb
+ *   part of the concatenated unary / k-th order Exp-Golomb (UEGk) binarization
+ *   process, where k = 0, as defined in 9.3.2.3 of ITU_T_H264-201402
+ *
+ * @param[in] i2_sufs
+ *  Suffix value to be binarized
+ *
+ * @param[out] pi1_bins_len
+ *  Receives the length of the bin string (2 * i4_len + 1)
+ *
+ * @returns Bin string packed into a word
+ *
+ * @remarks
+ *  CLZ(i2_sufs + 1) is evaluated, so i2_sufs is presumably expected to be >= 0
+ *
+ *******************************************************************************
+ */
+UWORD32 ih264e_cabac_UEGk0_binarization(WORD16 i2_sufs, WORD8 *pi1_bins_len)
+{
+    UWORD32 u4_bins;
+    WORD32 i4_len;
+    WORD16 x, y;
+    /* presumably CLZ counts leading zeros of a 32-bit word - TODO confirm */
+    x = i2_sufs + 1;
+    i4_len = CLZ(x);
+    i4_len = 31 - i4_len;           /* under that assumption: floor(log2(x)) */
+    y = 1 << i4_len;
+    y = y - 1;                      /* i4_len one-bits */
+    i2_sufs = i2_sufs - y;          /* remainder left after removing y */
+    u4_bins = y << 1;               /* i4_len ones followed by a zero */
+    u4_bins = u4_bins << i4_len;    /* make room for the remainder */
+    u4_bins = u4_bins + i2_sufs;
+    /* REV presumably bit-reverses the word; the shift keeps 2*i4_len+1 bits */
+    REV(u4_bins, u4_bins);
+    u4_bins = u4_bins >> (31 - 2 * i4_len);
+    (*pi1_bins_len) = 2 * i4_len + 1;
+
+    return (u4_bins);
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Get cabac context for the MB: sets the pointers to the current, top and left
+ *  cabac neighbor contexts and resets the data of neighbors that are unavailable.
+ *
+ * @param[in] ps_ent_ctxt
+ *  Pointer to entropy context structure (MB position, slice map, cabac context)
+ *
+ * @param[in] u4_mb_type
+ *  Type of MB; compared against I16x16, I8x8 and I4x4 to pick the intra defaults
+ *
+ * @returns
+ *  None
+ * @remarks
+ *  A neighbor is treated as available only if it has the same slice index
+ *
+ *******************************************************************************
+ */
+void ih264e_get_cabac_context(entropy_ctxt_t *ps_ent_ctxt, WORD32 u4_mb_type)
+{
+
+    /* CABAC context */
+    cabac_ctxt_t *ps_cabac_ctxt = ps_ent_ctxt->ps_cabac;
+    mb_info_ctxt_t *ps_ctx_inc_mb_map;
+    cab_csbp_t *ps_lft_csbp;
+
+    WORD32 i4_lft_avail, i4_top_avail, i4_is_intra;
+    WORD32 i4_mb_x, i4_mb_y;
+    UWORD8 *pu1_slice_idx = ps_ent_ctxt->pu1_slice_idx;
+
+    i4_is_intra = ((u4_mb_type == I16x16) || (u4_mb_type == I8x8)
+                    || (u4_mb_type == I4x4));
+
+    /* derive neighbor availability from the per-MB slice index map */
+    i4_mb_x = ps_ent_ctxt->i4_mb_x;
+    i4_mb_y = ps_ent_ctxt->i4_mb_y;
+    pu1_slice_idx += (i4_mb_y * ps_ent_ctxt->i4_wd_mbs);
+    /* left MB is available if it exists and has the same slice index */
+    i4_lft_avail = (i4_mb_x == 0
+                    || (pu1_slice_idx[i4_mb_x - 1] != pu1_slice_idx[i4_mb_x])) ?
+                    0 : 1;
+    /* top MB is available if it exists and has the same slice index */
+    i4_top_avail = (i4_mb_y == 0
+                    || (pu1_slice_idx[i4_mb_x - ps_ent_ctxt->i4_wd_mbs]
+                                    != pu1_slice_idx[i4_mb_x])) ? 0 : 1;
+    /* current entry is indexed by i4_mb_x; neighbors start at the default entry */
+    ps_ctx_inc_mb_map = ps_cabac_ctxt->ps_mb_map_ctxt_inc;
+    ps_cabac_ctxt->ps_curr_ctxt_mb_info = ps_ctx_inc_mb_map + i4_mb_x;
+    ps_cabac_ctxt->ps_left_ctxt_mb_info = ps_cabac_ctxt->ps_def_ctxt_mb_info;
+    ps_cabac_ctxt->ps_top_ctxt_mb_info = ps_cabac_ctxt->ps_def_ctxt_mb_info;
+    ps_lft_csbp = ps_cabac_ctxt->ps_lft_csbp;
+    ps_cabac_ctxt->pu1_left_y_ac_csbp = &ps_lft_csbp->u1_y_ac_csbp_top_mb;
+    ps_cabac_ctxt->pu1_left_uv_ac_csbp = &ps_lft_csbp->u1_uv_ac_csbp_top_mb;
+    ps_cabac_ctxt->pu1_left_yuv_dc_csbp = &ps_lft_csbp->u1_yuv_dc_csbp_top_mb;
+    ps_cabac_ctxt->pi1_left_ref_idx_ctxt_inc =
+                    &ps_cabac_ctxt->i1_left_ref_idx_ctx_inc_arr[0][0];
+    ps_cabac_ctxt->pu1_left_mv_ctxt_inc =
+                    ps_cabac_ctxt->u1_left_mv_ctxt_inc_arr[0];
+    /* an available neighbor replaces the default entry */
+    if (i4_lft_avail)
+        ps_cabac_ctxt->ps_left_ctxt_mb_info =
+                        ps_cabac_ctxt->ps_curr_ctxt_mb_info - 1;
+    if (i4_top_avail) /* presumably this slot still holds the MB above - confirm */
+        ps_cabac_ctxt->ps_top_ctxt_mb_info =
+                        ps_cabac_ctxt->ps_curr_ctxt_mb_info;
+    /* no left neighbor: reset the left csbp / ref idx / mv context data */
+    if (!i4_lft_avail)
+    {
+        UWORD8 u1_def_csbp = i4_is_intra ? 0xf : 0;
+        *(ps_cabac_ctxt->pu1_left_y_ac_csbp) = u1_def_csbp;
+        *(ps_cabac_ctxt->pu1_left_uv_ac_csbp) = u1_def_csbp;
+        *(ps_cabac_ctxt->pu1_left_yuv_dc_csbp) = u1_def_csbp;
+        memset(ps_cabac_ctxt->pi1_left_ref_idx_ctxt_inc, 0, sizeof(UWORD32));
+        memset(ps_cabac_ctxt->pu1_left_mv_ctxt_inc, 0, 16);
+    }
+    if (!i4_top_avail)
+    {   /* top info is the default entry here; ref idx / mv of the slot are reset */
+        UWORD8 u1_def_csbp = i4_is_intra ? 0xff : 0;
+        ps_cabac_ctxt->ps_top_ctxt_mb_info->u1_yuv_ac_csbp = u1_def_csbp;
+        ps_cabac_ctxt->ps_top_ctxt_mb_info->u1_yuv_dc_csbp = u1_def_csbp;
+        ps_cabac_ctxt->ps_curr_ctxt_mb_info->i1_ref_idx[0] =
+        ps_cabac_ctxt->ps_curr_ctxt_mb_info->i1_ref_idx[1] =
+        ps_cabac_ctxt->ps_curr_ctxt_mb_info->i1_ref_idx[2] =
+        ps_cabac_ctxt->ps_curr_ctxt_mb_info->i1_ref_idx[3] = 0;
+        memset(ps_cabac_ctxt->ps_curr_ctxt_mb_info->u1_mv, 0, 16);
+    }
+
+}
+
+
+
+/**
+ *******************************************************************************
+ * @brief
+ *  Flushing at termination: Explained in flowchart 9-12 (ITU_T_H264-201402).
+ *  Writes the pending carry, the outstanding bytes and the remaining bits of low.
+ *  @param[in,out] ps_cabac_ctxt
+ *  pointer to cabac context (handle)
+ *
+ * @returns  IH264E_SUCCESS, or IH264E_BITSTREAM_BUFFER_OVERFLOW if the check fails
+ *
+ * @remarks
+ *  u4_out_standing_bytes of the encoding environment is not written back here
+ *
+ *******************************************************************************
+ */
+WORD32 ih264e_cabac_flush(cabac_ctxt_t *ps_cabac_ctxt)
+{
+
+    /* bit stream ptr and local copies of the stream / encoder state */
+    bitstrm_t *ps_stream = ps_cabac_ctxt->ps_bitstrm;
+    encoding_envirnoment_t *ps_cab_enc_env = &(ps_cabac_ctxt->s_cab_enc_env);
+    UWORD32 u4_low = ps_cab_enc_env->u4_code_int_low;
+    UWORD32 u4_bits_gen = ps_cab_enc_env->u4_bits_gen;
+    UWORD8 *pu1_strm_buf = ps_stream->pu1_strm_buffer;
+    UWORD32 u4_strm_buf_offset = ps_stream->u4_strm_buf_offset;
+    WORD32 zero_run = ps_stream->i4_zero_bytes_run;
+    UWORD32 u4_out_standing_bytes = ps_cab_enc_env->u4_out_standing_bytes;
+
+    /************************************************************************/
+    /* Insert the carry (propagated in previous byte) along with            */
+    /* outstanding bytes (if any) and flush remaining bits                  */
+    /************************************************************************/
+    {
+        /* carry = 1 => putbit(1); carry propagated due to L renorm */
+        WORD32 carry = (u4_low >> (u4_bits_gen + CABAC_BITS)) & 0x1;
+        WORD32 last_byte;
+        WORD32 bits_left;
+        WORD32 rem_bits;
+
+        /*********************************************************************/
+        /* Bitstream overflow check                                          */
+        /* NOTE: corner case of epb bytes (max 2 for 32bit word) not handled */
+        /*********************************************************************/
+        if ((u4_strm_buf_offset + u4_out_standing_bytes + 1)
+                        >= ps_stream->u4_max_strm_size)
+        {
+            /* return without corrupting the buffer beyond its size */
+            return (IH264E_BITSTREAM_BUFFER_OVERFLOW);
+        }
+        /* NOTE(review): the look-back below reads 4 bytes behind the offset - confirm offset >= 4 */
+        if (carry)
+        {
+            /* CORNER CASE: if the data written so far ends in 0x000003, an EPB
+             was inserted and the buffer ends in 0x00000303. Adding the carry to
+             the last byte would then give 0x00000304, which is not correct as
+             per the standard */
+            /* so if the previous four bytes are 0x00000303, step back one byte;
+             the carry add below then turns the 0x03 before it into 0x04 */
+            if (pu1_strm_buf[u4_strm_buf_offset - 1] == 0x03
+                            && pu1_strm_buf[u4_strm_buf_offset - 2] == 0x03
+                            && pu1_strm_buf[u4_strm_buf_offset - 3] == 0x00
+                            && pu1_strm_buf[u4_strm_buf_offset - 4] == 0x00)
+            {
+                u4_strm_buf_offset -= 1;
+            }
+            /* previous byte carry add will not result in overflow to        */
+            /* u4_strm_buf_offset - 2 as we track 0xff as outstanding bytes  */
+            pu1_strm_buf[u4_strm_buf_offset - 1] += carry;
+            zero_run = 0;
+        }
+
+        /*        Insert outstanding bytes (if any)         */
+        while (u4_out_standing_bytes)
+        {
+            UWORD8 u1_0_or_ff = carry ? 0 : 0xFF; /* 0xff + carry becomes 0x00 */
+
+            PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, u1_0_or_ff, zero_run);
+            u4_out_standing_bytes--;
+        }
+
+        /*  clear the carry in low */
+        u4_low &= ((1 << (u4_bits_gen + CABAC_BITS)) - 1);
+
+        /* extract the remaining bits;                                   */
+        /* includes additional msb bit of low as per Figure 9-12      */
+        bits_left = u4_bits_gen + 1;
+        rem_bits = (u4_low >> (u4_bits_gen + CABAC_BITS - bits_left));
+        /* a full byte of remaining bits is written first, if there is one */
+        if (bits_left >= 8)
+        {
+            last_byte = (rem_bits >> (bits_left - 8)) & 0xFF;
+            PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, last_byte, zero_run);
+            bits_left -= 8;
+        }
+        /* NOTE(review): bits_left == 7 makes (7 - bits_left - 1) negative - confirm intent */
+        /* insert last byte along with rbsp stop bit(1) and 0's in the end */
+        last_byte = (rem_bits << (8 - bits_left))
+                        | (1 << (7 - bits_left) | (1 << (7 - bits_left - 1)));
+        last_byte &= 0xFF;
+        PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, last_byte, zero_run);
+
+        /* update the state variables and return success */
+        ps_stream->u4_strm_buf_offset = u4_strm_buf_offset;
+        ps_stream->i4_zero_bytes_run = 0;
+        /* Default init values for scratch variables of bitstream context */
+        ps_stream->u4_cur_word = 0;
+        ps_stream->i4_bits_left_in_cw = WORD_SIZE;
+
+        return (IH264E_SUCCESS);
+    }
+}
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Puts new byte (and outstanding bytes) into bitstream after cabac
+ *         renormalization
+ *
+ *  @par   Description
+ *  1. Extract the leading byte of low(L)
+ *  2. If leading byte=0xff increment outstanding bytes and return
+ *     (as the actual bits depend on carry propagation later)
+ *  3. If leading byte is not 0xff check for any carry propagation
+ *  4. Insert the carry (propagated in previous byte) along with outstanding
+ *     bytes (if any) and leading byte
+ *
+ *  The bitstream buffer size is not checked in this function.
+ *  @param[in,out] ps_cabac_ctxt
+ *  pointer to cabac context (handle)
+ *
+ *  @return None
+ *
+ ******************************************************************************
+ */
+void ih264e_cabac_put_byte(cabac_ctxt_t *ps_cabac_ctxt)
+{
+
+    /* bit stream ptr */
+    bitstrm_t *ps_stream = ps_cabac_ctxt->ps_bitstrm;
+    encoding_envirnoment_t *ps_cab_enc_env = &(ps_cabac_ctxt->s_cab_enc_env);
+    UWORD32 u4_low = ps_cab_enc_env->u4_code_int_low;
+    UWORD32 u4_bits_gen = ps_cab_enc_env->u4_bits_gen;
+    WORD32 lead_byte = u4_low >> (u4_bits_gen + CABAC_BITS - 8); /* bit 8 = carry */
+
+    /* Sanity checks */
+    ASSERT((ps_cab_enc_env->u4_code_int_range >= 256)
+                    && (ps_cab_enc_env->u4_code_int_range < 512));
+    ASSERT((u4_bits_gen >= 8));
+
+    /* update bits generated and low after extracting leading byte */
+    u4_bits_gen -= 8;
+    ps_cab_enc_env->u4_code_int_low &= ((1 << (CABAC_BITS + u4_bits_gen)) - 1);
+    ps_cab_enc_env->u4_bits_gen = u4_bits_gen;
+
+    /************************************************************************/
+    /* 1. Extract the leading byte of low(L)                                */
+    /* 2. If leading byte=0xff increment outstanding bytes and return       */
+    /*      (as the actual bits depend on carry propagation later)          */
+    /* 3. If leading byte is not 0xff check for any carry propagation       */
+    /* 4. Insert the carry (propagated in previous byte) along with         */
+    /*    outstanding bytes (if any) and leading byte                       */
+    /************************************************************************/
+    if (lead_byte == 0xff)
+    {
+        /* actual bits depend on carry propagation     */
+        ps_cab_enc_env->u4_out_standing_bytes++;
+        return ;
+    }
+    else
+    {
+        /* carry = 1 => putbit(1); carry propagated due to L renorm */
+        WORD32 carry = (lead_byte >> 8) & 0x1;
+        UWORD8 *pu1_strm_buf = ps_stream->pu1_strm_buffer;
+        UWORD32 u4_strm_buf_offset = ps_stream->u4_strm_buf_offset;
+        WORD32 zero_run = ps_stream->i4_zero_bytes_run;
+        UWORD32 u4_out_standing_bytes = ps_cab_enc_env->u4_out_standing_bytes;
+
+
+        /*********************************************************************/
+        /*        Insert the carry propagated in previous byte               */
+        /*                                                                   */
+        /* Note : Do not worry about corruption into slice header align byte */
+        /*        This is because the first bin cannot result in overflow    */
+        /*********************************************************************/
+        if (carry)
+        {
+            /* CORNER CASE: if the data written so far ends in 0x000003, an EPB
+             was inserted and the buffer ends in 0x00000303. Adding the carry to
+             the last byte would then give 0x00000304, which is not correct as
+             per the standard */
+            /* so if the previous four bytes are 0x00000303, step back one byte;
+             the carry add below then turns the 0x03 before it into 0x04 */
+            if (pu1_strm_buf[u4_strm_buf_offset - 1] == 0x03
+                            && pu1_strm_buf[u4_strm_buf_offset - 2] == 0x03
+                            && pu1_strm_buf[u4_strm_buf_offset - 3] == 0x00
+                            && pu1_strm_buf[u4_strm_buf_offset - 4] == 0x00)
+            {
+                u4_strm_buf_offset -= 1;
+            }
+            /* previous byte carry add will not result in overflow to        */
+            /* u4_strm_buf_offset - 2 as we track 0xff as outstanding bytes  */
+            pu1_strm_buf[u4_strm_buf_offset - 1] += carry;
+            zero_run = 0;
+        }
+
+        /*        Insert outstanding bytes (if any)         */
+        while (u4_out_standing_bytes)
+        {
+            UWORD8 u1_0_or_ff = carry ? 0 : 0xFF; /* 0xff + carry becomes 0x00 */
+
+            PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, u1_0_or_ff, zero_run);
+
+            u4_out_standing_bytes--;
+        }
+        ps_cab_enc_env->u4_out_standing_bytes = 0;
+
+        /*        Insert the leading byte (carry bit removed)  */
+        lead_byte &= 0xFF;
+        PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, lead_byte, zero_run);
+
+        /* write the updated stream state back to the bitstream context */
+        ps_stream->u4_strm_buf_offset = u4_strm_buf_offset;
+        ps_stream->i4_zero_bytes_run = zero_run;
+
+    }
+}
+
+
+
+
+ /**
+ ******************************************************************************
+ *
+ *  @brief Codes a bin based on probability and mps packed context model
+ *
+ *  @par   Description
+ *  1. Apart from encoding bin, context model is updated as per state transition
+ *  2. Range and Low renormalization is done based on bin and original state
+ *  3. After renorm bitstream is updated (if required)
+ *
+ *  @param[in,out] ps_cabac
+ *  pointer to cabac context (handle)
+ *
+ *  @param[in]   bin
+ *  bin(boolean) to be encoded
+ *
+ *  @param[in,out] pu1_bin_ctxts
+ *  pointer to cabac context model containing pState[bits 5-0] | MPS[bit6]
+ *
+ *  @return None
+ *
+ ******************************************************************************
+  */
+void ih264e_cabac_encode_bin(cabac_ctxt_t *ps_cabac, WORD32 bin,
+                             bin_ctxt_model *pu1_bin_ctxts)
+{
+
+    encoding_envirnoment_t *ps_cab_enc_env = &(ps_cabac->s_cab_enc_env);
+    UWORD32 u4_range = ps_cab_enc_env->u4_code_int_range;
+    UWORD32 u4_low = ps_cab_enc_env->u4_code_int_low;
+    UWORD32 u4_rlps;
+    UWORD8 state_mps = (*pu1_bin_ctxts) & 0x3F;
+    UWORD8 u1_mps = !!((*pu1_bin_ctxts) & (0x40));
+    WORD32 shift;
+    UWORD32 u4_table_val;
+    /* Sanity checks */
+    ASSERT((bin == 0) || (bin == 1));
+    ASSERT((u4_range >= 256) && (u4_range < 512));
+
+    /* Get the lps range from LUT based on quantized range and state */
+    u4_table_val= gau4_ih264_cabac_table[state_mps][(u4_range >> 6) & 0x3];
+    u4_rlps = u4_table_val & 0xFF;  /* bits 7-0 of the table word: LPS range */
+    u4_range -= u4_rlps;
+
+    /* check if bin is mps or lps */
+    if (u1_mps ^ bin)
+    {
+        /* lps path;  L= L + R; R = RLPS */
+        u4_low += u4_range;
+        u4_range = u4_rlps;
+        if (state_mps == 0)
+        {
+            /* MPS(CtxIdx) = 1 - MPS(CtxIdx) */
+            u1_mps = 1 - u1_mps;
+        } /* update the context model from state transition LUT */
+
+        state_mps =  (u4_table_val >> 15) & 0x3F; /* next state after an LPS */
+    }
+    else
+    { /* update the context model from state transition LUT */
+        state_mps =  (u4_table_val >> 8) & 0x3F; /* next state after an MPS */
+    }
+
+    (*pu1_bin_ctxts) = (u1_mps << 6) | state_mps;
+
+        /*****************************************************************/
+        /* Renormalization; calculate bits generated based on range(R)   */
+        /* Note : 6 <= R < 512; R is 2 only for terminating encode       */
+        /*****************************************************************/
+        GETRANGE(shift, u4_range); /* presumably the bit length of u4_range - confirm */
+        shift   = 9 - shift;
+        u4_low   <<= shift;
+        u4_range <<= shift;
+
+        /* bits to be inserted in the bitstream */
+        ps_cab_enc_env->u4_bits_gen += shift;
+        ps_cab_enc_env->u4_code_int_range = u4_range;
+        ps_cab_enc_env->u4_code_int_low   = u4_low;
+
+        /* generate stream when a byte is ready */
+        if (ps_cab_enc_env->u4_bits_gen > CABAC_BITS)
+        {
+            ih264e_cabac_put_byte(ps_cabac);
+        }
+
+}
+
+
+
+
+ /**
+ *******************************************************************************
+ *
+ * @brief
+ *  Encoding process for a binary decision: implements the encoding process of a
+ *  decision as defined in 9.3.4.2. This function encodes multiple bins of a
+ *  symbol. Implements flowchart Figure 9-7 (ITU_T_H264-201402)
+ *
+ * @param[in] u4_bins
+ *  Bin values packed one per bit; bit 0 is encoded first
+ *
+ * @param[in] i1_bins_len
+ *  Number of bins to encode, maximum 32
+ *
+ * @param[in] u4_ctx_inc
+ *  CtxInc packed 4 bits per bin: bits 3-0 for bin0, bits 7-4 for bin1 ..
+ *
+ * @param[in] i1_valid_len
+ *  Number of times CtxInc advances to the next nibble; later bins reuse it
+ *
+ * @param[in] pu1_bin_ctxt_type
+ *  Pointer to the first binary context; CtxInc is added to it for every bin
+ *
+ * @param[in] ps_cabac
+ *  Pointer to cabac_context_structure
+ *
+ * @returns
+ *  None
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+void ih264e_encode_decision_bins(UWORD32 u4_bins, WORD8 i1_bins_len,
+                                 UWORD32 u4_ctx_inc, WORD8 i1_valid_len,
+                                 bin_ctxt_model *pu1_bin_ctxt_type,
+                                 cabac_ctxt_t *ps_cabac)
+{
+    WORD8 i1_bin_idx = 0;
+
+    while (i1_bin_idx < i1_bins_len)
+    {
+        /* next bin is the LSB; its CtxInc is the low nibble of u4_ctx_inc */
+        bin_ctxt_model *pu1_ctxt = pu1_bin_ctxt_type + (u4_ctx_inc & 0x0f);
+        WORD32 i4_bin = (WORD32)(u4_bins & 0x01);
+
+        u4_bins >>= 1;
+        /* after i1_valid_len shifts the same CtxInc nibble keeps being used */
+        if (i1_bin_idx < i1_valid_len)
+            u4_ctx_inc >>= 4;
+        ih264e_cabac_encode_bin(ps_cabac, i4_bin, pu1_ctxt);
+        i1_bin_idx++;
+    }
+}
+
+
+
+
+
+
+/**
+ *******************************************************************************
+ * @brief
+ *  Encoding process for a binary decision before termination: Encoding process
+ *  of a termination (9.3.4.5 : ITU_T_H264-201402). Explained in flowchart 9-11.
+ *
+ * @param[in,out] ps_cabac
+ *  Pointer to cabac structure
+ *
+ * @param[in] term_bin
+ *  Symbol value, end of slice or not, term_bin is binary
+ *
+ * @returns
+ *  None
+ * @remarks
+ *  When term_bin is 1 the engine is flushed through ih264e_cabac_flush()
+ *
+ *******************************************************************************
+ */
+void ih264e_cabac_encode_terminate(cabac_ctxt_t *ps_cabac, WORD32 term_bin)
+{
+
+    encoding_envirnoment_t *ps_cab_enc_env = &(ps_cabac->s_cab_enc_env);
+
+    UWORD32 u4_range = ps_cab_enc_env->u4_code_int_range;
+    UWORD32 u4_low = ps_cab_enc_env->u4_code_int_low;
+    UWORD32 u4_rlps;
+    WORD32 shift;
+
+    /* Sanity checks */
+    ASSERT((u4_range >= 256) && (u4_range < 512));
+    ASSERT((term_bin == 0) || (term_bin == 1));
+
+    /*  the terminate bin always uses a fixed lps range of 2 */
+    u4_rlps = 2;
+    u4_range -= u4_rlps;
+
+    /* if terminate L is incremented by curR and R=2 */
+    if (term_bin)
+    {
+        /* lps path;  L= L + R; R = RLPS */
+        u4_low += u4_range;
+        u4_range = u4_rlps;
+    }
+
+    /*****************************************************************/
+    /* Renormalization; calculate bits generated based on range(R)   */
+    /* Note : 6 <= R < 512; R is 2 only for terminating encode       */
+    /*****************************************************************/
+    GETRANGE(shift, u4_range); /* presumably the bit length of u4_range - confirm */
+    shift = 9 - shift;
+    u4_low <<= shift;
+    u4_range <<= shift;
+
+    /* bits to be inserted in the bitstream */
+    ps_cab_enc_env->u4_bits_gen += shift;
+    ps_cab_enc_env->u4_code_int_range = u4_range;
+    ps_cab_enc_env->u4_code_int_low = u4_low;
+
+    /* generate stream when a byte is ready */
+    if (ps_cab_enc_env->u4_bits_gen > CABAC_BITS)
+    {
+        ih264e_cabac_put_byte(ps_cabac);
+    }
+    /* NOTE(review): the status returned by ih264e_cabac_flush() is discarded */
+    if (term_bin)
+    {
+        ih264e_cabac_flush(ps_cabac);
+    }
+
+}
+
+
+/**
+ *******************************************************************************
+ * @brief
+ *  Bypass encoding process for a binary decision, as explained in 9.3.4.4 and
+ *  flowchart 9-10 of ITU_T_H264-201402.
+ *
+ *  @param[inout] ps_cabac : pointer to cabac context (handle)
+ *
+ *  @param[in]    bin : bypass bin(0/1) to be encoded
+ *
+ *  @returns
+ *  None
+ *  @remarks
+ *  The range is left untouched; only low and the pending bit count change
+ *
+ *******************************************************************************
+ */
+
+void ih264e_cabac_encode_bypass_bin(cabac_ctxt_t *ps_cabac, WORD32 bin)
+{
+
+    encoding_envirnoment_t *ps_env = &(ps_cabac->s_cab_enc_env);
+    UWORD32 u4_cur_range = ps_env->u4_code_int_range;
+    UWORD32 u4_new_low = ps_env->u4_code_int_low << 1;
+
+    /* Sanity checks */
+    ASSERT((u4_cur_range >= 256) && (u4_cur_range < 512));
+    ASSERT((bin == 0) || (bin == 1));
+
+    /* L = 2 * L, plus R when the bin is 1 */
+    if (bin)
+        u4_new_low += u4_cur_range;
+
+    /* one more bit is now pending in low */
+    ps_env->u4_code_int_low = u4_new_low;
+    ps_env->u4_bits_gen += 1;
+
+    /* nothing is written until more than CABAC_BITS bits are pending */
+    if (ps_env->u4_bits_gen <= CABAC_BITS)
+    {
+        return;
+    }
+
+    /* a byte is ready: move it to the stream */
+    ih264e_cabac_put_byte(ps_cabac);
+
+
+}
+
+
+ /**
+ ******************************************************************************
+ *
+ *  @brief Encodes a series of bypass bins (FLC bypass bins)
+ *
+ *  @par   Description
+ *  This function is more optimal than calling ih264e_cabac_encode_bypass_bin()
+ *  in a loop as cabac low, renorm and generating the stream (8bins at a time)
+ *  can be done in one operation
+ *
+ *  @param[inout] ps_cabac
+ *   pointer to cabac context (handle)
+ *
+ *  @param[in]   u4_bins
+ *   syntax element to be coded (as FLC bins); consumed from bit 0 upwards
+ *
+ *  @param[in]   num_bins
+ *   This is the FLC length for u4_bins (1 to 32)
+ *
+ *  @return None
+ *
+ ******************************************************************************
+ */
+
+void ih264e_cabac_encode_bypass_bins(cabac_ctxt_t *ps_cabac, UWORD32 u4_bins,
+                                     WORD32 num_bins)
+{
+
+    encoding_envirnoment_t *ps_cab_enc_env = &(ps_cabac->s_cab_enc_env);
+
+    UWORD32 u4_range = ps_cab_enc_env->u4_code_int_range;
+    WORD32 next_byte;
+    UWORD32 rev_next_byte;
+
+    /* Sanity checks */
+    ASSERT((num_bins < 33) && (num_bins > 0));
+    ASSERT((u4_range >= 256) && (u4_range < 512));
+
+    /* The range is not modified by bypass coding; only low and the number */
+    /* of bits generated are updated, up to eight bins at a time           */
+
+    /* Encode 8bins at a time and put in the bit-stream */
+    while (num_bins > 8)
+    {
+        num_bins -= 8;
+
+        /* extract the leading 8 bins (lowest byte of u4_bins) */
+        next_byte = (u4_bins) & 0xff;
+        u4_bins >>= 8;
+        REV_NBITS(next_byte, 8, rev_next_byte); /* presumably reverses the 8 bits - confirm */
+
+        /*  L = (L << 8) +  (R * rev_next_byte) */
+        ps_cab_enc_env->u4_code_int_low <<= 8;
+        ps_cab_enc_env->u4_code_int_low += (rev_next_byte * u4_range);
+        ps_cab_enc_env->u4_bits_gen += 8;
+
+        if (ps_cab_enc_env->u4_bits_gen > CABAC_BITS)
+        {
+            /*  insert the leading byte of low into stream */
+            ih264e_cabac_put_byte(ps_cabac);
+        }
+    }
+
+    /* Update low with remaining bins (1 to 8 of them at this point) and return */
+    next_byte = (u4_bins & ((1 << num_bins) - 1));
+
+    REV_NBITS(next_byte, num_bins, rev_next_byte);
+
+    ps_cab_enc_env->u4_code_int_low <<= num_bins;
+    ps_cab_enc_env->u4_code_int_low += (rev_next_byte * u4_range);
+    ps_cab_enc_env->u4_bits_gen += num_bins;
+
+    if (ps_cab_enc_env->u4_bits_gen > CABAC_BITS)
+    {
+        /*  insert the leading byte of low into stream */
+        ih264e_cabac_put_byte(ps_cabac);
+    }
+
+}
+
+
+
+
+
+
+
diff --git a/encoder/ih264e_cabac.h b/encoder/ih264e_cabac.h
new file mode 100644
index 0000000..e781783
--- /dev/null
+++ b/encoder/ih264e_cabac.h
@@ -0,0 +1,452 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+ *******************************************************************************
+ * @file
+ *  ih264e_cabac.h
+ *
+ * @brief
+ *  This file contains cabac related macros, enums, tables and function declarations.
+ *
+ * @author
+ *  Doney Alex
+ *
+ * @remarks
+ *  none
+ *
+ *******************************************************************************
+ */
+
+#ifndef IH264E_CABAC_H_
+#define IH264E_CABAC_H_
+
+
+
+/*******************************************************************************
+@brief Bit precision of cabac engine;
+*******************************************************************************
+*/
+#define CABAC_BITS  9
+
+
+
+
+/**
+******************************************************************************
+ *  @macro Reverse the low 'size' bits of 'word' into 'rev_word' (size <= 32)
+******************************************************************************
+*/
+#define REV_NBITS(word, size, rev_word)               \
+{                                                     \
+    WORD32 i;                                         \
+    rev_word = 0;                                     \
+    for (i = 0; i < (size); i++)                      \
+    {                                                 \
+        UWORD32 bit = ((word) >> i) & 1;              \
+        rev_word += ((UWORD32)1 << ((size) - i - 1)) * bit; /* unsigned: no shift into sign bit */ \
+    }                                                 \
+}
+
+/**
+******************************************************************************
+ *  @macro Reverse all 32 bits of u4_input into u4_output (bit i -> bit 31 - i)
+******************************************************************************
+*/
+#define REV(u4_input, u4_output)                 \
+{                                                \
+    UWORD32 u4_temp = (u4_input);                \
+    WORD8 i;                                     \
+    u4_output = 0;                               \
+    for (i = 0; i < 32; i++)                     \
+    {                                            \
+        u4_output = (u4_output << 1) +           \
+                        ((u4_temp >> i) & 0x01); \
+    }                                            \
+}
+
+/**
+******************************************************************************
+*! Bit manipulation macros: set / clear bit i of a (a is modified in place)
+******************************************************************************
+*/
+#define SETBIT(a, i)   ((a) |= (1 << (i)))
+#define CLEARBIT(a, i) ((a) &= ~(1 << (i)))
+
+
+/**
+******************************************************************************
+*! Cabac module expects at least MIN_STREAM_SIZE_MB bytes left in stream buffer
+*! for encoding an MB
+******************************************************************************
+*/
+#define MIN_STREAM_SIZE_MB   1024
+
+
+
+/*****************************************************************************/
+/* Function Declarations                                                 */
+/*****************************************************************************/
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Initialize default context values and pointers.
+ *
+ * @param[in] ps_ent_ctxt
+ *  Pointer to entropy context structure
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+void ih264e_init_cabac_table(entropy_ctxt_t *ps_ent_ctxt);
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Initialize cabac context: Initialize all contexts with the init values given in the spec.
+ * Called at the beginning of entropy coding of each slice for CABAC encoding.
+ *
+ * @param[in] ps_ent_ctxt
+ *  Pointer to entropy context structure
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+void ih264e_init_cabac_ctxt(entropy_ctxt_t *ps_ent_ctxt);
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  k-th order Exp-Golomb (UEGk) binarization process: Implements concatenated
+ *   unary/ k-th order Exp-Golomb  (UEGk) binarization process,
+ *   where k = 0 as defined in 9.3.2.3 of  ITU_T_H264-201402
+ *
+ * @param[in] i2_sufs
+ *  Suffix bit string
+ *
+ * @param[in] pi1_bins_len
+ *  Pointer to length of the string
+ *
+ * @returns Binarized value
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+UWORD32 ih264e_cabac_UEGk0_binarization(WORD16 i2_sufs, WORD8 *pi1_bins_len);
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Get cabac context for the MB :calculates the pointers to Top and   left
+ *          cabac neighbor context depending upon neighbor  availability.
+ *
+ * @param[in] ps_ent_ctxt
+ *  Pointer to entropy context structure
+ *
+ * @param[in] u4_mb_type
+ *  Type of MB
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+void ih264e_get_cabac_context(entropy_ctxt_t *ps_ent_ctxt, WORD32 u4_mb_type);
+
+
+/**
+ *******************************************************************************
+ * @brief
+ *  flushing at termination: Explained in flowchart 9-12(ITU_T_H264-201402).
+ *
+ *  @param[in]   ps_cabac_ctxt
+ *  pointer to cabac context (handle)
+ *
+ * @returns  success or failure error code
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+WORD32 ih264e_cabac_flush(cabac_ctxt_t *ps_cabac_ctxt);
+
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Puts new byte (and outstanding bytes) into bitstream after cabac
+ *         renormalization
+ *
+ *  @par   Description
+ *  1. Extract the leading byte of low(L)
+ *  2. If leading byte=0xff increment outstanding bytes and return
+ *     (as the actual bits depend on carry propagation later)
+ *  3. If leading byte is not 0xff check for any carry propagation
+ *  4. Insert the carry (propagated in previous byte) along with outstanding
+ *     bytes (if any) and leading byte
+ *
+ *
+ *  @param[inout]   ps_cabac_ctxt
+ *  pointer to cabac context (handle)
+ *
+ *  @return
+ *
+ ******************************************************************************
+ */
+void ih264e_cabac_put_byte(cabac_ctxt_t *ps_cabac_ctxt);
+
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Codes a bin based on probability and mps packed context model
+ *
+ *  @par   Description
+ *  1. Apart from encoding bin, context model is updated as per state transition
+ *  2. Range and Low renormalization is done based on bin and original state
+ *  3. After renorm the bitstream is updated (if required)
+ *
+ *  @param[inout]   ps_cabac
+ *  pointer to cabac context (handle)
+ *
+ *  @param[in]   bin
+ *  bin(boolean) to be encoded
+ *
+ *  @param[in]  pu1_bin_ctxts
+ *  index of cabac context model containing pState[bits 5-0] | MPS[bit6]
+ *
+ *  @return
+ *
+ ******************************************************************************
+ */
+void ih264e_cabac_encode_bin(cabac_ctxt_t *ps_cabac, WORD32 bin,
+                             bin_ctxt_model *pu1_bin_ctxts);
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Encoding process for a binary decision :implements encoding process of a decision
+ *  as defined in 9.3.4.2 . This function encodes multiple bins, of a symbol. Implements
+ *  flowchart Figure 9-7( ITU_T_H264-201402)
+ *
+ * @param[in] u4_bins
+ * array of bin values
+ *
+ * @param[in] i1_bins_len
+ *  Length of bins, maximum 32
+ *
+ * @param[in] u4_ctx_inc
+ *  CtxInc, byte0- bin0, byte1-bin1 ..
+ *
+ * @param[in] i1_valid_len
+ *  valid length of bins, after that CtxInc is constant
+ *
+ * @param[in] pu1_bin_ctxt_type
+ *  Pointer to binary contexts
+
+ * @param[in] ps_cabac
+ *  Pointer to cabac_context_structure
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+void ih264e_encode_decision_bins(UWORD32 u4_bins, WORD8 i1_bins_len,
+                                 UWORD32 u4_ctx_inc, WORD8 i1_valid_len,
+                                 bin_ctxt_model *pu1_bin_ctxt_type,
+                                 cabac_ctxt_t *ps_cabac);
+
+/**
+ *******************************************************************************
+ * @brief
+ *  Encoding process for a binary decision before termination:Encoding process
+ *  of a termination(9.3.4.5 :ITU_T_H264-201402) . Explained in flowchart 9-11.
+ *
+ * @param[in] ps_cabac
+ *  Pointer to cabac structure
+ *
+ * @param[in] term_bin
+ *  Symbol value, end of slice or not, term_bin is binary
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+void ih264e_cabac_encode_terminate(cabac_ctxt_t *ps_cabac, WORD32 term_bin);
+
+
+/**
+ *******************************************************************************
+ * @brief
+ * Bypass encoding process for binary decisions:  Explained (9.3.4.4 :ITU_T_H264-201402)
+ * , flowchart 9-10.
+ *
+ *  @param[in]  ps_cabac : pointer to cabac context (handle)
+ *
+ *  @param[in]   bin :  bypass bin(0/1) to be encoded
+ *
+ *  @returns
+ *
+ *  @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ih264e_cabac_encode_bypass_bin(cabac_ctxt_t *ps_cabac, WORD32 bin);
+
+
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Encodes a series of bypass bins (FLC bypass bins)
+ *
+ *  @par   Description
+ *  This function is more optimal than calling ih264e_cabac_encode_bypass_bin()
+ *  in a loop as cabac low, renorm and generating the stream (8bins at a time)
+ *  can be done in one operation
+ *
+ *  @param[inout]ps_cabac
+ *   pointer to cabac context (handle)
+ *
+ *  @param[in]   u4_bins
+ *   syntax element to be coded (as FLC bins)
+ *
+ *  @param[in]   num_bins
+ *   This is the FLC length for u4_bins
+ *
+ *  @return
+ *
+ ******************************************************************************
+ */
+
+void ih264e_cabac_encode_bypass_bins(cabac_ctxt_t *ps_cabac, UWORD32 u4_bins,
+                                     WORD32 num_bins);
+
+
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function generates CABAC coded bit stream for an Intra Slice.
+ *
+ * @description
+ *  The mb syntax layer for intra slices constitutes luma mb mode, luma sub modes
+ *  (if present), mb qp delta, coded block pattern, chroma mb mode and
+ *  luma/chroma residue. These syntax elements are written as directed by table
+ *  7.3.5 of h264 specification.
+ *
+ * @param[in] ps_ent_ctxt
+ *  pointer to entropy context
+ *
+ * @returns error code
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+IH264E_ERROR_T ih264e_write_islice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt);
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function generates CABAC coded bit stream for Inter slices
+ *
+ * @description
+ *  The mb syntax layer for inter slices constitutes luma mb mode, luma sub modes
+ *  (if present), mb qp delta, coded block pattern, chroma mb mode and
+ *  luma/chroma residue. These syntax elements are written as directed by table
+ *  7.3.5 of h264 specification
+ *
+ * @param[in] ps_ent_ctxt
+ *  pointer to entropy context
+ *
+ * @returns error code
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt);
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function generates CABAC coded bit stream for B slices
+ *
+ * @description
+ *  The mb syntax layer for inter slices constitutes luma mb mode,
+ *  mb qp delta, coded block pattern, chroma mb mode and
+ *  luma/chroma residue. These syntax elements are written as directed by table
+ *  7.3.5 of h264 specification
+ *
+ * @param[in] ps_ent_ctxt
+ *  pointer to entropy context
+ *
+ * @returns error code
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt);
+
+
+#endif /* IH264E_CABAC_H_ */
diff --git a/encoder/ih264e_cabac_encode.c b/encoder/ih264e_cabac_encode.c
new file mode 100644
index 0000000..ebcd418
--- /dev/null
+++ b/encoder/ih264e_cabac_encode.c
@@ -0,0 +1,2391 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+*******************************************************************************
+* @file
+*  ih264e_cabac_encode.c
+*
+* @brief
+*  Contains all functions to encode in CABAC entropy mode
+*
+*
+* @author
+* Doney Alex
+*
+* @par List of Functions:
+*
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+#include <assert.h>
+#include <limits.h>
+#include <string.h>
+
+/* User include files */
+#include "ih264e_config.h"
+#include "ih264_typedefs.h"
+#include "iv2.h"
+#include "ive2.h"
+#include "ih264_debug.h"
+#include "ih264_defs.h"
+#include "ih264e_defs.h"
+#include "ih264_macros.h"
+#include "ih264e_error.h"
+#include "ih264e_bitstream.h"
+#include "ime_distortion_metrics.h"
+#include "ime_defs.h"
+#include "ime_structs.h"
+#include "ih264_error.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include "ih264_inter_pred_filters.h"
+#include "ih264_mem_fns.h"
+#include "ih264_padding.h"
+#include "ih264_platform_macros.h"
+#include "ih264_intra_pred_filters.h"
+#include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
+#include "irc_cntrl_param.h"
+#include "irc_frame_info_collector.h"
+#include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
+#include "ih264e_structs.h"
+#include "ih264e_cabac.h"
+#include "ih264e_encode_header.h"
+#include "ih264_cavlc_tables.h"
+#include "ih264e_cavlc.h"
+#include "ih264e_statistics.h"
+#include "ih264e_trace.h"
+
+/*****************************************************************************/
+/* Function Definitions                                                      */
+/*****************************************************************************/
+
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Encodes mb_skip_flag  using CABAC entropy coding mode.
+ *
+ * @param[in] u1_mb_skip_flag
+ *  mb_skip_flag
+ *
+ * @param[in] ps_cabac_ctxt
+ *  Pointer to cabac context structure
+ *
+ * @param[in] u4_ctxidx_offset
+ *  ctxIdxOffset for mb_skip_flag context
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+static void ih264e_cabac_enc_mb_skip(UWORD8 u1_mb_skip_flag,
+                                     cabac_ctxt_t *ps_cabac_ctxt,
+                                     UWORD32 u4_ctxidx_offset)
+{
+    /* ctxInc: count the neighbours (left, top) with no CAB_SKIP_MASK bit set */
+    UWORD8 u1_num_non_skip = 0;
+
+    if (!(ps_cabac_ctxt->ps_left_ctxt_mb_info->u1_mb_type & CAB_SKIP_MASK))
+        u1_num_non_skip++;
+
+    if (!(ps_cabac_ctxt->ps_top_ctxt_mb_info->u1_mb_type & CAB_SKIP_MASK))
+        u1_num_non_skip++;
+
+    /* Encode the bin with the context model selected above */
+    ih264e_cabac_encode_bin(ps_cabac_ctxt,
+                            (UWORD32) u1_mb_skip_flag,
+                            ps_cabac_ctxt->au1_cabac_ctxt_table + u4_ctxidx_offset
+                                    + u1_num_non_skip);
+
+}
+
+
+/* ! < Table 9-36 - Binarization for macroblock types in I slices  in ITU_T_H264-201402
+ * Bits 0-7 : binarised value
+ * Bits 8-15: length of binary sequence (the reader in this file masks it with 0x0f)
+ */
+static const UWORD32 u4_mb_type_intra[26] =
+    { 0x0100, 0x0620, 0x0621, 0x0622, 0x0623, 0x0748, 0x0749, 0x074a, 0x074b,
+      0x074c, 0x074d, 0x074e, 0x074f, 0x0628, 0x0629, 0x062a, 0x062b, 0x0758,
+      0x0759, 0x075a, 0x075b, 0x075c, 0x075d, 0x075e, 0x075f, 0x0203 };
+
+
+/* CtxInc for mb types: [0] = I slice, [1] = other slices; hex digit i = ctxInc of bin i */
+static const UWORD32 u4_mb_ctxinc[2][26] =
+{
+    /* Intra CtxInc's */
+    {   0x00,
+        0x03467, 0x03467, 0x03467, 0x03467, 0x034567, 0x034567, 0x034567,
+        0x034567, 0x034567, 0x034567, 0x034567, 0x034567, 0x03467, 0x03467,
+        0x03467, 0x03467, 0x034567, 0x034567, 0x034567, 0x034567, 0x034567,
+        0x034567, 0x034567, 0x034567, 0x00},
+    /* Inter CtxInc's */
+    {   0x00,
+        0x001233, 0x001233, 0x001233, 0x001233, 0x0012233, 0x0012233, 0x0012233,
+        0x0012233, 0x0012233, 0x0012233, 0x0012233, 0x0012233, 0x001233, 0x001233,
+        0x001233, 0x001233, 0x0012233, 0x0012233, 0x0012233, 0x0012233, 0x0012233,
+        0x0012233, 0x0012233, 0x0012233, 0x00}
+};
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Encodes mb_type for an intra MB.
+ *
+ * @param[in] u4_slice_type
+ *  slice type
+ *
+ * @param[in] u4_intra_mb_type
+ *  MB type (Table 7-11)
+ *
+ * @param[in] ps_cabac_ctxt
+ *  Pointer to cabac context structure
+ *
+ * @param[in] u4_ctx_idx_offset
+ *  ctxIdxOffset for mb_type context
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+static void ih264e_cabac_enc_intra_mb_type(UWORD32 u4_slice_type,
+                                           UWORD32 u4_intra_mb_type,
+                                           cabac_ctxt_t *ps_cabac_ctxt,
+                                           UWORD32 u4_ctx_idx_offset)
+{
+    /* Codes every bin of the mb_type bin string in place (range/low/context updated here) */
+    encoding_envirnoment_t *ps_cab_enc_env = &(ps_cabac_ctxt->s_cab_enc_env);
+    bin_ctxt_model *pu1_mb_bin_ctxt, *pu1_bin_ctxt;
+    UWORD8 u1_bin;
+    mb_info_ctxt_t *ps_left_ctxt = ps_cabac_ctxt->ps_left_ctxt_mb_info;
+    mb_info_ctxt_t *ps_top_ctxt = ps_cabac_ctxt->ps_top_ctxt_mb_info;
+    UWORD32 u4_bins;
+    UWORD32 u4_ctx_inc;
+    WORD8 i1_bins_len;
+    UWORD32 u4_code_int_range;
+    UWORD32 u4_code_int_low;
+    UWORD16 u2_quant_code_int_range;
+    UWORD16 u4_code_int_range_lps;
+    WORD8 i;
+    UWORD8 u1_ctx_inc;
+    UWORD32 u4_table_val;
+
+    pu1_mb_bin_ctxt = ps_cabac_ctxt->au1_cabac_ctxt_table + u4_ctx_idx_offset;
+    /* table entry: bin string in the low bits, its length read from bits 8-11 */
+    u4_bins = u4_mb_type_intra[u4_intra_mb_type];
+    i1_bins_len = (WORD8) ((u4_bins >> 8) & 0x0f);
+    u4_ctx_inc = u4_mb_ctxinc[(u4_slice_type != ISLICE)][u4_intra_mb_type];
+    u1_ctx_inc = 0;
+    if (u4_slice_type == ISLICE)
+    {
+        if (ps_left_ctxt != ps_cabac_ctxt->ps_def_ctxt_mb_info)
+            u1_ctx_inc += ((ps_left_ctxt->u1_mb_type != CAB_I4x4) ? 1 : 0);
+        if (ps_top_ctxt != ps_cabac_ctxt->ps_def_ctxt_mb_info)
+            u1_ctx_inc += ((ps_top_ctxt->u1_mb_type != CAB_I4x4) ? 1 : 0);
+        /* neighbour-derived ctxInc goes into nibble (i1_bins_len - 1), the first bin coded below */
+        u4_ctx_inc = (u4_ctx_inc | (u1_ctx_inc << ((i1_bins_len - 1) << 2)));
+    }
+    else
+    {
+        pu1_mb_bin_ctxt += 3;
+        if (u4_slice_type == BSLICE)
+            pu1_mb_bin_ctxt += 2;
+
+    }
+
+    u4_code_int_range = ps_cab_enc_env->u4_code_int_range;
+    u4_code_int_low = ps_cab_enc_env->u4_code_int_low;
+    /* bins are coded from bit (i1_bins_len - 1) of u4_bins down to bit 0 */
+    for (i = (i1_bins_len - 1); i >= 0; i--)
+    {
+        WORD32 shift;
+        /* ctxInc nibble and bin value for position i */
+        u1_ctx_inc = ((u4_ctx_inc >> (i << 2)) & 0x0f);
+        u1_bin = ((u4_bins >> i) & 0x01);
+        /* Encode the bin: every position except (i1_bins_len - 2) uses a context model */
+        pu1_bin_ctxt = pu1_mb_bin_ctxt + u1_ctx_inc;
+        if (i != (i1_bins_len - 2))
+        {
+            WORD8 i1_mps = !!((*pu1_bin_ctxt) & (0x40));
+            WORD8 i1_state = (*pu1_bin_ctxt) & 0x3F;
+            /* table column is selected by bits 6-7 of the current range */
+            u2_quant_code_int_range = ((u4_code_int_range >> 6) & 0x03);
+            u4_table_val =
+                            gau4_ih264_cabac_table[i1_state][u2_quant_code_int_range];
+            u4_code_int_range_lps = u4_table_val & 0xFF;
+            /* start from the (bin == MPS) sub-range; corrected below otherwise */
+            u4_code_int_range -= u4_code_int_range_lps;
+            if (u1_bin != i1_mps)
+            {
+                u4_code_int_low += u4_code_int_range;
+                u4_code_int_range = u4_code_int_range_lps;
+                if (i1_state == 0)
+                {
+                    /* MPS(CtxIdx) = 1 - MPS(CtxIdx) */
+                    i1_mps = 1 - i1_mps;
+                }
+                /* next state for the (bin != MPS) path: table bits 15-20 */
+                i1_state = (u4_table_val >> 15) & 0x3F;
+            }
+            else
+            {
+                i1_state = (u4_table_val >> 8) & 0x3F; /* next state for the (bin == MPS) path: table bits 8-13 */
+
+            }
+            /* write back the context model: MPS in bit 6, state in bits 0-5 */
+            (*pu1_bin_ctxt) = (i1_mps << 6) | i1_state;
+        }
+        else
+        {
+            u4_code_int_range -= 2; /* no context model on this path; u1_bin is not consulted */
+        }
+        /* NOTE(review): range -= 2 with low untouched looks like a terminate bin of value 0; entry 25 (0x0203) has bit 0 set - confirm it never reaches this path */
+        /* Renormalize */
+        /*****************************************************************/
+        /* Renormalization; calculate bits generated based on range(R)   */
+        /* Note : 6 <= R < 512; R is 2 only for terminating encode       */
+        /*****************************************************************/
+        GETRANGE(shift, u4_code_int_range);
+        shift = 9 - shift;
+        u4_code_int_low <<= shift;
+        u4_code_int_range <<= shift;
+
+        /* bits to be inserted in the bitstream */
+        ps_cab_enc_env->u4_bits_gen += shift;
+        ps_cab_enc_env->u4_code_int_range = u4_code_int_range;
+        ps_cab_enc_env->u4_code_int_low = u4_code_int_low;
+
+        /* generate stream when a byte is ready */
+        if (ps_cab_enc_env->u4_bits_gen > CABAC_BITS)
+        {
+            ih264e_cabac_put_byte(ps_cabac_ctxt);
+            u4_code_int_range = ps_cab_enc_env->u4_code_int_range;
+            u4_code_int_low = ps_cab_enc_env->u4_code_int_low;
+
+        }
+    }
+}
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Encodes prev_intra4x4_pred_mode_flag and
+ *  rem_intra4x4_pred_mode using CABAC entropy coding mode
+ *
+ * @param[in] ps_cabac_ctxt
+ *  Pointer to cabac context structure
+ *
+ *  @param[in] pu1_intra_4x4_modes
+ *  Pointer to array containing prev_intra4x4_pred_mode_flag and
+ *  rem_intra4x4_pred_mode
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+static void ih264e_cabac_enc_4x4mb_modes(cabac_ctxt_t *ps_cabac_ctxt,
+                                         UWORD8 *pu1_intra_4x4_modes)
+{
+    WORD32 pair_idx;
+    WORD32 half_idx;
+
+    /*
+     * Eight input bytes are read, each carrying two sub blocks: the low
+     * nibble is handled first, then the high nibble.
+     * Within a nibble, bit 0 selects between the two encodings below:
+     * a single bin, or all four bits of the nibble as a bin string.
+     */
+    for (pair_idx = 0; pair_idx < 8; pair_idx++)
+    {
+        /* both nibbles of the current byte */
+        WORD8 packed_modes = *pu1_intra_4x4_modes++;
+
+        /* pass 0: sub blk idx 1, pass 1: sub blk idx 2 */
+        for (half_idx = 0; half_idx < 2; half_idx++)
+        {
+            if (packed_modes & 0x1)
+            {
+                /* bit 0 set: a single bin of value 1 is coded */
+                ih264e_cabac_encode_bin(ps_cabac_ctxt,
+                                        1,
+                                        ps_cabac_ctxt->au1_cabac_ctxt_table
+                                                + PREV_INTRA4X4_PRED_MODE_FLAG);
+            }
+            else
+            {
+                /* bit 0 clear: all four bits of the nibble are coded */
+                /* Binarization is FL and Cmax=7 */
+                ih264e_encode_decision_bins(packed_modes & 0xF,
+                                            4,
+                                            0x05554,
+                                            4,
+                                            ps_cabac_ctxt->au1_cabac_ctxt_table
+                                                + REM_INTRA4X4_PRED_MODE - 5,
+                                            ps_cabac_ctxt);
+            }
+
+            /* move the other nibble of the byte into place */
+            packed_modes >>= 4;
+        }
+
+    }
+
+}
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Encodes chroma  intrapred mode for the MB.
+ *
+ * @param[in] u1_chroma_pred_mode
+ *  Chroma intra prediction mode
+ *
+ * @param[in] ps_cabac_ctxt
+ *  Pointer to cabac context structure
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+static void ih264e_cabac_enc_chroma_predmode(UWORD8 u1_chroma_pred_mode,
+                                             cabac_ctxt_t *ps_cabac_ctxt)
+{
+    mb_info_ctxt_t *ps_curr_ctxt = ps_cabac_ctxt->ps_curr_ctxt_mb_info;
+    mb_info_ctxt_t *ps_left_ctxt = ps_cabac_ctxt->ps_left_ctxt_mb_info;
+    mb_info_ctxt_t *ps_top_ctxt = ps_cabac_ctxt->ps_top_ctxt_mb_info;
+    UWORD32 u4_bins = 0;
+    WORD8 i1_bins_len = 1;
+    UWORD32 u4_ctx_inc = 0;
+
+    /* lowest hex digit: one for each neighbour with a non-zero chroma mode */
+    if (ps_left_ctxt->u1_intrapred_chroma_mode != 0)
+        u4_ctx_inc++;
+    if (ps_top_ctxt->u1_intrapred_chroma_mode != 0)
+        u4_ctx_inc++;
+    /* 0x330: value 3 in hex digits 1 and 2 (presumably ctxInc of bins 1, 2) */
+    u4_ctx_inc |= 0x330;
+
+    /* store the mode in the current MB's context info */
+    ps_curr_ctxt->u1_intrapred_chroma_mode = u1_chroma_pred_mode;
+
+    /* Binarization is TU and Cmax=3 */
+    if (u1_chroma_pred_mode != 0)
+    {
+        WORD8 i1_ones_left;
+
+        /* first 1 of the string; the loop adds one more 1 per remaining step */
+        u4_bins = 1;
+        for (i1_ones_left = u1_chroma_pred_mode - 1; i1_ones_left; i1_ones_left--)
+        {
+            u4_bins |= (1 << i1_bins_len);
+            i1_bins_len++;
+        }
+        /* below Cmax (3) the string is closed by a 0 bin */
+        if (u1_chroma_pred_mode < 3)
+        {
+            i1_bins_len++;
+        }
+    }
+
+    ih264e_encode_decision_bins(u4_bins,
+                                i1_bins_len,
+                                u4_ctx_inc,
+                                3,
+                                ps_cabac_ctxt->au1_cabac_ctxt_table
+                                    + INTRA_CHROMA_PRED_MODE,
+                                ps_cabac_ctxt);
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Encodes CBP for the MB.
+ *
+ * @param[in] u4_cbp
+ *  CBP for the MB
+ *
+ * @param[in] ps_cabac_ctxt
+ *  Pointer to cabac context structure
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+static void ih264e_cabac_enc_cbp(UWORD32 u4_cbp, cabac_ctxt_t *ps_cabac_ctxt)
+{
+    mb_info_ctxt_t *ps_left_ctxt = ps_cabac_ctxt->ps_left_ctxt_mb_info;
+    mb_info_ctxt_t *ps_top_ctxt = ps_cabac_ctxt->ps_top_ctxt_mb_info;
+
+    /* bin string, packed per-bin ctxInc values (4 bits each), bin count */
+    UWORD32 u4_bins = 0;
+    UWORD32 u4_ctx_inc = 0;
+    WORD8 i1_bins_len = 5;
+
+    WORD8 i1_cbp_chroma;
+    WORD8 i1_blk;
+    UWORD8 u1_cond_left, u1_cond_top;
+    UWORD8 u1_ctxt_inc;
+
+    /* CBP Luma, FL, Cmax = 15, L = 4 */
+    for (i1_blk = 0; i1_blk < 4; i1_blk++)
+    {
+        /* calculate ctxtInc from the neighbouring cbp bits */
+        /* u1_ctxt_inc = CondTerm(A) + 2 * CondTerm(B);
+         A: Left block and B: Top block;
+         a CondTerm is 1 when the neighbour's cbp bit is 0 */
+
+        /* Top neighbour */
+        if (i1_blk >> 1)
+        {
+            /* blocks 2, 3: top is bit (i1_blk - 2) of the current MB's cbp */
+            u1_cond_top = ((u4_cbp >> (i1_blk - 2)) & 0x01) ? 0 : 1;
+        }
+        else
+        {
+            /* blocks 0, 1: top is bit (i1_blk + 2) of the top MB's cbp */
+            u1_cond_top = ((ps_top_ctxt->u1_cbp >> (i1_blk + 2)) & 0x01) ? 0 : 1;
+        }
+
+        /* Left neighbour */
+        if (i1_blk & 0x01)
+        {
+            /* blocks 1, 3: left is bit (i1_blk - 1) of the current MB's cbp */
+            u1_cond_left = ((u4_cbp >> (i1_blk - 1)) & 0x01) ? 0 : 1;
+        }
+        else
+        {
+            /* blocks 0, 2: left is bit (i1_blk + 1) of the left MB's cbp */
+            u1_cond_left = ((ps_left_ctxt->u1_cbp >> (i1_blk + 1)) & 0x01) ? 0 : 1;
+        }
+
+        /* pack the ctxInc nibble and the bin value at position i1_blk */
+        u1_ctxt_inc = u1_cond_left + 2 * u1_cond_top;
+        u4_ctx_inc |= (u1_ctxt_inc << (i1_blk << 2));
+        u4_bins |= (((u4_cbp >> i1_blk) & 0x01) << i1_blk);
+    }
+
+    /* CBP Chroma, TU, Cmax = 2 */
+    i1_cbp_chroma = u4_cbp >> 4;
+
+    /* bin 4: neighbours whose cbp exceeds 15 contribute, offset by 4 */
+    u1_cond_left = (ps_left_ctxt->u1_cbp > 15) ? 1 : 0;
+    u1_cond_top = (ps_top_ctxt->u1_cbp > 15) ? 1 : 0;
+    u1_ctxt_inc = u1_cond_left + 2 * u1_cond_top;
+    u4_ctx_inc |= ((4 + u1_ctxt_inc) << 16);
+
+    if (i1_cbp_chroma)
+    {
+        /* bin 4 is 1 when the chroma part of the cbp is non-zero */
+        u4_bins |= 0x10;
+
+        /* bin 5: neighbours whose cbp exceeds 31 contribute, offset by 8 */
+        u1_cond_left = (ps_left_ctxt->u1_cbp > 31) ? 1 : 0;
+        u1_cond_top = (ps_top_ctxt->u1_cbp > 31) ? 1 : 0;
+        u1_ctxt_inc = u1_cond_left + 2 * u1_cond_top;
+        u4_ctx_inc |= ((8 + u1_ctxt_inc) << 20);
+
+        /* bin 5 carries bit 1 of the chroma cbp; the string grows to 6 bins */
+        u4_bins |= (((i1_cbp_chroma >> 1) & 0x01) << i1_bins_len);
+        i1_bins_len++;
+    }
+
+    /* context models are addressed relative to CBP_LUMA */
+    ih264e_encode_decision_bins(u4_bins, i1_bins_len, u4_ctx_inc, 8,
+                                ps_cabac_ctxt->au1_cabac_ctxt_table + CBP_LUMA,
+                                ps_cabac_ctxt);
+
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Binarizes the signed mb_qp_delta of the MB and encodes it with CABAC.
+ *
+ * @param[in] i1_mb_qp_delta
+ *  mb_qp_delta to encode; asserted to lie in the range -26 to +25
+ *
+ * @param[in] ps_cabac_ctxt
+ *  Pointer to cabac context structure; i1_prevps_mb_qp_delta_ctxt is updated
+ *
+ * @returns
+ *  None
+ * @remarks
+ *  codeNum is 0 for 0, 2d - 1 for d > 0 and 2|d| for d < 0. It is sent as
+ *  codeNum 1 bins closed by one 0 bin, in two calls when over 32 bins long
+ *******************************************************************************
+ */
+static void ih264e_cabac_enc_mb_qp_delta(WORD8 i1_mb_qp_delta,
+                                         cabac_ctxt_t *ps_cabac_ctxt)
+{
+    UWORD8 u1_code_num;
+    UWORD8 u1_ctxt_inc;
+
+    UWORD32 u4_ctx_inc;
+    UWORD32 u4_bins;
+    WORD8 i1_bins_len;
+    UWORD8 u1_ctx_inc, u1_bin;
+    /* Valid range of i1_mb_qp_delta is -26 to +25 inclusive */
+        ASSERT((i1_mb_qp_delta < 26) && (i1_mb_qp_delta > -27));
+    /* codeNum stays 0 when i1_mb_qp_delta is 0 */
+    u1_code_num = 0;
+    if (i1_mb_qp_delta > 0)
+        u1_code_num = (i1_mb_qp_delta << 1) - 1; /* positive d maps to 2d - 1 */
+    else if (i1_mb_qp_delta < 0)
+        u1_code_num = (ABS(i1_mb_qp_delta)) << 1; /* negative d maps to 2|d| */
+
+    u4_ctx_inc = 0;
+    u4_bins = 0;
+    i1_bins_len = 1;
+    /* ctxtInc of bin 0 is 1 when the previously stored mb_qp_delta is nonzero */
+    u1_ctxt_inc = (!(!(ps_cabac_ctxt->i1_prevps_mb_qp_delta_ctxt)));
+    ps_cabac_ctxt->i1_prevps_mb_qp_delta_ctxt = i1_mb_qp_delta; /* kept for the next call */
+
+    if (u1_code_num == 0)
+    {
+        /* b0: a single 0 bin, since u4_bins is still 0 here */
+        u1_bin = (UWORD8) (u4_bins);
+        u1_ctx_inc = u1_ctxt_inc & 0x0f;
+        /* Encode the bin with the context selected by u1_ctx_inc */
+        ih264e_cabac_encode_bin(ps_cabac_ctxt,
+                                u1_bin,
+                                ps_cabac_ctxt->au1_cabac_ctxt_table + MB_QP_DELTA
+                                        + u1_ctx_inc);
+
+    }
+    else
+    {
+        /* b0 = 1; its ctxtInc occupies the low nibble of u4_ctx_inc */
+        u4_ctx_inc = u1_ctxt_inc;
+        u4_bins = 1;
+        u1_code_num--;
+        if (u1_code_num == 0)
+        {
+            /* b1 = 0 closes the string; 0x20 places the value 2 in nibble 1 */
+            u4_ctx_inc = (u4_ctx_inc | 0x20);
+            i1_bins_len++;
+            ih264e_encode_decision_bins(u4_bins, i1_bins_len, u4_ctx_inc, 3,
+                                        ps_cabac_ctxt->au1_cabac_ctxt_table + MB_QP_DELTA,
+                                        ps_cabac_ctxt);
+        }
+        else
+        {
+            /* b1 = 1; 0x20 places the value 2 in nibble 1 */
+            u4_ctx_inc = (u4_ctx_inc | 0x20);
+            u4_bins = (u4_bins | (1 << i1_bins_len));
+            i1_bins_len++;
+            u1_code_num--;
+            /* BinIdx from b2 onwards; 0x300 places the value 3 in nibble 2 */
+            if (u1_code_num < 30)
+            { /* at most 29 more 1 bins: i1_bins_len <= 31 before the closing 0 */
+                while (u1_code_num)
+                {
+                    u4_bins = (u4_bins | (1 << i1_bins_len));
+                    i1_bins_len++;
+                    u1_code_num--;
+                };
+                u4_ctx_inc = (u4_ctx_inc | 0x300);
+                i1_bins_len++; /* the closing 0 bin: counted, no bit set */
+                ih264e_encode_decision_bins(u4_bins,
+                                            i1_bins_len,
+                                            u4_ctx_inc,
+                                            2,
+                                            ps_cabac_ctxt->au1_cabac_ctxt_table
+                                                + MB_QP_DELTA,
+                                            ps_cabac_ctxt);
+            }
+            else
+            {
+                /* more than 32 bins in total (at most 53): send 32 ones first */
+                u4_bins = 0xffffffff;
+                i1_bins_len = 32;
+                u4_ctx_inc = (u4_ctx_inc | 0x300);
+                u1_code_num -= 30; /* b2 to b31 are covered by the first call */
+                ih264e_encode_decision_bins(u4_bins,
+                                            i1_bins_len,
+                                            u4_ctx_inc,
+                                            2,
+                                            ps_cabac_ctxt->au1_cabac_ctxt_table
+                                                + MB_QP_DELTA,
+                                            ps_cabac_ctxt);
+                u4_bins = 0;
+                i1_bins_len = 0;
+                u4_ctx_inc = 0x033; /* becomes 0x333 below: the value 3 in each nibble */
+                while (u1_code_num)
+                {
+                    u4_bins = (u4_bins | (1 << i1_bins_len));
+                    i1_bins_len++;
+                    u1_code_num--;
+                };
+
+                u4_ctx_inc = (u4_ctx_inc | 0x300);
+                i1_bins_len++; /* the closing 0 bin: counted, no bit set */
+                ih264e_encode_decision_bins(u4_bins,
+                                            i1_bins_len,
+                                            u4_ctx_inc,
+                                            1,
+                                            ps_cabac_ctxt->au1_cabac_ctxt_table
+                                                + MB_QP_DELTA,
+                                            ps_cabac_ctxt);
+            }
+        }
+    }
+}
+
+
+
+
+/**
+ *******************************************************************************
+ * @brief
+ *  Encodes the coefficients of one residual block (residual_block_cabac, 7.3.5.3.3).
+ *
+ * @param[in] pi2_res_block
+ *  Pointer to the packed residues; read from entry u1_nnz - 1 downwards
+ *
+ * @param[in]  u1_nnz
+ *  Number of non zero coeffs in the block
+ *
+ * @param[in] u1_max_num_coeffs
+ *  Bound on the scan index: flags are coded only for indices below this value
+ *
+ * @param[in] u2_sig_coeff_map
+ *  Significant coeff map; bit i supplies the significance flag of scan index i
+ *
+ * @param[in] u4_ctx_cat_offset
+ *  Offset into au1_cabac_ctxt_table of the absolute level contexts to use
+ *
+ * @param[in]  pu1_ctxt_sig_coeff
+ *  Base of the significance flag context models; last flag models are relative
+ *
+ * @param[in] ps_cabac_ctxt
+ *  Pointer to cabac context structure
+ *
+ * @returns
+ *  None
+ * @remarks
+ *  Does not write coded_block_flag. u2_sig_coeff_map is expected to be nonzero
+ *  (NOTE(review): CLZ of a zero map is not handled here - confirm with callers)
+ *******************************************************************************
+ */
+static void ih264e_cabac_write_coeff4x4(WORD16 *pi2_res_block, UWORD8 u1_nnz,
+                                        UWORD8 u1_max_num_coeffs,
+                                        UWORD16 u2_sig_coeff_map,
+                                        UWORD32 u4_ctx_cat_offset,
+                                        bin_ctxt_model *pu1_ctxt_sig_coeff,
+                                        cabac_ctxt_t *ps_cabac_ctxt)
+{
+
+    WORD8 i;
+    WORD16 *pi16_coeffs;
+    UWORD32 u4_sig_coeff, u4_bins;
+    UWORD32 u4_ctx_inc;
+    UWORD8 u1_last_sig_coef_index = (31 - CLZ(u2_sig_coeff_map)); /* presumably the highest set bit of the map - confirm CLZ width */
+
+    /* The coded block flag is not written here; the callers in this file encode it first */
+
+        pi16_coeffs = pi2_res_block;
+        {
+            bin_ctxt_model *pu1_bin_ctxt;
+            UWORD8 u1_bin, uc_last;
+
+            i = 0;
+            pu1_bin_ctxt = pu1_ctxt_sig_coeff;
+            u4_sig_coeff = 0; /* gets one bit per index whose significance flag was coded as 1 */
+            u1_bin = 1; /* index 0 is coded as significant when it is also the last index */
+            if ((u1_last_sig_coef_index))
+            {
+                u1_bin = !!(u2_sig_coeff_map & 01);
+            }
+            uc_last = 1; /* 1: u1_bin holds a significance flag, 0: it holds a last flag */
+
+            do
+            {
+                /* Encode the pending flag with the context chosen for it */
+                ih264e_cabac_encode_bin(ps_cabac_ctxt, u1_bin, pu1_bin_ctxt);
+
+                if (u1_bin & uc_last)
+                {
+                    u4_sig_coeff = (u4_sig_coeff | (1 << i)); /* a significance flag of 1 was just coded */
+                    pu1_bin_ctxt = pu1_ctxt_sig_coeff + i
+                                    + LAST_SIGNIFICANT_COEFF_FLAG_FRAME
+                                    - SIGNIFICANT_COEFF_FLAG_FRAME;
+                    u1_bin = (i == u1_last_sig_coef_index); /* last flag of index i comes next */
+                    uc_last = 0;
+                }
+                else
+                {
+                    i = i + 1; /* move on to the significance flag of the next index */
+                    pu1_bin_ctxt = pu1_ctxt_sig_coeff + i;
+                    u1_bin = (i == u1_last_sig_coef_index); /* the last index is always coded as significant */
+                    uc_last = 1;
+                    if ((i != u1_last_sig_coef_index))
+                    {
+                        u1_bin = !!((u2_sig_coeff_map >> i) & 01);
+                    }
+                }
+            }while (!((i > u1_last_sig_coef_index)
+                            || (i > (u1_max_num_coeffs - 1)))); /* stop past the last index or at the bound */
+        }
+
+        /* Encode coeff_abs_level_minus1 and coeff_sign_flag, highest index first */
+        {
+            UWORD8 u1_sign;
+            UWORD16 u2_abs_level;
+            UWORD8 u1_abs_level_equal1 = 1, u1_abs_level_gt1 = 0;
+            UWORD8 u1_ctx_inc;
+            UWORD8 u1_coff;
+            WORD16 i2_sufs;
+            WORD8 i1_bins_len;
+            i = u1_last_sig_coef_index;
+            pi16_coeffs = pi2_res_block + u1_nnz - 1; /* final packed level; the pointer walks backwards */
+            do
+            {
+                {
+                    u4_sig_coeff = u4_sig_coeff & ((1 << i) - 1); /* drop index i and everything above it */
+                    u4_bins = 0;
+                    u4_ctx_inc = 0;
+                    i1_bins_len = 1;
+                    /* Value to encode is the absolute level minus 1 */
+                    u2_abs_level = ABS(*(pi16_coeffs)) - 1;
+                    /* CtxInc for bin0 goes in the low nibble */
+                    u4_ctx_inc = MIN(u1_abs_level_equal1, 4);
+                    /* CtxInc for the remaining bins goes in the next nibble */
+                    u1_ctx_inc = 5 + MIN(u1_abs_level_gt1, 4);
+                    u4_ctx_inc = u4_ctx_inc + (u1_ctx_inc << 4);
+                    if (u2_abs_level)
+                    {
+                        u1_abs_level_gt1++; /* counts levels above 1 seen so far */
+                        u1_abs_level_equal1 = 0;
+                    }
+                    if (!u1_abs_level_gt1)
+                        u1_abs_level_equal1++; /* grows only until a level above 1 is seen */
+
+                    u1_coff = 14; /* longest prefix; larger values get a suffix */
+                    if (u2_abs_level >= u1_coff)
+                    {
+                        /* Prefix TU i.e. a string of 14 1's with no closing 0 */
+                        u4_bins = 0x3fff;
+                        i1_bins_len = 14;
+                        ih264e_encode_decision_bins(u4_bins, i1_bins_len,
+                                                    u4_ctx_inc, 1, ps_cabac_ctxt->au1_cabac_ctxt_table
+                                                    + u4_ctx_cat_offset,
+                                                    ps_cabac_ctxt);
+
+                        /* Suffix holds the excess over the prefix and uses EncodeBypass */
+                        i2_sufs = u2_abs_level - u1_coff;
+
+                        u4_bins = ih264e_cabac_UEGk0_binarization(i2_sufs,
+                                                                  &i1_bins_len);
+
+                        ih264e_cabac_encode_bypass_bins(ps_cabac_ctxt, u4_bins,
+                                                        i1_bins_len);
+
+                    }
+                    else
+                    {
+                        /* Prefix only: u2_abs_level 1 bits in the low positions */
+                        u4_bins = (1 << u2_abs_level) - 1;
+                        i1_bins_len = u2_abs_level + 1;
+                        /* The extra bin counted above is the terminating 0 */
+                        ih264e_encode_decision_bins(u4_bins, i1_bins_len,
+                                                    u4_ctx_inc, 1, ps_cabac_ctxt->au1_cabac_ctxt_table
+                                                    + u4_ctx_cat_offset,
+                                                    ps_cabac_ctxt);
+                    }
+                }
+                /* encode coeff_sign_flag[i] as a bypass bin: 1 for negative levels */
+                u1_sign = ((*pi16_coeffs) < 0) ? 1 : 0;
+                ih264e_cabac_encode_bypass_bins(ps_cabac_ctxt, u1_sign, 1);
+                i = CLZ(u4_sig_coeff); /* next lower index still marked significant */
+                i = 31 - i;
+                pi16_coeffs--;
+            }while (u4_sig_coeff); /* done once no marked index is left */
+        }
+
+}
+
+
+/**
+ *******************************************************************************
+ * @brief
+ *  Encodes the luma DC block of the MB: its coded_block_flag and, when the
+ *  block has nonzero coefficients, the coefficients themselves
+ * @param[in] ps_ent_ctxt
+ *  Pointer to entropy context structure; pv_mb_coeff_data is read and rewritten
+ *
+ * @returns
+ *  None
+ * @remarks
+ *  Bit 0 of the left and current MB yuv_dc_csbp values is set to the flag
+ *
+ *******************************************************************************
+ */
+static void ih264e_cabac_encode_residue_luma_dc(entropy_ctxt_t *ps_ent_ctxt)
+{
+
+    /* CABAC state and the MB contexts it points at */
+    cabac_ctxt_t *ps_cabac = ps_ent_ctxt->ps_cabac;
+    mb_info_ctxt_t *ps_top_mb = ps_cabac->ps_top_ctxt_mb_info;
+    mb_info_ctxt_t *ps_curr_mb = ps_cabac->ps_curr_ctxt_mb_info;
+
+    /* Read cursor into the packed residue and the fields parsed from it */
+    void *pv_mb_coeff_data = ps_ent_ctxt->pv_mb_coeff_data;
+    tu_sblk_coeff_data_t *ps_mb_coeff_data;
+    WORD16 *pi2_res_block;
+    UWORD16 u2_sig_coeff_map;
+    UWORD8 u1_nnz;
+
+    /* coded_block_flag of the block and the context increment used for it */
+    UWORD8 u1_coded;
+    UWORD32 u4_cbf_ctx_inc;
+
+    /* Parse the next packed block (the macro is defined outside this view) */
+    PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, u1_nnz,
+                               u2_sig_coeff_map, pi2_res_block);
+    u1_coded = (u1_nnz != 0);
+
+    /* ctxIdxInc = A + 2 * B, with A and B taken from bit 0 of the left and */
+    /* top DC flag values respectively                                      */
+    u4_cbf_ctx_inc = (ps_cabac->pu1_left_yuv_dc_csbp[0] & 0x1);
+    u4_cbf_ctx_inc += ((ps_top_mb->u1_yuv_dc_csbp & 0x1) << 1);
+
+    /* Encode coded_block_flag */
+    ih264e_cabac_encode_bin(ps_cabac, u1_coded,
+                            ps_cabac->au1_cabac_ctxt_table + CBF
+                                    + (LUMA_DC_CTXCAT << 2) + u4_cbf_ctx_inc);
+
+    if (!u1_coded)
+    {
+        /* Empty block: only bits 1 and 2 of the flag values are kept */
+        ps_cabac->pu1_left_yuv_dc_csbp[0] &= 0x6;
+        ps_curr_mb->u1_yuv_dc_csbp &= 0x6;
+    }
+    else
+    {
+        bin_ctxt_model *pu1_sig_ctxt = ps_cabac->au1_cabac_ctxt_table
+                                        + SIGNIFICANT_COEFF_FLAG_FRAME
+                                        + SIG_COEFF_CTXT_CAT_0_OFFSET;
+        UWORD32 u4_abs_lvl_offset = COEFF_ABS_LEVEL_MINUS1
+                                        + COEFF_ABS_LEVEL_CAT_0_OFFSET;
+
+        ih264e_cabac_write_coeff4x4(pi2_res_block, u1_nnz, 15, u2_sig_coeff_map,
+                                    u4_abs_lvl_offset, pu1_sig_ctxt, ps_cabac);
+
+        /* Record the coded block in bit 0 of both flag values */
+        ps_cabac->pu1_left_yuv_dc_csbp[0] |= 0x1;
+        ps_curr_mb->u1_yuv_dc_csbp |= 0x1;
+    }
+
+    /* Store the read cursor, as left by the parse macro, back in the context */
+    ps_ent_ctxt->pv_mb_coeff_data = pv_mb_coeff_data;
+}
+
+
+
+
+/**
+ *******************************************************************************
+ * @brief
+ *  Encodes the chroma residue of the MB: two DC blocks, then eight AC blocks
+ *
+ * @param[in] ps_ent_ctxt
+ *  Pointer to entropy context structure; pv_mb_coeff_data is read and rewritten
+ *
+ * @param[in] u1_chroma_cbp
+ *  Chroma coded block pattern; AC blocks are encoded only when it equals 2
+ *
+ * @returns
+ *  None
+ * @remarks
+ *  Keeps the DC and AC coded block flag bits of the left and current MB contexts
+ *  up to date; those bits feed the context increments computed here
+ *******************************************************************************
+ */
+static void ih264e_cabac_write_chroma_residue(entropy_ctxt_t *ps_ent_ctxt,
+                                              UWORD8 u1_chroma_cbp)
+{
+
+    /* CABAC state and the MB contexts it points at */
+    cabac_ctxt_t *ps_cabac = ps_ent_ctxt->ps_cabac;
+    mb_info_ctxt_t *ps_top_mb = ps_cabac->ps_top_ctxt_mb_info;
+    mb_info_ctxt_t *ps_curr_mb = ps_cabac->ps_curr_ctxt_mb_info;
+
+    /* Read cursor into the packed residue and the fields parsed from it */
+    void *pv_mb_coeff_data = ps_ent_ctxt->pv_mb_coeff_data;
+    tu_sblk_coeff_data_t *ps_mb_coeff_data;
+    WORD16 *pi2_res_block;
+    UWORD16 u2_sig_coeff_map;
+    UWORD8 u1_nnz;
+
+    /* Working copies of the neighbouring coded block flags, one bit per block */
+    UWORD8 u1_left_flags;
+    UWORD8 u1_top_flags;
+
+    /* Block counter, coded_block_flag of the block and its context increment */
+    UWORD8 u1_blk;
+    UWORD8 u1_coded;
+    UWORD32 u4_cbf_ctx_inc;
+
+    /**********************************************************************/
+    /* Chroma DC: two blocks; their flags sit above bit 0 of the          */
+    /* yuv_dc_csbp values                                                 */
+    /**********************************************************************/
+    u1_left_flags = (ps_cabac->pu1_left_yuv_dc_csbp[0]) >> 1;
+    u1_top_flags = (ps_top_mb->u1_yuv_dc_csbp) >> 1;
+
+    for (u1_blk = 0; u1_blk < 2; u1_blk++)
+    {
+        PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data,
+                                   u1_nnz, u2_sig_coeff_map, pi2_res_block);
+        u1_coded = (u1_nnz != 0);
+
+        /* ctxIdxInc = A + 2 * B from this block's bit in the left / top flags */
+        u4_cbf_ctx_inc = (u1_left_flags >> u1_blk) & 0x01;
+        u4_cbf_ctx_inc += ((u1_top_flags >> u1_blk) & 0x01) << 1;
+
+        /* Encode coded_block_flag */
+        ih264e_cabac_encode_bin(ps_cabac,
+                                u1_coded,
+                                ps_cabac->au1_cabac_ctxt_table + CBF
+                                        + (CHROMA_DC_CTXCAT << 2)
+                                        + u4_cbf_ctx_inc);
+
+        /* An empty block only needs its flag bits cleared */
+        if (!u1_coded)
+        {
+            CLEARBIT(u1_top_flags, u1_blk);
+            CLEARBIT(u1_left_flags, u1_blk);
+            continue;
+        }
+
+        /* Encode the coefficients and mark the block as coded */
+        ih264e_cabac_write_coeff4x4(pi2_res_block,
+                                    u1_nnz,
+                                    3,
+                                    u2_sig_coeff_map,
+                                    COEFF_ABS_LEVEL_MINUS1
+                                        + COEFF_ABS_LEVEL_CAT_3_OFFSET,
+                                    ps_cabac->au1_cabac_ctxt_table
+                                        + SIGNIFICANT_COEFF_FLAG_FRAME
+                                        + SIG_COEFF_CTXT_CAT_3_OFFSET,
+                                    ps_cabac);
+
+        SETBIT(u1_top_flags, u1_blk);
+        SETBIT(u1_left_flags, u1_blk);
+    }
+
+    /* Write the DC flags back above bit 0; bit 0 itself is preserved */
+    ps_cabac->pu1_left_yuv_dc_csbp[0] &= 0x1;
+    ps_curr_mb->u1_yuv_dc_csbp &= 0x1;
+    ps_cabac->pu1_left_yuv_dc_csbp[0] |= (u1_left_flags << 1);
+    ps_curr_mb->u1_yuv_dc_csbp |= (u1_top_flags << 1);
+
+    /**********************************************************************/
+    /* Chroma AC: encoded only when the chroma cbp equals 2               */
+    /**********************************************************************/
+    if (u1_chroma_cbp != 2)
+    {
+        /* No AC blocks: zero the left flags, keep only our low four bits */
+        ps_cabac->pu1_left_uv_ac_csbp[0] = 0;
+        ps_curr_mb->u1_yuv_ac_csbp &= 0xf;
+    }
+    else
+    {
+        /* Left flags are used as stored; top flags are taken from bit 4 up */
+        u1_left_flags = ps_cabac->pu1_left_uv_ac_csbp[0];
+        u1_top_flags = ps_top_mb->u1_yuv_ac_csbp >> 4;
+
+        for (u1_blk = 0; u1_blk < 8; u1_blk++)
+        {
+            /* Bit 2 of the block number, moved down to bit 1, is shared by  */
+            /* both flag positions; bit 0 completes the top position and    */
+            /* bit 1 the left position                                      */
+            UWORD8 u1_hi_bit = (u1_blk & 0x4) >> 1;
+            UWORD8 u1_top_pos = u1_hi_bit | (u1_blk & 0x1);
+            UWORD8 u1_left_pos = u1_hi_bit | ((u1_blk & 0x2) >> 1);
+
+            PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data,
+                                       u1_nnz, u2_sig_coeff_map,
+                                       pi2_res_block);
+            u1_coded = (u1_nnz != 0);
+
+            u4_cbf_ctx_inc = (u1_left_flags >> u1_left_pos) & 0x1;
+            u4_cbf_ctx_inc += ((u1_top_flags >> u1_top_pos) & 0x1) << 1;
+
+            /* Encode coded_block_flag */
+            ih264e_cabac_encode_bin(ps_cabac,
+                                    u1_coded,
+                                    ps_cabac->au1_cabac_ctxt_table + CBF
+                                            + (CHROMA_AC_CTXCAT << 2)
+                                            + u4_cbf_ctx_inc);
+
+            /* An empty block only needs its flag bits cleared */
+            if (!u1_coded)
+            {
+                CLEARBIT(u1_left_flags, u1_left_pos);
+                CLEARBIT(u1_top_flags, u1_top_pos);
+                continue;
+            }
+
+            /* Encode the coefficients and mark the block as coded */
+            ih264e_cabac_write_coeff4x4(pi2_res_block,
+                                        u1_nnz,
+                                        14,
+                                        u2_sig_coeff_map,
+                                        COEFF_ABS_LEVEL_MINUS1
+                                            + COEFF_ABS_LEVEL_CAT_4_OFFSET,
+                                        ps_cabac->au1_cabac_ctxt_table
+                                            + SIGNIFICANT_COEFF_FLAG_FRAME
+                                            + SIG_COEFF_CTXT_CAT_4_OFFSET,
+                                        ps_cabac);
+
+            SETBIT(u1_left_flags, u1_left_pos);
+            SETBIT(u1_top_flags, u1_top_pos);
+        }
+
+        /* Left flags are stored whole; ours are stored from bit 4 upwards */
+        ps_cabac->pu1_left_uv_ac_csbp[0] = u1_left_flags;
+        ps_curr_mb->u1_yuv_ac_csbp &= 0x0f;
+        ps_curr_mb->u1_yuv_ac_csbp |= (u1_top_flags << 4);
+    }
+
+    /* Store the read cursor, as left by the parse macro, back in the context */
+    ps_ent_ctxt->pv_mb_coeff_data = pv_mb_coeff_data;
+}
+
+
+
+
+/**
+ *******************************************************************************
+ * @brief
+ *  Encodes the luma and chroma residues of the MB as defined in 7.3.5.3
+ *
+ * @param[in] ps_ent_ctxt
+ *  Pointer to entropy context structure; pv_mb_coeff_data is read and rewritten
+ *
+ * @param[in] u4_cbp
+ *  Coded block pattern: bits 0 to 3 are the luma 8x8 flags, bits 4 and up chroma
+ *
+ * @param[in] u1_ctx_cat
+ *  Context category, LUMA_AC_CTXCAT or LUMA_4x4_CTXCAT
+ *
+ * @returns
+ *  None
+ * @remarks
+ *  Updates the luma AC coded block flag bits of the left and current MB contexts
+ *  and clears the chroma flag bits itself when no chroma residue is coded
+ *******************************************************************************
+ */
+static void ih264e_cabac_encode_residue(entropy_ctxt_t *ps_ent_ctxt,
+                                        UWORD32 u4_cbp, UWORD8 u1_ctx_cat)
+{
+    /* CABAC context */
+    cabac_ctxt_t *ps_cabac_ctxt = ps_ent_ctxt->ps_cabac;
+
+    tu_sblk_coeff_data_t *ps_mb_coeff_data;
+    /* packed residue */
+    void *pv_mb_coeff_data = ps_ent_ctxt->pv_mb_coeff_data;
+    UWORD16 u2_sig_coeff_map;
+    UWORD8 u1_nnz;
+    mb_info_ctxt_t *ps_curr_ctxt;
+    mb_info_ctxt_t *ps_top_ctxt;
+    UWORD8 u1_left_ac_csbp;
+    UWORD8 u1_top_ac_csbp;
+    UWORD32 u4_ctx_idx_offset_sig_coef, u4_ctx_idx_offset_abs_lvl;
+    ps_curr_ctxt = ps_cabac_ctxt->ps_curr_ctxt_mb_info;
+    ps_top_ctxt = ps_cabac_ctxt->ps_top_ctxt_mb_info;
+    u1_left_ac_csbp = ps_cabac_ctxt->pu1_left_y_ac_csbp[0];
+    u1_top_ac_csbp = ps_top_ctxt->u1_yuv_ac_csbp;
+
+    if (u4_cbp & 0xf)
+    {
+        /*  Write luma residue  */
+        UWORD8 u1_offset;
+        WORD16 *pi2_res_block;
+        UWORD8 u1_subblk_num;
+        if (u1_ctx_cat == LUMA_AC_CTXCAT)
+        {
+            u1_offset = 1; /* shrinks the index bound passed below from 15 to 14 */
+            u4_ctx_idx_offset_sig_coef = SIG_COEFF_CTXT_CAT_1_OFFSET;
+            u4_ctx_idx_offset_abs_lvl = COEFF_ABS_LEVEL_MINUS1
+                                      + COEFF_ABS_LEVEL_CAT_1_OFFSET;
+        }
+        else
+        {
+            u1_offset = 0;
+            u4_ctx_idx_offset_sig_coef = SIG_COEFF_CTXT_CAT_2_OFFSET;
+            u4_ctx_idx_offset_abs_lvl = COEFF_ABS_LEVEL_MINUS1
+                                        + COEFF_ABS_LEVEL_CAT_2_OFFSET;
+        }
+
+        for (u1_subblk_num = 0; u1_subblk_num < 16; u1_subblk_num++)
+        {
+            UWORD8 u1_b0, u1_b1, u1_b2, u1_b3, u1_b2b0, u1_b3b1, u1_b3b2;
+            u1_b0 = (u1_subblk_num & 0x1);
+            u1_b1 = (u1_subblk_num & 0x2) >> 1;
+            u1_b2 = (u1_subblk_num & 0x4) >> 2;
+            u1_b3 = (u1_subblk_num & 0x8) >> 3;
+            u1_b2b0 = (u1_b2 << 1) | (u1_b0); /* bit position in the top flags */
+            u1_b3b1 = (u1_b3 << 1) | (u1_b1); /* bit position in the left flags */
+            u1_b3b2 = (u1_b3 << 1) | (u1_b2); /* cbp bit of the enclosing 8x8 block */
+
+            if (!((u4_cbp >> u1_b3b2) & 0x1))
+            {
+                /* ---------------------------------------------------------- */
+                /* The current 8x8 block is not coded, so skip all four of   */
+                /* its sub blocks and clear their csbp bits accordingly      */
+                /* ---------------------------------------------------------- */
+                CLEARBIT(u1_top_ac_csbp, u1_b2b0);
+                CLEARBIT(u1_top_ac_csbp, (u1_b2b0 + 1));
+                CLEARBIT(u1_left_ac_csbp, u1_b3b1);
+                CLEARBIT(u1_left_ac_csbp, (u1_b3b1 + 1));
+
+                u1_subblk_num += 3; /* with the loop increment this skips four sub blocks */
+            }
+            else
+            {
+                UWORD8 u1_csbf; /* declared at scope top, like the rest of the file */
+                PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data,
+                                           u1_nnz, u2_sig_coeff_map,
+                                           pi2_res_block);
+                u1_csbf = !!(u1_nnz);
+                {
+                    UWORD8 u1_a, u1_b;
+                    UWORD32 u4_ctx_inc;
+                    u1_b = (u1_top_ac_csbp >> u1_b2b0) & 0x01;
+                    u1_a = (u1_left_ac_csbp >> u1_b3b1) & 0x01;
+                    u4_ctx_inc = u1_a + (u1_b << 1);
+
+                    /* Encode coded_block_flag of the sub block */
+                    ih264e_cabac_encode_bin(ps_cabac_ctxt,
+                                            u1_csbf,
+                                            ps_cabac_ctxt->au1_cabac_ctxt_table + CBF
+                                                + (u1_ctx_cat << 2) + u4_ctx_inc);
+
+                }
+                /**************************/
+                /* Write the coefficients */
+                /**************************/
+                if (u1_csbf)
+                {
+                    ih264e_cabac_write_coeff4x4(pi2_res_block,
+                                                u1_nnz,
+                                                (UWORD8) (15 - u1_offset),
+                                                u2_sig_coeff_map,
+                                                u4_ctx_idx_offset_abs_lvl,
+                                                ps_cabac_ctxt->au1_cabac_ctxt_table
+                                                    + SIGNIFICANT_COEFF_FLAG_FRAME
+                                                        + u4_ctx_idx_offset_sig_coef,
+                                                ps_cabac_ctxt);
+
+                    SETBIT(u1_top_ac_csbp, u1_b2b0);
+                    SETBIT(u1_left_ac_csbp, u1_b3b1);
+                }
+                else
+                {
+                    CLEARBIT(u1_top_ac_csbp, u1_b2b0);
+                    CLEARBIT(u1_left_ac_csbp, u1_b3b1);
+                }
+            }
+        }
+        /**************************************************************************/
+        /*   Update the AC csbp: luma flags live in the low four bits             */
+        /**************************************************************************/
+        ps_cabac_ctxt->pu1_left_y_ac_csbp[0] = u1_left_ac_csbp & 0xf;
+        u1_top_ac_csbp &= 0x0f;
+        ps_curr_ctxt->u1_yuv_ac_csbp &= 0xf0;
+        ps_curr_ctxt->u1_yuv_ac_csbp |= u1_top_ac_csbp;
+    }
+    else
+    {
+        /* No luma residue: zero the left flags and our low four bits */
+        ps_cabac_ctxt->pu1_left_y_ac_csbp[0] = 0;
+        ps_curr_ctxt->u1_yuv_ac_csbp &= 0xf0;
+    }
+
+    /* Write chroma residue; the callee reads the cursor from ps_ent_ctxt */
+
+    ps_ent_ctxt->pv_mb_coeff_data = pv_mb_coeff_data;
+    {
+        UWORD8 u1_cbp_chroma;
+        u1_cbp_chroma = u4_cbp >> 4;
+        if (u1_cbp_chroma)
+        {
+            ih264e_cabac_write_chroma_residue(ps_ent_ctxt, u1_cbp_chroma);
+        }
+        else
+        {
+            /* No chroma residue: keep only the luma bits of the flag values */
+            ps_cabac_ctxt->pu1_left_yuv_dc_csbp[0] &= 0x1;
+            ps_curr_ctxt->u1_yuv_dc_csbp &= 0x1;
+            ps_cabac_ctxt->pu1_left_uv_ac_csbp[0] = 0;
+            ps_curr_ctxt->u1_yuv_ac_csbp &= 0xf;
+        }
+    }
+}
+
+/**
+ *******************************************************************************
+ * @brief
+ *  Encodes one motion vector difference component (9.3.3.1.1.7)
+ *
+ * @param[in] u1_mvd
+ *  Signed value to be encoded (16 bit, despite the u1_ prefix)
+ *
+ * @param[in] u4_ctx_idx_offset
+ *  Offset into au1_cabac_ctxt_table of the contexts to use (MVD_X or MVD_Y)
+ *
+ * @param[in]  ui2_abs_mvd
+ *  Sum of the neighbouring absolute values; selects the ctxIdxInc of bin 0
+ *
+ * @param[in] ps_cabac_ctxt
+ *  Pointer to cabac context structure
+ *
+ * @returns
+ *  None
+ * @remarks
+ *  Magnitudes 1 to 8 are sent as a unary prefix only; 9 and above as nine 1
+ *  bins plus a bypass coded Exp-Golomb suffix. A bypass sign bin follows.
+ *******************************************************************************
+ */
+static void ih264e_cabac_enc_ctx_mvd(WORD16 u1_mvd, UWORD32 u4_ctx_idx_offset,
+                                     UWORD16 ui2_abs_mvd,
+                                     cabac_ctxt_t *ps_cabac_ctxt)
+{
+
+    UWORD8  u1_bin, u1_ctxt_inc;
+    WORD8 k = 3, u1_coff = 9;
+    WORD16 i2_abs_mvd, i2_sufs;
+    UWORD32 u4_ctx_inc;
+    UWORD32 u4_bins;
+    WORD8 i1_bins_len;
+
+    /* Binarization:
+     |mvd| <  u1_coff : truncated unary prefix only
+     |mvd| >= u1_coff : prefix of u1_coff 1 bins, then a bypass coded suffix
+
+     A bypass coded sign bin follows any nonzero mvd.
+
+     The prefix has Cmax = u1_coff (9); the suffix is Exp-Golomb starting at
+     order k = 3.
+     */
+
+    if (ui2_abs_mvd < 3)
+        u4_ctx_inc = 0; /* ctxIdxInc of bin 0 for a neighbour sum below 3 */
+    else if (ui2_abs_mvd > 32)
+        u4_ctx_inc = 2; /* neighbour sum above 32 */
+    else
+        u4_ctx_inc = 1; /* neighbour sum from 3 to 32 */
+
+    u4_bins = 0;
+    i1_bins_len = 1;
+
+    if (u1_mvd == 0)
+    {
+        ih264e_cabac_encode_bin(ps_cabac_ctxt,
+                                0,
+                                ps_cabac_ctxt->au1_cabac_ctxt_table + u4_ctx_idx_offset
+                                        + u4_ctx_inc);
+    }
+    else
+    {
+        i2_abs_mvd = ABS(u1_mvd);
+        if (i2_abs_mvd >= u1_coff)
+        {
+            /* Prefix TU i.e. a string of 9 1's with no closing 0 */
+            u4_bins = 0x1ff;
+            i1_bins_len = 9;
+            u4_ctx_inc = (u4_ctx_inc | 0x065430); /* values 3, 4, 5, 6 in nibbles 1 to 4; nibble 0 is kept */
+
+            ih264e_encode_decision_bins(u4_bins,
+                                        i1_bins_len,
+                                        u4_ctx_inc,
+                                        4,
+                                        ps_cabac_ctxt->au1_cabac_ctxt_table
+                                            + u4_ctx_idx_offset,
+                                        ps_cabac_ctxt);
+
+            /* Suffix holds the excess over u1_coff and uses EncodeBypass */
+            u4_bins = 0;
+            i1_bins_len = 0;
+            i2_sufs = i2_abs_mvd - u1_coff;
+            while (1)
+            {
+                if (i2_sufs >= (1 << k))
+                {
+                    u4_bins = (u4_bins | (1 << i1_bins_len)); /* one more 1 bin in the unary part */
+                    i1_bins_len++;
+                    i2_sufs = i2_sufs - (1 << k);
+                    k++;
+                }
+                else
+                {
+                    i1_bins_len++; /* the 0 bin that closes the unary part */
+                    while (k--)
+                    {
+                        u1_bin = ((i2_sufs >> k) & 0x01); /* k remainder bits, highest first */
+                        u4_bins = (u4_bins | (u1_bin << i1_bins_len));
+                        i1_bins_len++;
+                    }
+                    break;
+                }
+            }
+            ih264e_cabac_encode_bypass_bins(ps_cabac_ctxt, u4_bins,
+                                            i1_bins_len);
+        }
+        else
+        {
+            /* Prefix only */
+            /* b0 = 1, since the value is nonzero */
+            u4_bins = 1;
+            i2_abs_mvd--;
+            u1_ctxt_inc = 3; /* ctxtInc of bin 1; later bins use 4, 5 and then 6 */
+            while (i2_abs_mvd)
+            {
+                i2_abs_mvd--;
+                u4_bins = (u4_bins | (1 << i1_bins_len));
+                if (u1_ctxt_inc <= 6)
+                {
+                    u4_ctx_inc = (u4_ctx_inc
+                                    | (u1_ctxt_inc << (i1_bins_len << 2))); /* nibble of this bin */
+                    u1_ctxt_inc++;
+                }
+                i1_bins_len++;
+            }
+            /* The terminating 0 bin gets its own nibble only within the first five bins */
+            if (i1_bins_len <= 4)
+                u4_ctx_inc = (u4_ctx_inc | (u1_ctxt_inc << (i1_bins_len << 2)));
+            i1_bins_len++; /* count the terminating 0 bin; no bit is set for it */
+            ih264e_encode_decision_bins(u4_bins,
+                                        i1_bins_len,
+                                        u4_ctx_inc,
+                                        4,
+                                        ps_cabac_ctxt->au1_cabac_ctxt_table
+                                            + u4_ctx_idx_offset,
+                                        ps_cabac_ctxt);
+        }
+        /* sign bit, uses EncodeBypass: 0 for positive, 1 for negative */
+        if (u1_mvd > 0)
+            ih264e_cabac_encode_bypass_bins(ps_cabac_ctxt, 0, 1);
+        else
+            ih264e_cabac_encode_bypass_bins(ps_cabac_ctxt, 1, 1);
+    }
+}
+
+/**
+ *******************************************************************************
+ * @brief
+ *  Encodes the x and y motion vector components of a P16x16 MB
+ *
+ * @param[in] ps_cabac_ctxt
+ *  Pointer to cabac context structure; its current MB and left mv contexts are
+ *  read for context selection and then overwritten
+ * @param[in] pi2_mv_ptr
+ *  Pointer to two values: the x component followed by the y component
+ *
+ * @returns
+ *  None
+ * @remarks
+ *  The values stored back are the absolute components clipped to 0..127
+ *
+ *******************************************************************************
+ */
+static void ih264e_cabac_enc_mvds_p16x16(cabac_ctxt_t *ps_cabac_ctxt,
+                                         WORD16 *pi2_mv_ptr)
+{
+
+    /* Context values of the current MB info and of the left neighbour; */
+    /* entry 0 belongs to x and entry 1 to y                             */
+    UWORD8 *pu1_top_abs = ps_cabac_ctxt->ps_curr_ctxt_mb_info->u1_mv[0];
+    UWORD8 *pu1_left_abs = ps_cabac_ctxt->pu1_left_mv_ctxt_inc[0];
+
+    /* Sums of the two stored values, passed on for context selection */
+    UWORD16 u2_sum_x;
+    UWORD16 u2_sum_y;
+
+    /* Component being encoded and the clipped magnitudes to store afterwards */
+    WORD16 i2_mvd;
+    UWORD8 u1_clipped_x;
+    UWORD8 u1_clipped_y;
+
+    /* Sample both sums before anything is encoded or overwritten */
+    u2_sum_x = (UWORD16) (pu1_left_abs[0] + pu1_top_abs[0]);
+    u2_sum_y = (UWORD16) (pu1_left_abs[1] + pu1_top_abs[1]);
+
+    /* Horizontal component */
+    i2_mvd = pi2_mv_ptr[0];
+    ih264e_cabac_enc_ctx_mvd(i2_mvd, MVD_X, u2_sum_x,
+                             ps_cabac_ctxt);
+    /* Only the clipped magnitude is remembered */
+    u1_clipped_x = CLIP3(0, 127, ABS(i2_mvd));
+
+    /* Vertical component */
+    i2_mvd = pi2_mv_ptr[1];
+    ih264e_cabac_enc_ctx_mvd(i2_mvd, MVD_Y, u2_sum_y,
+                             ps_cabac_ctxt);
+    /* Only the clipped magnitude is remembered */
+    u1_clipped_y = CLIP3(0, 127, ABS(i2_mvd));
+
+    /***************************************************************/
+    /* Store abs_mvd_values cabac contexts                         */
+    /***************************************************************/
+    pu1_left_abs[0] = u1_clipped_x;
+    pu1_top_abs[0] = u1_clipped_x;
+    pu1_left_abs[1] = u1_clipped_y;
+    pu1_top_abs[1] = u1_clipped_y;
+}
+
+
+/**
+ *******************************************************************************
+ * @brief
+ * Encodes all motion vectors for a B MB (Assumes that mb type is B_L0_16x16, B_L1_16x16 or B_Bi_16x16)
+ *
+ * @param[in] ps_cabac_ctxt
+ *  Pointer to cabac context structure
+ *
+ * @param[in] pi2_mv_ptr
+ * Pointer to array of motion vectors; consumed in x, y order per coded pair
+ *
+ * @param[in] i4_mb_part_pred_mode
+ * Prediction mode of the MB; PRED_L1 skips the first pair, PRED_L0 the second
+ *
+ * @returns
+ *
+ * @remarks
+ *  Context entries of a pair that is not coded are reset to zero
+ *
+ *******************************************************************************
+ */
+static void ih264e_cabac_enc_mvds_b16x16(cabac_ctxt_t *ps_cabac_ctxt,
+                                         WORD16 *pi2_mv_ptr,
+                                         WORD32 i4_mb_part_pred_mode )
+{
+    /* Stored mvd magnitudes of the neighbours; entries 0,1 belong to the */
+    /* first x,y pair and entries 2,3 to the second one                  */
+    UWORD8 *pu1_top = ps_cabac_ctxt->ps_curr_ctxt_mb_info->u1_mv[0];
+    UWORD8 *pu1_left = ps_cabac_ctxt->pu1_left_mv_ctxt_inc[0];
+    /* Value to write back per component, and the component being coded */
+    UWORD8 u1_store_x;
+    UWORD8 u1_store_y;
+    WORD16 i2_mvd;
+
+    /***************************************************************/
+    /* First pair: coded unless the prediction mode is PRED_L1     */
+    /***************************************************************/
+    u1_store_x = 0;
+    u1_store_y = 0;
+    if (PRED_L1 != i4_mb_part_pred_mode)
+    {
+        /* Neighbour sums are sampled ahead of any coding call */
+        UWORD16 u2_nbr_sum_x = (UWORD16) (pu1_left[0] + pu1_top[0]);
+        UWORD16 u2_nbr_sum_y = (UWORD16) (pu1_left[1] + pu1_top[1]);
+
+        i2_mvd = *pi2_mv_ptr;
+        pi2_mv_ptr++;
+        ih264e_cabac_enc_ctx_mvd(i2_mvd, MVD_X, u2_nbr_sum_x,
+                                 ps_cabac_ctxt);
+        u1_store_x = CLIP3(0, 127, ABS(i2_mvd));
+
+        i2_mvd = *pi2_mv_ptr;
+        pi2_mv_ptr++;
+        ih264e_cabac_enc_ctx_mvd(i2_mvd, MVD_Y, u2_nbr_sum_y,
+                                 ps_cabac_ctxt);
+        u1_store_y = CLIP3(0, 127, ABS(i2_mvd));
+    }
+
+    /* Zero is what gets recorded when the pair was skipped */
+    pu1_left[0] = u1_store_x;
+    pu1_top[0] = u1_store_x;
+    pu1_left[1] = u1_store_y;
+    pu1_top[1] = u1_store_y;
+
+    /***************************************************************/
+    /* Second pair: coded unless the prediction mode is PRED_L0    */
+    /***************************************************************/
+    u1_store_x = 0;
+    u1_store_y = 0;
+    if (PRED_L0 != i4_mb_part_pred_mode)
+    {
+        /* Neighbour sums are sampled ahead of any coding call */
+        UWORD16 u2_nbr_sum_x = (UWORD16) (pu1_left[2] + pu1_top[2]);
+        UWORD16 u2_nbr_sum_y = (UWORD16) (pu1_left[3] + pu1_top[3]);
+
+        i2_mvd = *pi2_mv_ptr;
+        pi2_mv_ptr++;
+        ih264e_cabac_enc_ctx_mvd(i2_mvd, MVD_X, u2_nbr_sum_x,
+                                 ps_cabac_ctxt);
+        u1_store_x = CLIP3(0, 127, ABS(i2_mvd));
+
+        i2_mvd = *pi2_mv_ptr;
+        pi2_mv_ptr++;
+        ih264e_cabac_enc_ctx_mvd(i2_mvd, MVD_Y, u2_nbr_sum_y,
+                                 ps_cabac_ctxt);
+        u1_store_y = CLIP3(0, 127, ABS(i2_mvd));
+    }
+
+    /* Zero is what gets recorded when the pair was skipped */
+    pu1_left[2] = u1_store_x;
+    pu1_top[2] = u1_store_x;
+    pu1_left[3] = u1_store_y;
+    pu1_top[3] = u1_store_y;
+}
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function generates CABAC coded bit stream for an Intra Slice.
+ *
+ * @description
+ *  The mb syntax layer for intra slices constitutes luma mb mode, mb qp delta, coded block pattern, chroma mb mode and
+ *  luma/chroma residue. These syntax elements are written as directed by table
+ *  7.3.5 of h264 specification.
+ *
+ * @param[in] ps_ent_ctxt
+ *  pointer to entropy context; pv_mb_header_data is advanced past this MB
+ *
+ * @returns IH264E_BITSTREAM_BUFFER_OVERFLOW when the stream buffer is too full
+ *  to take another MB (nothing is written then), IH264E_SUCCESS otherwise
+ *
+ * @remarks header bits go to u4_header_bits[0], residue bits to u4_residue_bits[0]
+ *******************************************************************************
+ */
+IH264E_ERROR_T ih264e_write_islice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt)
+{
+    /* bit stream ptr */
+    bitstrm_t *ps_bitstream = ps_ent_ctxt->ps_bitstrm;
+    /* CABAC context */
+    cabac_ctxt_t *ps_cabac_ctxt = ps_ent_ctxt->ps_cabac;
+    /* packed header data */
+    UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data;
+    mb_info_ctxt_t *ps_curr_ctxt;
+    WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode;
+    WORD8 mb_qp_delta;
+    UWORD32 u4_cbp_l, u4_cbp_c;
+    WORD32 byte_count = 0; /* header bytes consumed by this MB */
+    WORD32 bitstream_start_offset, bitstream_end_offset;
+
+    if ((ps_bitstream->u4_strm_buf_offset + MIN_STREAM_SIZE_MB)
+                    >= ps_bitstream->u4_max_strm_size)
+    {
+        /* return without corrupting the buffer beyond its size */
+        return (IH264E_BITSTREAM_BUFFER_OVERFLOW);
+    }
+    /* mb header info: packed type/mode byte, cbp, qp delta (one byte each) */
+    mb_tpm = *pu1_byte++;
+    byte_count++;
+    cbp = *pu1_byte++;
+    byte_count++;
+    mb_qp_delta = *pu1_byte++;
+    byte_count++;
+    /* mb type is held in the low nibble of the packed byte */
+    mb_type = mb_tpm & 0xF;
+
+    ih264e_get_cabac_context(ps_ent_ctxt, mb_type);
+    ps_curr_ctxt = ps_cabac_ctxt->ps_curr_ctxt_mb_info;
+
+    /* Starting bitstream offset for header in bits */
+    bitstream_start_offset = GET_NUM_BITS(ps_bitstream);
+    u4_cbp_c = (cbp >> 4);
+    u4_cbp_l = (cbp & 0xF);
+    if (mb_type == I16x16)
+    {
+        /* bits 4-5 of the packed byte, chroma cbp and "all luma coded" form the value */
+        luma_intra_mode = ((mb_tpm >> 4) & 3) + 1 + (u4_cbp_c << 2)
+                        + (u4_cbp_l == 15) * 12;
+    }
+    else
+    {
+        luma_intra_mode = 0;
+    }
+
+    chroma_intra_mode = (mb_tpm >> 6); /* top two bits of the packed byte */
+
+    /* Encode Intra pred mode, Luma */
+    ih264e_cabac_enc_intra_mb_type(ISLICE, luma_intra_mode, ps_cabac_ctxt,
+                                   MB_TYPE_I_SLICE);
+
+    if (mb_type == I4x4)
+    {   /* Encode 4x4 MB modes; they take 8 further header bytes */
+        ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, pu1_byte);
+        byte_count += 8;
+    }
+    /* Encode chroma mode */
+    ih264e_cabac_enc_chroma_predmode(chroma_intra_mode, ps_cabac_ctxt);
+
+    if (mb_type != I16x16)
+    { /* Encode MB cbp (for I16x16 it is already part of the mb type) */
+        ih264e_cabac_enc_cbp(cbp, ps_cabac_ctxt);
+    }
+
+    if ((cbp > 0) || (mb_type == I16x16))
+    {
+        /* Encode mb_qp_delta */
+        ih264e_cabac_enc_mb_qp_delta(mb_qp_delta, ps_cabac_ctxt);
+        /* Ending bitstream offset for header in bits */
+        bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+        ps_ent_ctxt->u4_header_bits[0] += bitstream_end_offset
+                        - bitstream_start_offset;
+        /* Starting bitstream offset for residue */
+        bitstream_start_offset = bitstream_end_offset;
+        if (mb_type == I16x16)
+        {
+            ps_curr_ctxt->u1_mb_type = CAB_I16x16;
+            ps_curr_ctxt->u1_cbp = cbp;
+            ih264e_cabac_encode_residue_luma_dc(ps_ent_ctxt);
+            ih264e_cabac_encode_residue(ps_ent_ctxt, cbp, LUMA_AC_CTXCAT);
+        }
+        else
+        {
+            ps_curr_ctxt->u1_cbp = cbp;
+            /* the context entry takes the CAB_* mb type, not the header's I4x4 */
+            ps_curr_ctxt->u1_mb_type = CAB_I4x4;
+            ih264e_cabac_encode_residue(ps_ent_ctxt, cbp, LUMA_4X4_CTXCAT);
+            ps_cabac_ctxt->pu1_left_yuv_dc_csbp[0] &= 0x6; /* clear bit 0 */
+            ps_cabac_ctxt->ps_curr_ctxt_mb_info->u1_yuv_dc_csbp &= 0x6;
+        }
+        /* Ending bitstream offset for residue in bits */
+        bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+        ps_ent_ctxt->u4_residue_bits[0] += bitstream_end_offset
+                        - bitstream_start_offset;
+    }
+    else
+    {
+        ps_curr_ctxt->u1_yuv_ac_csbp = 0;
+        ps_curr_ctxt->u1_yuv_dc_csbp = 0;
+        *(ps_cabac_ctxt->pu1_left_uv_ac_csbp) = 0;
+        *(ps_cabac_ctxt->pu1_left_y_ac_csbp) = 0;
+        *(ps_cabac_ctxt->pu1_left_yuv_dc_csbp) = 0;
+        /* Ending bitstream offset for header in bits */
+        bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+        ps_ent_ctxt->u4_header_bits[0] += bitstream_end_offset
+                        - bitstream_start_offset;
+
+        /* No residue is coded on this path, so u4_residue_bits is untouched */
+    }
+    memset(ps_curr_ctxt->u1_mv, 0, 16); /* an intra MB leaves zeroed mvd contexts */
+    memset(ps_cabac_ctxt->pu1_left_mv_ctxt_inc, 0, 16);
+    ps_cabac_ctxt->ps_curr_ctxt_mb_info->u1_cbp = cbp;
+    ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count;
+    if (mb_type == I16x16)
+    {
+        ps_curr_ctxt->u1_mb_type = CAB_I16x16;
+
+    }
+    else
+    {
+        ps_curr_ctxt->u1_mb_type = CAB_I4x4;
+
+    }
+    return IH264E_SUCCESS;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function generates CABAC coded bit stream for Inter slices
+ *
+ * @description
+ *  The mb syntax layer for inter slices constitutes luma mb mode, mb qp delta, coded block pattern, chroma mb mode and
+ *  luma/chroma residue. These syntax elements are written as directed by table
+ *  7.3.5 of h264 specification
+ *
+ * @param[in] ps_ent_ctxt
+ *  pointer to entropy context; pv_mb_header_data is advanced past this MB
+ *
+ * @returns IH264E_BITSTREAM_BUFFER_OVERFLOW when the stream buffer is too full
+ *  to take another MB (nothing is written then), IH264E_SUCCESS otherwise
+ *
+ * @remarks intra MBs are counted in index 0 of the bit counters, inter MBs in index 1
+ *******************************************************************************
+ */
+IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt)
+{
+    /* bit stream ptr */
+    bitstrm_t *ps_bitstream = ps_ent_ctxt->ps_bitstrm;
+    /* CABAC context */
+    cabac_ctxt_t *ps_cabac_ctxt = ps_ent_ctxt->ps_cabac;
+
+    mb_info_ctxt_t *ps_curr_ctxt;
+
+    WORD32 bitstream_start_offset, bitstream_end_offset;
+    WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode;
+    WORD8 mb_qp_delta;
+    UWORD32 u4_cbp_l, u4_cbp_c;
+    WORD32 byte_count = 0; /* header bytes consumed by this MB */
+    UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data;
+
+    if ((ps_bitstream->u4_strm_buf_offset + MIN_STREAM_SIZE_MB)
+                    >= ps_bitstream->u4_max_strm_size)
+    {
+        /* return without corrupting the buffer beyond its size */
+        return (IH264E_BITSTREAM_BUFFER_OVERFLOW);
+    }
+    /* mb header info: the first byte packs the mb type and the intra modes */
+    mb_tpm = *pu1_byte++;
+    byte_count++;
+
+    /* mb type is held in the low nibble of the packed byte */
+    mb_type = mb_tpm & 0xF;
+    /* CABAC contexts for the MB */
+    ih264e_get_cabac_context(ps_ent_ctxt, mb_type);
+    ps_curr_ctxt = ps_cabac_ctxt->ps_curr_ctxt_mb_info;
+
+    /* if Intra MB */
+    if (mb_type == I16x16 || mb_type == I4x4)
+    {
+        cbp = *pu1_byte++;
+        byte_count++;
+        mb_qp_delta = *pu1_byte++;
+        byte_count++;
+
+        /* Starting bitstream offset for header in bits */
+        bitstream_start_offset = GET_NUM_BITS(ps_bitstream);
+
+        /* Encode mb_skip_flag */
+        ih264e_cabac_enc_mb_skip(0, ps_cabac_ctxt, MB_SKIP_FLAG_P_SLICE);
+        u4_cbp_c = (cbp >> 4);
+        u4_cbp_l = (cbp & 0xF);
+        if (mb_type == I16x16)
+        {
+            luma_intra_mode = ((mb_tpm >> 4) & 3) + 1 + (u4_cbp_c << 2)
+                            + (u4_cbp_l == 15) * 12;
+        }
+        else
+        {
+            luma_intra_mode = 0;
+        }
+        /* Encode intra mb type: a single 1 bin first, then the intra type */
+        {
+            ih264e_cabac_encode_bin(ps_cabac_ctxt,
+                                    1,
+                                    ps_cabac_ctxt->au1_cabac_ctxt_table
+                                        + MB_TYPE_P_SLICE);
+
+            ih264e_cabac_enc_intra_mb_type(PSLICE, (UWORD8) luma_intra_mode,
+                                           ps_cabac_ctxt, MB_TYPE_P_SLICE);
+        }
+
+        if (mb_type == I4x4)
+        {   /* Intra 4x4 modes; they take 8 further header bytes */
+            ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, pu1_byte);
+            byte_count += 8;
+        }
+        chroma_intra_mode = (mb_tpm >> 6); /* top two bits of the packed byte */
+
+        ih264e_cabac_enc_chroma_predmode(chroma_intra_mode, ps_cabac_ctxt);
+
+        if (mb_type != I16x16)
+        {
+            /* encode CBP (for I16x16 it is already part of the mb type) */
+            ih264e_cabac_enc_cbp(cbp, ps_cabac_ctxt);
+        }
+
+        if ((cbp > 0) || (mb_type == I16x16))
+        {
+            ih264e_cabac_enc_mb_qp_delta(mb_qp_delta, ps_cabac_ctxt);
+
+            /* Ending bitstream offset for header in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_header_bits[0] += bitstream_end_offset
+                            - bitstream_start_offset;
+            /* Starting bitstream offset for residue */
+            bitstream_start_offset = bitstream_end_offset;
+
+            /* Encoding Residue */
+            if (mb_type == I16x16)
+            {
+                ps_curr_ctxt->u1_mb_type = CAB_I16x16;
+                ps_curr_ctxt->u1_cbp = (UWORD8) cbp;
+                ih264e_cabac_encode_residue_luma_dc(ps_ent_ctxt);
+                ih264e_cabac_encode_residue(ps_ent_ctxt, cbp, LUMA_AC_CTXCAT);
+            }
+            else
+            {
+                ps_curr_ctxt->u1_cbp = (UWORD8) cbp;
+                /* the context entry takes the CAB_* mb type, not I4x4 */
+                ps_curr_ctxt->u1_mb_type = CAB_I4x4;
+                ih264e_cabac_encode_residue(ps_ent_ctxt, cbp, LUMA_4X4_CTXCAT);
+                ps_cabac_ctxt->pu1_left_yuv_dc_csbp[0] &= 0x6; /* clear bit 0 */
+                ps_cabac_ctxt->ps_curr_ctxt_mb_info->u1_yuv_dc_csbp &= 0x6;
+            }
+
+            /* Ending bitstream offset for residue in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_residue_bits[0] += bitstream_end_offset
+                            - bitstream_start_offset;
+        }
+        else
+        {
+            ps_curr_ctxt->u1_yuv_ac_csbp = 0;
+            ps_curr_ctxt->u1_yuv_dc_csbp = 0;
+            *(ps_cabac_ctxt->pu1_left_uv_ac_csbp) = 0;
+            *(ps_cabac_ctxt->pu1_left_y_ac_csbp) = 0;
+            *(ps_cabac_ctxt->pu1_left_yuv_dc_csbp) = 0;
+            /* Ending bitstream offset for header in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_header_bits[0] += bitstream_end_offset
+                            - bitstream_start_offset;
+        }
+
+        memset(ps_curr_ctxt->u1_mv, 0, 16); /* an intra MB leaves zeroed mvd contexts */
+        memset(ps_cabac_ctxt->pu1_left_mv_ctxt_inc, 0, 16);
+        ps_cabac_ctxt->ps_curr_ctxt_mb_info->u1_cbp = (UWORD8) cbp;
+
+        if (mb_type == I16x16)
+        {
+            ps_curr_ctxt->u1_mb_type = CAB_I16x16;
+        }
+        else
+        {
+            ps_curr_ctxt->u1_mb_type = CAB_I4x4;
+        }
+
+        ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count;
+
+        return IH264E_SUCCESS;
+    }
+    else /* Inter MB */
+    {
+        /* Starting bitstream offset for header in bits */
+        bitstream_start_offset = GET_NUM_BITS(ps_bitstream);
+        /* Encoding P16x16 */
+        if (mb_type != PSKIP)
+        {
+            cbp = *pu1_byte++;
+            byte_count++;
+            mb_qp_delta = *pu1_byte++;
+            byte_count++;
+
+            /* Encoding mb_skip */
+            ih264e_cabac_enc_mb_skip(0, ps_cabac_ctxt, MB_SKIP_FLAG_P_SLICE);
+
+            /* Encoding mb_type as P16x16: three 0 bins, ctxInc 0, 1, 2 (a nibble each) */
+            {
+                UWORD32 u4_ctx_inc_p;
+                u4_ctx_inc_p = (0x010 + ((2) << 8));
+
+                ih264e_encode_decision_bins(0, 3, u4_ctx_inc_p, 3,
+                                            &(ps_cabac_ctxt->au1_cabac_ctxt_table[MB_TYPE_P_SLICE]),
+                                            ps_cabac_ctxt);
+            }
+            ps_curr_ctxt->u1_mb_type = CAB_P;
+            {
+                WORD16 ai2_mv[2];
+                /* The mv bytes sit at an arbitrary byte offset: copy, never cast */
+                memcpy(ai2_mv, pu1_byte, sizeof(ai2_mv));
+                byte_count += 4;
+                ps_curr_ctxt->u1_mb_type |= CAB_NON_BD16x16;
+                ih264e_cabac_enc_mvds_p16x16(ps_cabac_ctxt, ai2_mv);
+            }
+            /* Encode CBP */
+            ih264e_cabac_enc_cbp(cbp, ps_cabac_ctxt);
+
+            if (cbp)
+            {
+                /* encode mb_qp_delta */
+                ih264e_cabac_enc_mb_qp_delta(mb_qp_delta, ps_cabac_ctxt);
+            }
+
+            /* Ending bitstream offset for header in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_header_bits[1] += bitstream_end_offset
+                            - bitstream_start_offset;
+            /* Starting bitstream offset for residue */
+            bitstream_start_offset = bitstream_end_offset;
+
+        }
+        else/* MB = PSKIP */
+        {
+            ih264e_cabac_enc_mb_skip(1, ps_cabac_ctxt, MB_SKIP_FLAG_P_SLICE);
+
+            ps_curr_ctxt->u1_mb_type = CAB_P_SKIP;
+            (*ps_ent_ctxt->pi4_mb_skip_run)++;
+
+            memset(ps_curr_ctxt->u1_mv, 0, 16);
+            memset(ps_cabac_ctxt->pu1_left_mv_ctxt_inc, 0, 16);
+            cbp = 0;
+
+            /* Ending bitstream offset for header in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_header_bits[1] += bitstream_end_offset
+                            - bitstream_start_offset;
+            /* No residue follows a skipped MB: cbp was forced to 0 above */
+
+        }
+
+        if (cbp > 0)
+        {
+            /* Encode residue */
+            ih264e_cabac_encode_residue(ps_ent_ctxt, cbp, LUMA_4X4_CTXCAT);
+            /* Ending bitstream offset for residue in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_residue_bits[1] += bitstream_end_offset
+                            - bitstream_start_offset;
+
+            ps_cabac_ctxt->pu1_left_yuv_dc_csbp[0] &= 0x6; /* clear bit 0 */
+            ps_curr_ctxt->u1_yuv_dc_csbp &= 0x6;
+        }
+        else
+        {
+            ps_curr_ctxt->u1_yuv_ac_csbp = 0;
+            ps_curr_ctxt->u1_yuv_dc_csbp = 0;
+            *(ps_cabac_ctxt->pu1_left_uv_ac_csbp) = 0;
+            *(ps_cabac_ctxt->pu1_left_y_ac_csbp) = 0;
+            *(ps_cabac_ctxt->pu1_left_yuv_dc_csbp) = 0;
+        }
+        ps_curr_ctxt->u1_intrapred_chroma_mode = 0;
+        ps_curr_ctxt->u1_cbp = cbp;
+        ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count;
+        return IH264E_SUCCESS;
+    }
+}
+
+
+/* ! < Table 9-37 - Binarization for macroblock types in B slices  in ITU_T_H264-201402
+ * Bits 0-7 : binarised value
+ * Bits 8-15: length of binary sequence */
+/* Bin 0 appears to sit in bit 0, judging by the intra prefix "111101" coded as 0x2f. */
+/* 23 of the 27 entries are initialised below; the remaining ones default to zero.    */
+static const UWORD32 u4_b_mb_type[27] = { 0x0100, 0x0301, 0x0305, 0x0603,
+                                          0x0623, 0x0613, 0x0633, 0x060b,
+                                          0x062b, 0x061b, 0x063b, 0x061f,
+                                          0x0707, 0x0747, 0x0727, 0x0767,
+                                          0x0717, 0x0757, 0x0737, 0x0777,
+                                          0x070f, 0x074f, 0x063f };
+/* CtxInc for mb types in B slices: one nibble per bin, lowest nibble first; it is 0 here and the user ORs in a neighbour-derived value */
+static const UWORD32 ui_b_mb_type_ctx_inc[27] = { 0x00, 0x0530, 0x0530,
+                                                  0x0555430, 0x0555430,
+                                                  0x0555430, 0x0555430,
+                                                  0x0555430, 0x0555430,
+                                                  0x0555430, 0x0555430,
+                                                  0x0555430, 0x05555430,
+                                                  0x05555430, 0x05555430,
+                                                  0x05555430, 0x05555430,
+                                                  0x05555430, 0x05555430,
+                                                  0x05555430, 0x05555430,
+                                                  0x05555430, 0x0555430 };
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function generates CABAC coded bit stream for B slices
+ *
+ * @description
+ *  The mb syntax layer for inter slices constitutes luma mb mode,
+ *  mb qp delta, coded block pattern, chroma mb mode and
+ *  luma/chroma residue. These syntax elements are written as directed by table
+ *  7.3.5 of h264 specification
+ *
+ * @param[in] ps_ent_ctxt
+ *  pointer to entropy context
+ *
+ * @returns error code
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt)
+{
+    /* bit stream ptr */
+    bitstrm_t *ps_bitstream = ps_ent_ctxt->ps_bitstrm;
+    /* CABAC context */
+    cabac_ctxt_t *ps_cabac_ctxt = ps_ent_ctxt->ps_cabac;
+
+    mb_info_ctxt_t *ps_curr_ctxt;
+
+    WORD32 bitstream_start_offset, bitstream_end_offset;
+    WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode;
+    WORD8 mb_qp_delta;
+    UWORD32 u4_cbp_l, u4_cbp_c;
+    WORD32 byte_count = 0;
+    UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data;
+
+    if ((ps_bitstream->u4_strm_buf_offset + MIN_STREAM_SIZE_MB)
+                    >= ps_bitstream->u4_max_strm_size)
+    {
+        /* return without corrupting the buffer beyond its size */
+        return (IH264E_BITSTREAM_BUFFER_OVERFLOW);
+    }
+    /* mb header info */
+    mb_tpm = *pu1_byte++;
+    byte_count++;
+
+    /* mb type */
+    mb_type = mb_tpm & 0xF;
+    /* CABAC contexts for the MB */
+    ih264e_get_cabac_context(ps_ent_ctxt, mb_type);
+    ps_curr_ctxt = ps_cabac_ctxt->ps_curr_ctxt_mb_info;
+
+    /* if Intra MB */
+    if (mb_type == I16x16 || mb_type == I4x4)
+    {
+        cbp = *pu1_byte++;
+        byte_count++;
+        mb_qp_delta = *pu1_byte++;
+        byte_count++;
+
+        /* Starting bitstream offset for header in bits */
+        bitstream_start_offset = GET_NUM_BITS(ps_bitstream);
+
+        /* Encode mb_skip_flag */
+        ih264e_cabac_enc_mb_skip(0, ps_cabac_ctxt, MB_SKIP_FLAG_B_SLICE);
+        u4_cbp_c = (cbp >> 4);
+        u4_cbp_l = (cbp & 0xF);
+        if (mb_type == I16x16)
+        {
+            luma_intra_mode = ((mb_tpm >> 4) & 3) + 1 + (u4_cbp_c << 2)
+                            + (u4_cbp_l == 15) * 12;
+        }
+        else
+        {
+            luma_intra_mode = 0;
+        }
+        /* Encode intra mb type */
+        {
+            mb_info_ctxt_t *ps_left_ctxt = ps_cabac_ctxt->ps_left_ctxt_mb_info;
+            mb_info_ctxt_t *ps_top_ctxt = ps_cabac_ctxt->ps_top_ctxt_mb_info;
+            UWORD32 u4_ctx_inc = 0;
+
+            if (ps_left_ctxt != ps_cabac_ctxt->ps_def_ctxt_mb_info)
+                u4_ctx_inc += ((ps_left_ctxt->u1_mb_type & CAB_BD16x16_MASK)
+                                != CAB_BD16x16) ? 1 : 0;
+            if (ps_top_ctxt != ps_cabac_ctxt->ps_def_ctxt_mb_info)
+                u4_ctx_inc += ((ps_top_ctxt->u1_mb_type & CAB_BD16x16_MASK)
+                                != CAB_BD16x16) ? 1 : 0;
+
+            /* Intra Prefix Only "111101" */
+            u4_ctx_inc = (u4_ctx_inc | 0x05555430);
+            ih264e_encode_decision_bins(0x2f,
+                                        6,
+                                        u4_ctx_inc,
+                                        3,
+                                        ps_cabac_ctxt->au1_cabac_ctxt_table
+                                            + MB_TYPE_B_SLICE,
+                                        ps_cabac_ctxt);
+
+            ih264e_cabac_enc_intra_mb_type(BSLICE, (UWORD8) luma_intra_mode,
+                                           ps_cabac_ctxt, MB_TYPE_B_SLICE);
+
+        }
+
+        if (mb_type == I4x4)
+        { /* Intra 4x4 modes */
+            ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, pu1_byte);
+            byte_count += 8;
+        }
+        chroma_intra_mode = (mb_tpm >> 6);
+
+        ih264e_cabac_enc_chroma_predmode(chroma_intra_mode, ps_cabac_ctxt);
+
+        if (mb_type != I16x16)
+        {
+            /* encode CBP */
+            ih264e_cabac_enc_cbp(cbp, ps_cabac_ctxt);
+        }
+
+        if ((cbp > 0) || (mb_type == I16x16))
+        {
+            ih264e_cabac_enc_mb_qp_delta(mb_qp_delta, ps_cabac_ctxt);
+
+            /* Ending bitstream offset for header in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_header_bits[0] += bitstream_end_offset
+                            - bitstream_start_offset;
+            /* Starting bitstream offset for residue */
+            bitstream_start_offset = bitstream_end_offset;
+
+            /* Encoding Residue */
+            if (mb_type == I16x16)
+            {
+                ps_curr_ctxt->u1_mb_type = CAB_I16x16;
+                ps_curr_ctxt->u1_cbp = (UWORD8) cbp;
+                ih264e_cabac_encode_residue_luma_dc(ps_ent_ctxt);
+                ih264e_cabac_encode_residue(ps_ent_ctxt, cbp, LUMA_AC_CTXCAT);
+            }
+            else
+            {
+                ps_curr_ctxt->u1_cbp = (UWORD8) cbp;
+                ps_curr_ctxt->u1_mb_type = I4x4;
+                ps_curr_ctxt->u1_mb_type = CAB_I4x4;
+                ih264e_cabac_encode_residue(ps_ent_ctxt, cbp, LUMA_4X4_CTXCAT);
+                ps_cabac_ctxt->pu1_left_yuv_dc_csbp[0] &= 0x6;
+                ps_cabac_ctxt->ps_curr_ctxt_mb_info->u1_yuv_dc_csbp &= 0x6;
+            }
+
+            /* Ending bitstream offset for reside in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_residue_bits[0] += bitstream_end_offset
+                            - bitstream_start_offset;
+        }
+        else
+        {
+            ps_curr_ctxt->u1_yuv_ac_csbp = 0;
+            ps_curr_ctxt->u1_yuv_dc_csbp = 0;
+            *(ps_cabac_ctxt->pu1_left_uv_ac_csbp) = 0;
+            *(ps_cabac_ctxt->pu1_left_y_ac_csbp) = 0;
+            *(ps_cabac_ctxt->pu1_left_yuv_dc_csbp) = 0;
+            /* Ending bitstream offset for header in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_header_bits[0] += bitstream_end_offset
+                            - bitstream_start_offset;
+        }
+
+        memset(ps_curr_ctxt->u1_mv, 0, 16);
+        memset(ps_cabac_ctxt->pu1_left_mv_ctxt_inc, 0, 16);
+        ps_cabac_ctxt->ps_curr_ctxt_mb_info->u1_cbp = (UWORD8) cbp;
+
+        if (mb_type == I16x16)
+        {
+            ps_curr_ctxt->u1_mb_type = CAB_I16x16;
+        }
+        else
+        {
+            ps_curr_ctxt->u1_mb_type = CAB_I4x4;
+        }
+
+        ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count;
+
+        return IH264E_SUCCESS;
+    }
+
+    else /* Inter MB */
+    {
+        /* Starting bitstream offset for header in bits */
+        bitstream_start_offset = GET_NUM_BITS(ps_bitstream);
+        /* Encoding B_Direct_16x16 */
+        if (mb_type == BDIRECT)
+        {
+            cbp = *pu1_byte++;
+            byte_count++;
+            mb_qp_delta = *pu1_byte++;
+            byte_count++;
+
+            /* Encoding mb_skip */
+            ih264e_cabac_enc_mb_skip(0, ps_cabac_ctxt, MB_SKIP_FLAG_B_SLICE);
+
+            /* Encoding mb_type as B_Direct_16x16 */
+            {
+
+                mb_info_ctxt_t *ps_left_ctxt =
+                                ps_cabac_ctxt->ps_left_ctxt_mb_info;
+                mb_info_ctxt_t *ps_top_ctxt = ps_cabac_ctxt->ps_top_ctxt_mb_info;
+                UWORD32 u4_ctx_inc = 0;
+
+                if (ps_left_ctxt != ps_cabac_ctxt->ps_def_ctxt_mb_info)
+                    u4_ctx_inc += ((ps_left_ctxt->u1_mb_type & CAB_BD16x16_MASK)
+                                    != CAB_BD16x16) ? 1 : 0;
+                if (ps_top_ctxt != ps_cabac_ctxt->ps_def_ctxt_mb_info)
+                    u4_ctx_inc += ((ps_top_ctxt->u1_mb_type & CAB_BD16x16_MASK)
+                                    != CAB_BD16x16) ? 1 : 0;
+                /* Encode the bin */
+                ih264e_cabac_encode_bin(
+                                ps_cabac_ctxt,
+                                0,
+                                ps_cabac_ctxt->au1_cabac_ctxt_table
+                                                + MB_TYPE_B_SLICE + u4_ctx_inc);
+
+            }
+            ps_curr_ctxt->u1_mb_type = CAB_BD16x16;
+            memset(ps_curr_ctxt->u1_mv, 0, 16);
+            memset(ps_cabac_ctxt->pu1_left_mv_ctxt_inc, 0, 16);
+
+            /* Encode CBP */
+            ih264e_cabac_enc_cbp(cbp, ps_cabac_ctxt);
+
+            if (cbp)
+            {
+                /* encode mb_qp_delta */
+                ih264e_cabac_enc_mb_qp_delta(mb_qp_delta, ps_cabac_ctxt);
+            }
+
+            /* Ending bitstream offset for header in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_header_bits[1] += bitstream_end_offset
+                            - bitstream_start_offset;
+            /* Starting bitstream offset for residue */
+            bitstream_start_offset = bitstream_end_offset;
+            /* Starting bitstream offset for residue */
+
+        }
+
+        else if (mb_type == BSKIP)/* MB = BSKIP */
+        {
+            ih264e_cabac_enc_mb_skip(1, ps_cabac_ctxt, MB_SKIP_FLAG_B_SLICE);
+
+            ps_curr_ctxt->u1_mb_type = CAB_B_SKIP;
+
+            memset(ps_curr_ctxt->u1_mv, 0, 16);
+            memset(ps_cabac_ctxt->pu1_left_mv_ctxt_inc, 0, 16);
+            cbp = 0;
+
+            /* Ending bitstream offset for header in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_header_bits[1] += bitstream_end_offset
+                            - bitstream_start_offset;
+            /* Starting bitstream offset for residue */
+
+        }
+
+        else /* mbype is B_L0_16x16, B_L1_16x16 or B_Bi_16x16 */
+        {
+            WORD32 i4_mb_part_pred_mode = (mb_tpm >> 4);
+            UWORD32 u4_mb_type = mb_type - B16x16 + B_L0_16x16
+                            + i4_mb_part_pred_mode;
+            cbp = *pu1_byte++;
+            byte_count++;
+            mb_qp_delta = *pu1_byte++;
+            byte_count++;
+
+            /* Encoding mb_skip */
+            ih264e_cabac_enc_mb_skip(0, ps_cabac_ctxt, MB_SKIP_FLAG_B_SLICE);
+
+            /* Encoding mb_type as B16x16 */
+            {
+                mb_info_ctxt_t *ps_left_ctxt =
+                                ps_cabac_ctxt->ps_left_ctxt_mb_info;
+                mb_info_ctxt_t *ps_top_ctxt = ps_cabac_ctxt->ps_top_ctxt_mb_info;
+                UWORD32 u4_ctx_inc = 0;
+
+                UWORD32 u4_mb_type_bins = u4_b_mb_type[u4_mb_type];
+                UWORD32 u4_bin_len = (u4_mb_type_bins >> 8) & 0x0F;
+                u4_mb_type_bins = u4_mb_type_bins & 0xFF;
+
+                if (ps_left_ctxt != ps_cabac_ctxt->ps_def_ctxt_mb_info)
+                    u4_ctx_inc += ((ps_left_ctxt->u1_mb_type & CAB_BD16x16_MASK)
+                                    != CAB_BD16x16) ? 1 : 0;
+                if (ps_top_ctxt != ps_cabac_ctxt->ps_def_ctxt_mb_info)
+                    u4_ctx_inc += ((ps_top_ctxt->u1_mb_type & CAB_BD16x16_MASK)
+                                    != CAB_BD16x16) ? 1 : 0;
+
+                u4_ctx_inc = u4_ctx_inc | ui_b_mb_type_ctx_inc[u4_mb_type];
+
+                ih264e_encode_decision_bins(u4_mb_type_bins,
+                                            u4_bin_len,
+                                            u4_ctx_inc,
+                                            u4_bin_len,
+                                            &(ps_cabac_ctxt->au1_cabac_ctxt_table[MB_TYPE_B_SLICE]),
+                                            ps_cabac_ctxt);
+            }
+
+            ps_curr_ctxt->u1_mb_type = CAB_NON_BD16x16;
+            {
+                WORD16 *pi2_mv_ptr = (WORD16 *) pu1_byte;
+                /* Get the pred modes */
+
+                byte_count += 4 * (1 + (i4_mb_part_pred_mode == PRED_BI));
+
+                ps_curr_ctxt->u1_mb_type = (ps_curr_ctxt->u1_mb_type
+                                | CAB_NON_BD16x16);
+                /* Encoding motion vector for B16x16 */
+                ih264e_cabac_enc_mvds_b16x16(ps_cabac_ctxt, pi2_mv_ptr,
+                                             i4_mb_part_pred_mode);
+            }
+            /* Encode CBP */
+            ih264e_cabac_enc_cbp(cbp, ps_cabac_ctxt);
+
+            if (cbp)
+            {
+                /* encode mb_qp_delta */
+                ih264e_cabac_enc_mb_qp_delta(mb_qp_delta, ps_cabac_ctxt);
+            }
+
+            /* Ending bitstream offset for header in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_header_bits[1] += bitstream_end_offset
+                            - bitstream_start_offset;
+            /* Starting bitstream offset for residue */
+            bitstream_start_offset = bitstream_end_offset;
+        }
+
+        if (cbp > 0)
+        {
+            /* Encode residue */
+            ih264e_cabac_encode_residue(ps_ent_ctxt, cbp, LUMA_4X4_CTXCAT);
+            /* Ending bitstream offset for reside in bits */
+            bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+            ps_ent_ctxt->u4_residue_bits[1] += bitstream_end_offset
+                            - bitstream_start_offset;
+
+            ps_cabac_ctxt->pu1_left_yuv_dc_csbp[0] &= 0x6;
+            ps_curr_ctxt->u1_yuv_dc_csbp &= 0x6;
+        }
+        else
+        {
+            ps_curr_ctxt->u1_yuv_ac_csbp = 0;
+            ps_curr_ctxt->u1_yuv_dc_csbp = 0;
+            *(ps_cabac_ctxt->pu1_left_uv_ac_csbp) = 0;
+            *(ps_cabac_ctxt->pu1_left_y_ac_csbp) = 0;
+            *(ps_cabac_ctxt->pu1_left_yuv_dc_csbp) = 0;
+        }
+        ps_curr_ctxt->u1_intrapred_chroma_mode = 0;
+        ps_curr_ctxt->u1_cbp = cbp;
+        ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count;
+        return IH264E_SUCCESS;
+    }
+}
diff --git a/encoder/ih264e_cabac_init.c b/encoder/ih264e_cabac_init.c
new file mode 100644
index 0000000..347842c
--- /dev/null
+++ b/encoder/ih264e_cabac_init.c
@@ -0,0 +1,226 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+*******************************************************************************
+* @file
+* ih264e_cabac_init.c
+*
+* @brief
+*  Contains all initialization functions for cabac contexts
+*
+* @author
+*  Doney Alex
+*
+* @par List of Functions:
+*
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "iv2.h"
+#include "ive2.h"
+#include "ih264_defs.h"
+#include "ih264_debug.h"
+#include "ime_distortion_metrics.h"
+#include "ime_defs.h"
+#include "ime_structs.h"
+#include "ih264_error.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include "ih264_inter_pred_filters.h"
+#include "ih264_mem_fns.h"
+#include "ih264_padding.h"
+#include "ih264_intra_pred_filters.h"
+#include "ih264_deblk_edge_filters.h"
+#include "ih264_platform_macros.h"
+#include "ih264_macros.h"
+#include "ih264_buf_mgr.h"
+#include "ih264e_error.h"
+#include "ih264e_bitstream.h"
+#include "ih264_common_tables.h"
+#include "ih264_cabac_tables.h"
+#include "ih264_list.h"
+#include "ih264e_defs.h"
+#include "irc_cntrl_param.h"
+#include "irc_frame_info_collector.h"
+#include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
+#include "ih264e_structs.h"
+#include "ih264e_cabac.h"
+#include "ih264e_process.h"
+#include "ithread.h"
+#include "ih264e_intra_modes_eval.h"
+#include "ih264e_encode_header.h"
+#include "ih264e_globals.h"
+#include "ih264e_config.h"
+#include "ih264e_trace.h"
+#include "ih264e_statistics.h"
+#include "ih264_cavlc_tables.h"
+#include "ih264e_deblk.h"
+#include "ih264e_me.h"
+#include "ih264e_debug.h"
+#include "ih264e_master.h"
+#include "ih264e_utils.h"
+#include "irc_mem_req_and_acq.h"
+#include "irc_rate_control_api.h"
+#include "ih264e_platform_macros.h"
+#include "ime_statistics.h"
+
+
+
+/*****************************************************************************/
+/*  Function definitions .                                                   */
+/*****************************************************************************/
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Reset the CABAC encoding environment to its initial state
+ *
+ * @param[out] ps_cab_enc_env
+ *  Encoding environment to reset; all four fields are overwritten
+ *
+ * @returns
+ *  None
+ *
+ * @remarks
+ *  Interval range starts at 0x1fe; low and both counters start at 0
+ *******************************************************************************
+*/
+static void ih264e_init_cabac_enc_envirnoment(encoding_envirnoment_t *ps_cab_enc_env)
+{
+    ps_cab_enc_env->u4_bits_gen = 0;            /* bits generated counter    */
+    ps_cab_enc_env->u4_out_standing_bytes = 0;  /* outstanding bytes counter */
+    ps_cab_enc_env->u4_code_int_range = 0x1fe;  /* interval range R          */
+    ps_cab_enc_env->u4_code_int_low = 0;        /* interval start L          */
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Initialize default context values and pointers (Called once at the beginning of encoding).
+ *
+ * @param[in] ps_ent_ctxt
+ *  Pointer to entropy context structure
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+*/
+void ih264e_init_cabac_table(entropy_ctxt_t *ps_ent_ctxt)
+{
+    /* CABAC context */
+    cabac_ctxt_t *ps_cabac_ctxt = ps_ent_ctxt->ps_cabac;
+
+    /* Entry 0 of the base array is reserved for the default context, so the */
+    /* working MB map starts one element past the base                       */
+    ps_cabac_ctxt->ps_mb_map_ctxt_inc = ps_cabac_ctxt->ps_mb_map_ctxt_inc_base + 1;
+    ps_cabac_ctxt->ps_lft_csbp = &ps_cabac_ctxt->s_lft_csbp;
+    ps_cabac_ctxt->ps_bitstrm = ps_ent_ctxt->ps_bitstrm;
+
+    {
+        /* 0th entry of mb_map_ctxt_inc will always contain the default values */
+        /* for the CABAC context representing an MB that is not available      */
+        mb_info_ctxt_t *ps_def_ctxt = ps_cabac_ctxt->ps_mb_map_ctxt_inc - 1;
+
+        ps_def_ctxt->u1_mb_type = CAB_SKIP;
+        ps_def_ctxt->u1_cbp = 0x0f;
+        ps_def_ctxt->u1_intrapred_chroma_mode = 0;
+        /* Clear the byte arrays with memset: they are not guaranteed to be */
+        /* 4-byte aligned, so UWORD32 stores through a cast are unsafe      */
+        memset(ps_def_ctxt->i1_ref_idx, 0, sizeof(ps_def_ctxt->i1_ref_idx));
+        memset(ps_def_ctxt->u1_mv, 0, sizeof(ps_def_ctxt->u1_mv));
+        ps_cabac_ctxt->ps_def_ctxt_mb_info = ps_def_ctxt;
+    }
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Initialize cabac context: Initialize all contexts with init values given in the spec.
+ * Called at the beginning of entropy coding of each slice for CABAC encoding.
+ *
+ * @param[in] ps_ent_ctxt
+ *  Pointer to entropy context structure
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+void ih264e_init_cabac_ctxt(entropy_ctxt_t *ps_ent_ctxt)
+{
+    /* CABAC context */
+    cabac_ctxt_t *ps_cabac_ctxt = ps_ent_ctxt->ps_cabac;
+
+    /* Slice header supplying slice type, QP and cabac_init_idc */
+    slice_header_t *ps_slice_hdr = ps_ent_ctxt->ps_slice_hdr_base;
+
+    /* Slice type, held as an 8-bit value for the ISLICE comparison */
+    const UWORD8 u1_type = ps_slice_hdr->u1_slice_type;
+
+    /* Slice QP: second index of the init table */
+    UWORD8 u1_slice_qp = ps_slice_hdr->i1_slice_qp;
+
+    /* First index of the init table, selected below */
+    WORD8 i1_init_row = 0;
+
+    /* Reset the encoding environment and the previous mb_qp_delta context */
+    ih264e_init_cabac_enc_envirnoment(&ps_cabac_ctxt->s_cab_enc_env);
+    ps_cabac_ctxt->i1_prevps_mb_qp_delta_ctxt = 0;
+
+    /* I slices always use row 3; every other slice type uses the */
+    /* cabac_init_idc value from the slice header                 */
+    i1_init_row = (ISLICE == u1_type) ? 3 : ps_slice_hdr->i1_cabac_init_idc;
+
+    /* Copy the initial state of every context model for this (row, QP) */
+    memcpy(ps_cabac_ctxt->au1_cabac_ctxt_table,
+           gau1_ih264_cabac_ctxt_init_table[i1_init_row][u1_slice_qp],
+           NUM_CABAC_CTXTS * sizeof(bin_ctxt_model));
+
+}
diff --git a/encoder/ih264e_cabac_structs.h b/encoder/ih264e_cabac_structs.h
new file mode 100644
index 0000000..82938ca
--- /dev/null
+++ b/encoder/ih264e_cabac_structs.h
@@ -0,0 +1,221 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+ *******************************************************************************
+ * @file
+ *  ih264e_cabac_structs.h
+ *
+ * @brief
+ *  This file contains cabac related structure definitions.
+ *
+ * @author
+ *  Doney Alex
+ *
+ * @remarks
+ *  none
+ *
+ *******************************************************************************
+ */
+
+#ifndef IH264E_CABAC_STRUCTS_H_
+#define IH264E_CABAC_STRUCTS_H_
+
+
+
+#define CABAC_INIT_IDC 2
+
+
+/**
+ ******************************************************************************
+ *  @brief     typedef for  context model
+ ******************************************************************************
+ */
+
+/* One context model packed in a byte: bits 0 to 5 : state,
+   bit 6 : mps */
+typedef UWORD8 bin_ctxt_model;
+
+/**
+ ******************************************************************************
+ *  @brief      MB info for cabac
+ ******************************************************************************
+ */
+typedef struct
+{
+    /* Neighbour availability variables needed to get CtxtInc, for CABAC */
+    UWORD8 u1_mb_type; /* !< macroblock type: I/P/B/SI/SP */
+
+    UWORD8 u1_cbp; /* !< Coded Block Pattern */
+    UWORD8 u1_intrapred_chroma_mode;
+
+    /*************************************************************************/
+    /*               Arrangement of AC CSBP                                  */
+    /*        bits:  b7 b6 b5 b4 b3 b2 b1 b0                                 */
+    /*        CSBP:  V1 V0 U1 U0 Y3 Y2 Y1 Y0                                 */
+    /*************************************************************************/
+    UWORD8 u1_yuv_ac_csbp;
+    /*************************************************************************/
+    /*               Arrangement of DC CSBP                                  */
+    /*        bits:  b7  b6  b5  b4  b3  b2  b1  b0                          */
+    /*        CSBP:   x   x   x   x   x  Vdc Udc Ydc                         */
+    /*************************************************************************/
+    UWORD8 u1_yuv_dc_csbp;
+    /* Arrays below follow five UWORD8 fields: do not assume 4-byte alignment */
+    WORD8 i1_ref_idx[4];
+    UWORD8 u1_mv[4][4];
+} mb_info_ctxt_t;
+
+
+/**
+ ******************************************************************************
+ *  @brief      CSBP info for CABAC
+ ******************************************************************************
+ */
+typedef struct
+{
+    /*************************************************************************/
+    /*               Arrangement of Luma AC CSBP for leftMb                  */
+    /*        bits:  b7 b6 b5 b4 b3 b2 b1 b0                                 */
+    /*        CSBP:   X  X  X  X Y3 Y2 Y1 Y0                                 */
+    /*************************************************************************/
+    /*************************************************************************/
+    /*  Points either to u1_y_ac_csbp_top_mb or u1_y_ac_csbp_bot_mb          */
+    /*************************************************************************/
+    UWORD8 u1_y_ac_csbp_top_mb;
+    UWORD8 u1_y_ac_csbp_bot_mb;
+
+    /*************************************************************************/
+    /*               Arrangement of Chroma AC CSBP for leftMb                */
+    /*        bits:  b7 b6 b5 b4 b3 b2 b1 b0                                 */
+    /*        CSBP:   X  X  X  X V1 V0 U1 U0                                 */
+    /*************************************************************************/
+    /*************************************************************************/
+    /*  Points either to u1_uv_ac_csbp_top_mb or u1_uv_ac_csbp_bot_mb        */
+    /*************************************************************************/
+    UWORD8 u1_uv_ac_csbp_top_mb;
+    UWORD8 u1_uv_ac_csbp_bot_mb;
+
+    /*************************************************************************/
+    /*               Arrangement of DC CSBP                                  */
+    /*        bits:  b7  b6  b5  b4  b3  b2  b1  b0                          */
+    /*        CSBP:   x   x   x   x   x  Vdc Udc Ydc                         */
+    /*************************************************************************/
+    /*************************************************************************/
+    /*  Points either to u1_yuv_dc_csbp_top_mb or u1_yuv_dc_csbp_bot_mb      */
+    /*************************************************************************/
+    UWORD8 u1_yuv_dc_csbp_top_mb;
+    UWORD8 u1_yuv_dc_csbp_bot_mb;
+} cab_csbp_t;
+
+/**
+ ******************************************************************************
+ *  @brief      CABAC Encoding Environment
+ ******************************************************************************
+ */
+
+typedef struct
+{
+    /** cabac interval start L  */
+    UWORD32 u4_code_int_low;
+
+    /** cabac interval range R  */
+    UWORD32 u4_code_int_range;
+
+    /** bytes outstanding; number of 0xFF bytes that occur during renorm.
+    *  These will be accumulated till the carry bit is known
+    */
+    UWORD32  u4_out_standing_bytes;
+
+    /** bits generated during renormalization
+    *   A byte is put to stream/u4_out_standing_bytes from u4_code_int_low (L)
+    *   when u4_bits_gen exceeds 8
+    */
+    UWORD32  u4_bits_gen;
+} encoding_envirnoment_t;
+
+
+/**
+ ******************************************************************************
+ *  @brief      CABAC Context structure : Variables to handle Cabac
+ ******************************************************************************
+ */
+typedef struct
+{
+
+    /*  Table holding all the cabac context models (embedded array)  */
+    bin_ctxt_model au1_cabac_ctxt_table[NUM_CABAC_CTXTS];
+
+    /* Left CSBP storage; ps_lft_csbp below is set to point at it */
+    cab_csbp_t s_lft_csbp;
+
+    /**
+     * pointer to Bitstream structure
+     */
+    bitstrm_t *ps_bitstrm;
+
+    /* Pointer to mb_info_ctxt_t map_base */
+    mb_info_ctxt_t *ps_mb_map_ctxt_inc_base;
+
+    /* CABAC encoding environment (embedded structure, not a pointer) */
+    encoding_envirnoment_t s_cab_enc_env;
+
+    /* These things need to be updated at each MbLevel */
+
+    /* Prev ps_mb_qp_delta_ctxt */
+    WORD8 i1_prevps_mb_qp_delta_ctxt;
+
+    /* Pointer to mb_info_ctxt_t map */
+    mb_info_ctxt_t *ps_mb_map_ctxt_inc;
+
+    /* Pointer to default mb_info_ctxt_t */
+    mb_info_ctxt_t *ps_def_ctxt_mb_info;
+
+    /* Pointer to current mb_info_ctxt_t */
+    mb_info_ctxt_t *ps_curr_ctxt_mb_info;
+
+    /* Pointer to left mb_info_ctxt_t */
+    mb_info_ctxt_t *ps_left_ctxt_mb_info;
+
+    /* Pointer to top mb_info_ctxt_t  */
+    mb_info_ctxt_t *ps_top_ctxt_mb_info;
+
+    /* Pointer to left csbp structure */
+    cab_csbp_t *ps_lft_csbp;
+    UWORD8 *pu1_left_y_ac_csbp;
+    UWORD8 *pu1_left_uv_ac_csbp;
+    UWORD8 *pu1_left_yuv_dc_csbp;
+
+    /***************************************************************************/
+    /*       Ref_idx contexts  are stored in the following way                 */
+    /*  Array Idx 0,1 for reference indices in Forward direction               */
+    /*  Array Idx 2,3 for reference indices in backward direction              */
+    /***************************************************************************/
+    /* Dimensions for i1_left_ref_idx_ctx_inc_arr is [2][4] for Mbaff:Top and Bot */
+    WORD8 i1_left_ref_idx_ctx_inc_arr[2][4];
+    WORD8 *pi1_left_ref_idx_ctxt_inc;
+
+    /* Dimensions for u1_left_mv_ctxt_inc_arr is [2][4][4] for Mbaff case */
+    UWORD8 u1_left_mv_ctxt_inc_arr[2][4][4];
+    UWORD8 (*pu1_left_mv_ctxt_inc)[4];
+
+} cabac_ctxt_t;
+
+#endif /* IH264E_CABAC_STRUCTS_H_ */
diff --git a/encoder/ih264e_cavlc.c b/encoder/ih264e_cavlc.c
index 1f98b6a..5d819d9 100644
--- a/encoder/ih264e_cavlc.c
+++ b/encoder/ih264e_cavlc.c
@@ -35,8 +35,8 @@
 *  - ih264e_write_coeff4x4_cavlc()
 *  - ih264e_write_coeff8x8_cavlc()
 *  - ih264e_encode_residue()
-*  - ih264e_write_islice_mb()
-*  - ih264e_write_pslice_mb()
+*  - ih264e_write_islice_mb_cavlc()
+*  - ih264e_write_pslice_mb_cavlc()
 *
 * @remarks
 *  None
@@ -65,8 +65,8 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -75,9 +75,11 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_encode_header.h"
 #include "ih264_cavlc_tables.h"
@@ -712,8 +714,8 @@ static IH264E_ERROR_T ih264e_encode_residue(entropy_ctxt_t *ps_ent_ctxt,
     /* temp var */
     UWORD32 u4_nC, u4_ngbr_avlb;
     UWORD8 au1_nnz[4], *pu1_ngbr_avlb, *pu1_top_nnz, *pu1_left_nnz;
-    UWORD16 au2_sig_coeff_map[4];
-    WORD16 *pi2_res_block[4];
+    UWORD16 au2_sig_coeff_map[4] = {0};
+    WORD16 *pi2_res_block[4] = {NULL};
     UWORD8 *pu1_slice_idx = ps_ent_ctxt->pu1_slice_idx;
     tu_sblk_coeff_data_t *ps_mb_coeff_data;
     ENTROPY_BLK_TYPE e_entropy_blk_type = CAVLC_LUMA_4x4;
@@ -925,7 +927,6 @@ static IH264E_ERROR_T ih264e_encode_residue(entropy_ctxt_t *ps_ent_ctxt,
     return error_status;
 }
 
-#define GET_NUM_BITS(ps_bitstream) ((ps_bitstream->u4_strm_buf_offset << 3) + 32 - ps_bitstream->i4_bits_left_in_cw)
 
 /**
 *******************************************************************************
@@ -948,7 +949,7 @@ static IH264E_ERROR_T ih264e_encode_residue(entropy_ctxt_t *ps_ent_ctxt,
 *
 *******************************************************************************
 */
-IH264E_ERROR_T ih264e_write_islice_mb(entropy_ctxt_t *ps_ent_ctxt)
+IH264E_ERROR_T ih264e_write_islice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt)
 {
     /* error status */
     IH264E_ERROR_T error_status = IH264E_SUCCESS;
@@ -1170,7 +1171,7 @@ IH264E_ERROR_T ih264e_write_islice_mb(entropy_ctxt_t *ps_ent_ctxt)
 *
 *******************************************************************************
 */
-IH264E_ERROR_T ih264e_write_pslice_mb(entropy_ctxt_t *ps_ent_ctxt)
+IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt)
 {
     /* error status */
     IH264E_ERROR_T error_status = IH264E_SUCCESS;
@@ -1406,7 +1407,6 @@ IH264E_ERROR_T ih264e_write_pslice_mb(entropy_ctxt_t *ps_ent_ctxt)
         for (i = 0; i < (WORD32)u4_part_cnt; i++)
         {
             PUT_BITS_SEV(ps_bitstream, *pi2_mv_ptr++, error_status, "mv x");
-
             PUT_BITS_SEV(ps_bitstream, *pi2_mv_ptr++, error_status, "mv y");
         }
 
@@ -1425,6 +1425,323 @@ IH264E_ERROR_T ih264e_write_pslice_mb(entropy_ctxt_t *ps_ent_ctxt)
         PUT_BITS_SEV(ps_bitstream, mb_qp_delta, error_status, "mb_qp_delta");
     }
 
+    /* Ending bitstream offset for header in bits */
+    bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+
+    ps_ent_ctxt->u4_header_bits[is_inter] += bitstream_end_offset - bitstream_start_offset;
+
+    /* start bitstream offset for residue in bits */
+    bitstream_start_offset = bitstream_end_offset;
+
+    /* residual */
+    error_status = ih264e_encode_residue(ps_ent_ctxt, mb_type, cbp);
+
+    /* Ending bitstream offset for residue in bits */
+    bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+
+    ps_ent_ctxt->u4_residue_bits[is_inter] += bitstream_end_offset - bitstream_start_offset;
+
+    /* store the index of the next mb syntax layer */
+    ps_ent_ctxt->pv_mb_header_data = pu1_byte;
+
+    return error_status;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  This function generates CAVLC coded bit stream for B slices
+*
+* @description
+*  The mb syntax layer for inter slices constitutes luma mb mode, luma sub modes
+*  (if present), mb qp delta, coded block pattern, chroma mb mode and
+*  luma/chroma residue. These syntax elements are written as directed by table
+*  7.3.5 of h264 specification
+*
+* @param[in] ps_ent_ctxt
+*  pointer to entropy context
+*
+* @returns error code
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt)
+{
+    /* error status */
+    IH264E_ERROR_T error_status = IH264E_SUCCESS;
+
+    /* bit stream ptr */
+    bitstrm_t *ps_bitstream = ps_ent_ctxt->ps_bitstrm;
+
+    /* packed header data */
+    UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data;
+
+    /* mb header info */
+    /*
+     * mb_tpm : mb type plus mode
+     * mb_type : luma mb type and chroma mb type are packed
+     * cbp : coded block pattern
+     * mb_qp_delta : mb qp delta
+     * chroma_intra_mode : chroma intra mode
+     * luma_intra_mode : luma intra mode
+     * ps_pu :  Pointer to the array of structures having motion vectors, size
+     * and position of sub partitions
+     */
+    WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode;
+    WORD8 mb_qp_delta;
+
+    /* temp var */
+    WORD32 i, mb_type_stream, cbptable = 1;
+
+    WORD32 is_inter = 0;
+
+    WORD32 bitstream_start_offset, bitstream_end_offset;
+
+    /* Starting bitstream offset for header in bits */
+    bitstream_start_offset = GET_NUM_BITS(ps_bitstream);
+
+    /********************************************************************/
+    /*                    BEGIN HEADER GENERATION                       */
+    /********************************************************************/
+
+    mb_tpm = *pu1_byte++;
+
+    /* mb type */
+    mb_type = mb_tpm & 0xF;
+
+    /* check for skip */
+    if (mb_type == BSKIP)
+    {
+        UWORD32 *nnz;
+
+        is_inter = 1;
+
+        /* increment skip counter */
+        (*ps_ent_ctxt->pi4_mb_skip_run)++;
+
+        /* store the index of the next mb syntax layer */
+        ps_ent_ctxt->pv_mb_header_data = pu1_byte;
+
+        /* set nnz to zero */
+        ps_ent_ctxt->u4_left_nnz_luma = 0;
+        nnz = (UWORD32 *)ps_ent_ctxt->pu1_top_nnz_luma[ps_ent_ctxt->i4_mb_x];
+        *nnz = 0;
+        ps_ent_ctxt->u4_left_nnz_cbcr = 0;
+        nnz = (UWORD32 *)ps_ent_ctxt->pu1_top_nnz_cbcr[ps_ent_ctxt->i4_mb_x];
+        *nnz = 0;
+
+        /* residual */
+        error_status = ih264e_encode_residue(ps_ent_ctxt, B16x16, 0);
+
+        bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
+
+        ps_ent_ctxt->u4_header_bits[is_inter] += bitstream_end_offset
+                        - bitstream_start_offset;
+
+        return error_status;
+    }
+
+
+    /* remaining mb header info */
+    cbp = *pu1_byte++;
+    mb_qp_delta = *pu1_byte++;
+
+    /* mb skip run */
+    PUT_BITS_UEV(ps_bitstream, *ps_ent_ctxt->pi4_mb_skip_run, error_status, "mb skip run");
+
+    /* reset skip counter */
+    *ps_ent_ctxt->pi4_mb_skip_run = 0;
+
+    /* is intra ? */
+    if (mb_type == I16x16)
+    {
+        UWORD32 u4_cbp_l, u4_cbp_c;
+
+        is_inter = 0;
+
+        u4_cbp_c = (cbp >> 4);
+        u4_cbp_l = (cbp & 0xF);
+        luma_intra_mode = (mb_tpm >> 4) & 3;
+        chroma_intra_mode = (mb_tpm >> 6);
+
+        mb_type_stream =  luma_intra_mode + 1 + (u4_cbp_c << 2) + (u4_cbp_l == 15) * 12;
+
+        mb_type_stream += 23;
+
+        /* write mb type */
+        PUT_BITS_UEV(ps_bitstream, mb_type_stream, error_status, "mb type");
+
+        /* intra_chroma_pred_mode */
+        PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+    }
+    else if (mb_type == I4x4)
+    {
+        /* mb sub blk modes */
+        WORD32 intra_pred_mode_flag, rem_intra_mode;
+        WORD32 byte;
+
+        is_inter = 0;
+
+        chroma_intra_mode = (mb_tpm >> 6);
+        cbptable = 0;
+
+        /* write mb type */
+        PUT_BITS_UEV(ps_bitstream, 23, error_status, "mb type");
+
+        for (i = 0; i < 16; i += 2)
+        {
+            /* sub blk idx 1 */
+            byte = *pu1_byte++;
+
+            intra_pred_mode_flag = byte & 0x1;
+
+            /* prev_intra4x4_pred_mode_flag */
+            PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag");
+
+            /* rem_intra4x4_pred_mode */
+            if (!intra_pred_mode_flag)
+            {
+                rem_intra_mode = (byte & 0xF) >> 1;
+                PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode");
+            }
+
+            /* sub blk idx 2 */
+            byte >>= 4;
+
+            intra_pred_mode_flag = byte & 0x1;
+
+            /* prev_intra4x4_pred_mode_flag */
+            PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag");
+
+            /* rem_intra4x4_pred_mode */
+            if (!intra_pred_mode_flag)
+            {
+                rem_intra_mode = (byte & 0xF) >> 1;
+                PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode");
+            }
+        }
+
+        /* intra_chroma_pred_mode */
+        PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+    }
+    else if (mb_type == I8x8)
+    {
+        /* transform 8x8 flag */
+        UWORD32 u4_transform_size_8x8_flag = ps_ent_ctxt->i1_transform_8x8_mode_flag;
+
+        /* mb sub blk modes */
+        WORD32 intra_pred_mode_flag, rem_intra_mode;
+        WORD32 byte;
+
+        is_inter = 0;
+
+        chroma_intra_mode = (mb_tpm >> 6);
+        cbptable = 0;
+
+        ASSERT(0);
+
+        /* write mb type */
+        PUT_BITS_UEV(ps_bitstream, 23, error_status, "mb type");
+
+        /* u4_transform_size_8x8_flag */
+        PUT_BITS(ps_bitstream, u4_transform_size_8x8_flag, 1, error_status, "u4_transform_size_8x8_flag");
+
+        /* write sub block modes */
+        for (i = 0; i < 4; i++)
+        {
+            /* sub blk idx 1 */
+            byte = *pu1_byte++;
+
+            intra_pred_mode_flag = byte & 0x1;
+
+            /* prev_intra4x4_pred_mode_flag */
+            PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag");
+
+            /* rem_intra4x4_pred_mode */
+            if (!intra_pred_mode_flag)
+            {
+                rem_intra_mode = (byte & 0xF) >> 1;
+                PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode");
+            }
+
+            /* sub blk idx 2 */
+            byte >>= 4;
+
+            intra_pred_mode_flag = byte & 0x1;
+
+            /* prev_intra4x4_pred_mode_flag */
+            PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag");
+
+            /* rem_intra4x4_pred_mode */
+            if (!intra_pred_mode_flag)
+            {
+                rem_intra_mode = (byte & 0xF) >> 1;
+                PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode");
+            }
+        }
+
+        /* intra_chroma_pred_mode */
+        PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+    }
+    else if(mb_type == BDIRECT)
+    {
+        is_inter = 1;
+        /* write mb type */
+        PUT_BITS_UEV(ps_bitstream, B_DIRECT_16x16, error_status, "mb type");
+    }
+    else /* if mb_type == B16x16 */
+    {
+        /* inter macro block partition cnt for 16x16 16x8 8x16 8x8 */
+        const UWORD8 au1_part_cnt[] = { 1, 2, 2, 4 };
+
+        /* mv ptr */
+        WORD16 *pi2_mvd_ptr = (WORD16 *)pu1_byte;
+
+        /* number of partitions for the current mb */
+        UWORD32 u4_part_cnt = au1_part_cnt[mb_type - B16x16];
+
+        /* Get the pred modes */
+        WORD32 i4_mb_part_pred_mode = (mb_tpm >> 4);
+
+        is_inter = 1;
+
+        mb_type_stream = mb_type - B16x16 + B_L0_16x16 + i4_mb_part_pred_mode;
+
+        /* write mb type */
+        PUT_BITS_UEV(ps_bitstream, mb_type_stream, error_status, "mb type");
+
+        for (i = 0; i < (WORD32)u4_part_cnt; i++)
+        {
+            if (i4_mb_part_pred_mode != PRED_L1)/* || PRED_BI */
+            {
+                PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr++, error_status, "mv l0 x");
+                PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr++, error_status, "mv l0 y");
+            }
+            if (i4_mb_part_pred_mode != PRED_L0)/* || PRED_BI */
+            {
+                PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr++, error_status, "mv l1 x");
+                PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr++, error_status, "mv l1 y");
+            }
+        }
+
+        pu1_byte = (UWORD8 *)pi2_mvd_ptr;
+    }
+
+    /* coded_block_pattern */
+    if (mb_type != I16x16)
+    {
+        PUT_BITS_UEV(ps_bitstream, gu1_cbp_map_tables[cbp][cbptable], error_status, "coded_block_pattern");
+    }
+
+    if (cbp || mb_type == I16x16)
+    {
+        /* mb_qp_delta */
+        PUT_BITS_SEV(ps_bitstream, mb_qp_delta, error_status, "mb_qp_delta");
+    }
 
     /* Ending bitstream offset for header in bits */
     bitstream_end_offset = GET_NUM_BITS(ps_bitstream);
diff --git a/encoder/ih264e_cavlc.h b/encoder/ih264e_cavlc.h
index acd0def..8da2cea 100644
--- a/encoder/ih264e_cavlc.h
+++ b/encoder/ih264e_cavlc.h
@@ -42,23 +42,6 @@
 /* Function macro definitions                                                */
 /*****************************************************************************/
 
-#define PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, u4_nnz, u4_sig_coeff_map, pi2_res_block)   \
-{\
-    ps_mb_coeff_data = pv_mb_coeff_data; \
-    u4_nnz = ps_mb_coeff_data->i4_sig_map_nnz & 0xff;    \
-    if (u4_nnz)\
-    {\
-        u4_sig_coeff_map = ps_mb_coeff_data->i4_sig_map_nnz >> 16; \
-        pi2_res_block = ps_mb_coeff_data->ai2_residue; \
-        pv_mb_coeff_data = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz); \
-    }\
-    else\
-    {\
-      pv_mb_coeff_data = ps_mb_coeff_data->ai2_residue;\
-    }\
-}
-
-
 /*****************************************************************************/
 /* Extern Function Declarations                                              */
 /*****************************************************************************/
@@ -84,7 +67,7 @@
 *
 *******************************************************************************
 */
-IH264E_ERROR_T ih264e_write_islice_mb(entropy_ctxt_t *ps_ent_ctxt);
+IH264E_ERROR_T ih264e_write_islice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt);
 
 /**
 *******************************************************************************
@@ -107,6 +90,29 @@ IH264E_ERROR_T ih264e_write_islice_mb(entropy_ctxt_t *ps_ent_ctxt);
 *
 *******************************************************************************
 */
-IH264E_ERROR_T ih264e_write_pslice_mb(entropy_ctxt_t *ps_ent_ctxt);
+IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt);
+
+/**
+*******************************************************************************
+*
+* @brief
+*  This function generates the CAVLC coded bit stream for inter (B) slices
+*
+* @description
+*  The mb syntax layer for inter slices comprises luma mb mode, luma sub modes
+*  (if present), mb qp delta, coded block pattern, chroma mb mode and
+*  luma/chroma residue. These syntax elements are written as directed by the
+*  macroblock layer syntax (section 7.3.5) of the H.264 specification
+*
+* @param[in] ps_ent_ctxt
+*  pointer to entropy context
+*
+* @returns error code (IH264E_ERROR_T)
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt);
 
 #endif /* IH264E_CAVLC_H_ */
diff --git a/encoder/ih264e_core_coding.c b/encoder/ih264e_core_coding.c
index 89243a5..76266d7 100644
--- a/encoder/ih264e_core_coding.c
+++ b/encoder/ih264e_core_coding.c
@@ -65,6 +65,7 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -73,9 +74,11 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_globals.h"
 #include "ih264e_core_coding.h"
@@ -1249,7 +1252,7 @@ void ih264e_pack_c_mb(WORD16 *pi2_res_mb,
      * For that there are two paths we need to look for
      * One is the path to bitstream , these variables should have the proper input
      * configured UV or VU
-     * For the other path the inverse transform variables should have ehat ever 0ordering the
+     * For the other path the inverse transform variables should have what ever ordering the
      * input had
      */
 
@@ -2019,7 +2022,7 @@ UWORD8 ih264e_code_chroma_intra_macroblock_8x8(process_ctxt_t *ps_proc)
     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
 
     /* strides */
-    WORD32 i4_src_strd = ps_proc->i4_src_strd;
+    WORD32 i4_src_strd = ps_proc->i4_src_chroma_strd;
     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
     WORD32 i4_res_strd = ps_proc->i4_res_strd;
@@ -2281,7 +2284,7 @@ UWORD8 ih264e_code_chroma_inter_macroblock_8x8(process_ctxt_t *ps_proc)
     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
 
     /* strides */
-    WORD32 i4_src_strd = ps_proc->i4_src_strd;
+    WORD32 i4_src_strd = ps_proc->i4_src_chroma_strd;
     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
     WORD32 i4_res_strd = ps_proc->i4_res_strd;
diff --git a/encoder/ih264e_deblk.c b/encoder/ih264e_deblk.c
index 8a11bdb..db176ac 100644
--- a/encoder/ih264e_deblk.c
+++ b/encoder/ih264e_deblk.c
@@ -63,6 +63,7 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -71,12 +72,13 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264_trans_data.h"
-#include "ih264_deblk_edge_filters.h"
 #include "ih264_deblk_tables.h"
 #include "ih264e_deblk.h"
 
@@ -150,20 +152,15 @@ static const UWORD16  ih264e_gu2_4x4_v2h_reorder[16] =
 * @param[in] u4_left_mb_csbp
 *  coded sub block pattern of top mb
 *
-* @param[in] ps_leftMvPred
-*  MV of left mb
-*
-* @param[in] ps_topMvPred
-*  MV of top mb
+* @param[in] ps_left_pu
+*  PU for left MB
 *
-* @param[in] ps_curMvPred
-*  MV of curr mb
+* @param[in] ps_top_pu
+*  PU for top MB
 *
-* @param[in] u1_left_intra
-*  is left intra
+* @param[in] ps_curr_pu
+*  PU for current MB
 *
-* @param[in] u1_top_intra
-*  is top intra
 *
 * @returns  none
 *
@@ -176,64 +173,65 @@ static void ih264e_fill_bs_1mv_1ref_non_mbaff(UWORD32 *pu4_horz_bs,
                                               UWORD32 u4_left_mb_csbp,
                                               UWORD32 u4_top_mb_csbp,
                                               UWORD32 u4_cur_mb_csbp,
-                                              mv_t *ps_leftMvPred,
-                                              mv_t *ps_topMvPred,
-                                              mv_t *ps_curMvPred,
-                                              UWORD8 u1_left_intra,
-                                              UWORD8 u1_top_intra)
+                                              enc_pu_t *ps_left_pu,
+                                              enc_pu_t *ps_top_pu,
+                                              enc_pu_t *ps_curr_pu)
 {
     /* motion vectors of blks p & q */
-    WORD16   i16_qMv0, i16_qMv1, i16_pMv0, i16_pMv1;
+    WORD16 i16_qMvl0_x, i16_qMvl0_y, i16_pMvl0_x, i16_pMvl0_y;
+    WORD16 i16_qMvl1_x, i16_qMvl1_y, i16_pMvl1_x, i16_pMvl1_y;
 
     /* temp var */
-    UWORD32  u4_lft_flag, u4_top_flag;
-    const UWORD32  *bs_map;
-    UWORD32  u4_reordered_vert_bs_enc, u4_temp;
+    UWORD32 u4_left_flag, u4_top_flag;
+    const UWORD32 *bs_map;
+    UWORD32 u4_reordered_vert_bs_enc, u4_temp;
 
     /* Coded Pattern for Horizontal Edge */
     /*-----------------------------------------------------------------------*/
     /*u4_nbr_horz_csbp=11C|10C|9C|8C|7C|6C|5C|4C|3C|2C|1C|0C|15T|14T|13T|12T */
     /*-----------------------------------------------------------------------*/
-    UWORD32 u4_nbr_horz_csbp        = (u4_cur_mb_csbp << 4) | (u4_top_mb_csbp >> 12);
-    UWORD32 u4_horz_bs_enc          = u4_cur_mb_csbp | u4_nbr_horz_csbp;
+    UWORD32 u4_nbr_horz_csbp = (u4_cur_mb_csbp << 4) | (u4_top_mb_csbp >> 12);
+    UWORD32 u4_horz_bs_enc = u4_cur_mb_csbp | u4_nbr_horz_csbp;
 
     /* Coded Pattern for Vertical Edge */
     /*-----------------------------------------------------------------------*/
     /*u4_left_mb_masked_csbp = 15L|0|0|0|11L|0|0|0|7L|0|0|0|3L|0|0|0         */
     /*-----------------------------------------------------------------------*/
-    UWORD32 u4_left_mb_masked_csbp = u4_left_mb_csbp  & CSBP_RIGHT_BLOCK_MASK;
+    UWORD32 u4_left_mb_masked_csbp = u4_left_mb_csbp & CSBP_RIGHT_BLOCK_MASK;
 
     /*-----------------------------------------------------------------------*/
     /*u4_cur_mb_masked_csbp =14C|13C|12C|x|10C|9C|8C|x|6C|5C|4C|x|2C|1C|0C|x */
     /*-----------------------------------------------------------------------*/
-    UWORD32 u4_cur_mb_masked_csbp =(u4_cur_mb_csbp<<1)&(~CSBP_LEFT_BLOCK_MASK);
+    UWORD32 u4_cur_mb_masked_csbp = (u4_cur_mb_csbp << 1)
+                    & (~CSBP_LEFT_BLOCK_MASK);
 
     /*-----------------------------------------------------------------------*/
     /*u4_nbr_vert_csbp=14C|13C|12C|15L|10C|9C|8C|11L|6C|5C|4C|7L|2C|1C|0C|3L */
     /*-----------------------------------------------------------------------*/
-    UWORD32 u4_nbr_vert_csbp    = (u4_cur_mb_masked_csbp) | (u4_left_mb_masked_csbp >> 3);
-    UWORD32 u4_vert_bs_enc      = u4_cur_mb_csbp | u4_nbr_vert_csbp;
+    UWORD32 u4_nbr_vert_csbp = (u4_cur_mb_masked_csbp)
+                    | (u4_left_mb_masked_csbp >> 3);
+    UWORD32 u4_vert_bs_enc = u4_cur_mb_csbp | u4_nbr_vert_csbp;
 
     /* BS Calculation for MB Boundary Edges */
 
     /* BS calculation for 1 2 3 horizontal boundary */
-    bs_map  = gu4_bs_table[0];
+    bs_map = gu4_bs_table[0];
     pu4_horz_bs[1] = bs_map[(u4_horz_bs_enc >> 4) & 0xF];
     pu4_horz_bs[2] = bs_map[(u4_horz_bs_enc >> 8) & 0xF];
     pu4_horz_bs[3] = bs_map[(u4_horz_bs_enc >> 12) & 0xF];
 
     /* BS calculation for 5 6 7 vertical boundary */
     /* Do 4x4 tranpose of u4_vert_bs_enc by using look up table for reorder */
-    u4_reordered_vert_bs_enc    = ih264e_gu2_4x4_v2h_reorder[u4_vert_bs_enc & 0xF];
+    u4_reordered_vert_bs_enc = ih264e_gu2_4x4_v2h_reorder[u4_vert_bs_enc & 0xF];
 
-    u4_temp                     = ih264e_gu2_4x4_v2h_reorder[(u4_vert_bs_enc >> 4) & 0xF];
-    u4_reordered_vert_bs_enc   |= (u4_temp << 1);
+    u4_temp = ih264e_gu2_4x4_v2h_reorder[(u4_vert_bs_enc >> 4) & 0xF];
+    u4_reordered_vert_bs_enc |= (u4_temp << 1);
 
-    u4_temp                     = ih264e_gu2_4x4_v2h_reorder[(u4_vert_bs_enc >> 8) & 0xF];
-    u4_reordered_vert_bs_enc   |= (u4_temp << 2);
+    u4_temp = ih264e_gu2_4x4_v2h_reorder[(u4_vert_bs_enc >> 8) & 0xF];
+    u4_reordered_vert_bs_enc |= (u4_temp << 2);
 
-    u4_temp                     = ih264e_gu2_4x4_v2h_reorder[(u4_vert_bs_enc >> 12) & 0xF];
-    u4_reordered_vert_bs_enc   |= (u4_temp << 3);
+    u4_temp = ih264e_gu2_4x4_v2h_reorder[(u4_vert_bs_enc >> 12) & 0xF];
+    u4_reordered_vert_bs_enc |= (u4_temp << 3);
 
     pu4_vert_bs[1] = bs_map[(u4_reordered_vert_bs_enc >> 4) & 0xF];
     pu4_vert_bs[2] = bs_map[(u4_reordered_vert_bs_enc >> 8) & 0xF];
@@ -241,39 +239,96 @@ static void ih264e_fill_bs_1mv_1ref_non_mbaff(UWORD32 *pu4_horz_bs,
 
 
     /* BS Calculation for MB Boundary Edges */
-    i16_qMv0  = ps_curMvPred->i2_mvx;
-    i16_qMv1  = ps_curMvPred->i2_mvy;
-
-    if (u1_top_intra)
+    if (ps_top_pu->b1_intra_flag)
     {
         pu4_horz_bs[0] = 0x04040404;
     }
     else
     {
-        i16_pMv0  = ps_topMvPred->i2_mvx;
-        i16_pMv1  = ps_topMvPred->i2_mvy;
+        if (ps_curr_pu->b2_pred_mode != ps_top_pu->b2_pred_mode)
+        {
+            u4_top_flag = 1; /* pred modes differ: no mv comparison needed */
+        }
+        else if(ps_curr_pu->b2_pred_mode != 2)/* Not bipred */
+        {   /* single list: b2_pred_mode doubles as the s_me_info index */
+            i16_pMvl0_x = ps_top_pu->s_me_info[ps_top_pu->b2_pred_mode].s_mv.i2_mvx;
+            i16_pMvl0_y = ps_top_pu->s_me_info[ps_top_pu->b2_pred_mode].s_mv.i2_mvy;
+            /* the Mvl0 temporaries hold the mv of the list b2_pred_mode selects */
+            i16_qMvl0_x = ps_curr_pu->s_me_info[ps_curr_pu->b2_pred_mode].s_mv.i2_mvx;
+            i16_qMvl0_y = ps_curr_pu->s_me_info[ps_curr_pu->b2_pred_mode].s_mv.i2_mvy;
 
-        u4_top_flag = (ABS((i16_pMv0 - i16_qMv0)) >= 4 ) |
-                        (ABS((i16_pMv1 - i16_qMv1)) >= 4);
 
-        bs_map  = gu4_bs_table[!!u4_top_flag];
+            u4_top_flag =  (ABS((i16_pMvl0_x - i16_qMvl0_x)) >= 4)
+                         | (ABS((i16_pMvl0_y - i16_qMvl0_y)) >= 4);
+        }
+        else
+        {
+            /* bipred: compare the L0 and the L1 mv of p (top) against q (curr) */
+            i16_pMvl0_x = ps_top_pu->s_me_info[PRED_L0].s_mv.i2_mvx;
+            i16_pMvl0_y = ps_top_pu->s_me_info[PRED_L0].s_mv.i2_mvy;
+            i16_pMvl1_x = ps_top_pu->s_me_info[PRED_L1].s_mv.i2_mvx;
+            i16_pMvl1_y = ps_top_pu->s_me_info[PRED_L1].s_mv.i2_mvy;
+
+            i16_qMvl0_x = ps_curr_pu->s_me_info[PRED_L0].s_mv.i2_mvx;
+            i16_qMvl0_y = ps_curr_pu->s_me_info[PRED_L0].s_mv.i2_mvy;
+            i16_qMvl1_x = ps_curr_pu->s_me_info[PRED_L1].s_mv.i2_mvx;
+            i16_qMvl1_y = ps_curr_pu->s_me_info[PRED_L1].s_mv.i2_mvy;
+
+            /* any mv component differing by 4 or more sets the flag */
+            u4_top_flag =  (ABS((i16_pMvl0_x - i16_qMvl0_x)) >= 4)
+                         | (ABS((i16_pMvl0_y - i16_qMvl0_y)) >= 4)
+                         | (ABS((i16_pMvl1_x - i16_qMvl1_x)) >= 4)
+                         | (ABS((i16_pMvl1_y - i16_qMvl1_y)) >= 4);
+        }
+
+        bs_map = gu4_bs_table[!!u4_top_flag];
         pu4_horz_bs[0] = bs_map[u4_horz_bs_enc & 0xF];
     }
 
-    if (u1_left_intra)
+
+    if (ps_left_pu->b1_intra_flag)
     {
         pu4_vert_bs[0] = 0x04040404;
     }
     else
     {
-        i16_pMv0  = ps_leftMvPred->i2_mvx;
-        i16_pMv1  = ps_leftMvPred->i2_mvy;
+        if (ps_curr_pu->b2_pred_mode != ps_left_pu->b2_pred_mode)
+        {
+            u4_left_flag = 1; /* pred modes differ: no mv comparison needed */
+        }
+        else if(ps_curr_pu->b2_pred_mode != 2)/* Not bipred */
+        {   /* single list: b2_pred_mode doubles as the s_me_info index */
+            i16_pMvl0_x = ps_left_pu->s_me_info[ps_left_pu->b2_pred_mode].s_mv.i2_mvx;
+            i16_pMvl0_y = ps_left_pu->s_me_info[ps_left_pu->b2_pred_mode].s_mv.i2_mvy;
+            /* the Mvl0 temporaries hold the mv of the list b2_pred_mode selects */
+            i16_qMvl0_x = ps_curr_pu->s_me_info[ps_curr_pu->b2_pred_mode].s_mv.i2_mvx;
+            i16_qMvl0_y = ps_curr_pu->s_me_info[ps_curr_pu->b2_pred_mode].s_mv.i2_mvy;
 
 
-        u4_lft_flag = (ABS((i16_pMv0 - i16_qMv0)) >= 4 ) |
-                        (ABS((i16_pMv1 - i16_qMv1)) >= 4);
+            u4_left_flag =  (ABS((i16_pMvl0_x - i16_qMvl0_x)) >= 4)
+                          | (ABS((i16_pMvl0_y - i16_qMvl0_y)) >= 4);
+        }
+        else
+        {
+            /* bipred: compare the L0 and the L1 mv of p (left) against q (curr) */
+            i16_pMvl0_x = ps_left_pu->s_me_info[PRED_L0].s_mv.i2_mvx;
+            i16_pMvl0_y = ps_left_pu->s_me_info[PRED_L0].s_mv.i2_mvy;
+            i16_pMvl1_x = ps_left_pu->s_me_info[PRED_L1].s_mv.i2_mvx;
+            i16_pMvl1_y = ps_left_pu->s_me_info[PRED_L1].s_mv.i2_mvy;
+
+            i16_qMvl0_x = ps_curr_pu->s_me_info[PRED_L0].s_mv.i2_mvx;
+            i16_qMvl0_y = ps_curr_pu->s_me_info[PRED_L0].s_mv.i2_mvy;
+            i16_qMvl1_x = ps_curr_pu->s_me_info[PRED_L1].s_mv.i2_mvx;
+            i16_qMvl1_y = ps_curr_pu->s_me_info[PRED_L1].s_mv.i2_mvy;
+
+            /* any mv component differing by 4 or more sets the flag */
+            u4_left_flag =  (ABS((i16_pMvl0_x - i16_qMvl0_x)) >= 4)
+                          | (ABS((i16_pMvl0_y - i16_qMvl0_y)) >= 4)
+                          | (ABS((i16_pMvl1_x - i16_qMvl1_x)) >= 4)
+                          | (ABS((i16_pMvl1_y - i16_qMvl1_y)) >= 4);
+        }
 
-        bs_map  = gu4_bs_table[!!u4_lft_flag];
+        bs_map = gu4_bs_table[!!u4_left_flag];
         pu4_vert_bs[0] = bs_map[u4_reordered_vert_bs_enc & 0xF];
     }
 }
@@ -331,8 +386,7 @@ static UWORD32 ih264e_calculate_csbp(process_ctxt_t *ps_proc)
 *
 * @returns  none
 *
-* @remarks In this module it is assumed that their is only single reference
-* frame and is always the most recently used anchor frame
+* @remarks
 *
 *******************************************************************************
 */
@@ -394,14 +448,18 @@ void ih264e_compute_bs(process_ctxt_t * ps_proc)
         if (i4_mb_x == 0)
         {
             ps_left_mb_syntax_ele->u4_csbp = 0;
-            ps_left_mb_syntax_ele->u2_is_intra = 0;
-            ps_proc->s_left_mb_pu.s_l0_mv = ps_proc->ps_pu->s_l0_mv;
+            ps_proc->s_left_mb_pu.b1_intra_flag = 0;
+            ps_proc->s_left_mb_pu.b2_pred_mode = ps_proc->ps_pu->b2_pred_mode;
+            ps_proc->s_left_mb_pu.s_me_info[0].s_mv = ps_proc->ps_pu->s_me_info[0].s_mv;
+            ps_proc->s_left_mb_pu.s_me_info[1].s_mv = ps_proc->ps_pu->s_me_info[1].s_mv;
         }
         if (i4_mb_y == 0)
         {
             ps_top_mb_syntax_ele->u4_csbp = 0;
-            ps_top_mb_syntax_ele->u2_is_intra = 0;
-            ps_top_row_pu->s_l0_mv = ps_proc->ps_pu->s_l0_mv;
+            ps_top_row_pu->b1_intra_flag = 0;
+            ps_top_row_pu->b2_pred_mode = ps_proc->ps_pu->b2_pred_mode;
+            ps_top_row_pu->s_me_info[0].s_mv = ps_proc->ps_pu->s_me_info[0].s_mv;
+            ps_top_row_pu->s_me_info[1].s_mv = ps_proc->ps_pu->s_me_info[1].s_mv;
         }
 
         ih264e_fill_bs_1mv_1ref_non_mbaff(pu4_pic_horz_bs,
@@ -409,11 +467,9 @@ void ih264e_compute_bs(process_ctxt_t * ps_proc)
                                           ps_left_mb_syntax_ele->u4_csbp,
                                           ps_top_mb_syntax_ele->u4_csbp,
                                           ps_proc->u4_csbp,
-                                          &ps_proc->s_left_mb_pu.s_l0_mv,
-                                          &ps_top_row_pu->s_l0_mv,
-                                          &ps_proc->ps_pu->s_l0_mv,
-                                          ps_left_mb_syntax_ele->u2_is_intra,
-                                          ps_top_mb_syntax_ele->u2_is_intra);
+                                          &ps_proc->s_left_mb_pu,
+                                          ps_top_row_pu,
+                                          ps_proc->ps_pu);
     }
 
     return ;
diff --git a/encoder/ih264e_defs.h b/encoder/ih264e_defs.h
index 76929ef..aee270e 100644
--- a/encoder/ih264e_defs.h
+++ b/encoder/ih264e_defs.h
@@ -38,6 +38,22 @@
 #define IH264E_DEFS_H_
 
 
+#define PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, u4_nnz, u4_sig_coeff_map, pi2_res_block)   \
+{  /* read one 4x4 coeff record, then advance pv_mb_coeff_data */          \
+    ps_mb_coeff_data = pv_mb_coeff_data;                                   \
+    u4_nnz = ps_mb_coeff_data->i4_sig_map_nnz & 0xff; /* low byte: nnz */  \
+    if (u4_nnz)                                                            \
+    {   /* sig coeff map is read from bits 16 and up */                    \
+        u4_sig_coeff_map = ps_mb_coeff_data->i4_sig_map_nnz >> 16;         \
+        pi2_res_block = ps_mb_coeff_data->ai2_residue;                     \
+        pv_mb_coeff_data = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz); \
+    }                                                                      \
+    else /* nnz == 0: sig map and residue outputs are not written */       \
+    {                                                                      \
+      pv_mb_coeff_data = ps_mb_coeff_data->ai2_residue;                    \
+    }                                                                      \
+}
+
 /*****************************************************************************/
 /* Width and height restrictions                                             */
 /*****************************************************************************/
@@ -104,11 +120,31 @@
 /* Number of frame restrictions                                              */
 /*****************************************************************************/
 /**
+ *  Maximum number of reference pictures
+ */
+#define MAX_REF_PIC_CNT  2
+
+/**
+ *  Minimum number of reference pictures
+ */
+#define MIN_REF_PIC_CNT  1
+
+/**
+ *  Maximum number of B pictures between two I/P pictures
+ */
+#define MAX_NUM_BFRAMES     10
+
+/**
  *  Maximum number of reference buffers in DPB manager
  */
 #define MAX_REF_CNT  32
 
 /*****************************************************************************/
+/* Minimum size of inter prediction unit supported by encoder                */
+/*****************************************************************************/
+#define ENC_MIN_PU_SIZE     16
+
+/*****************************************************************************/
 /* Num cores releated defs                                                   */
 /*****************************************************************************/
 /**
@@ -125,7 +161,7 @@
  * Maximum process context sets
  * Used to stagger encoding of MAX_CTXT_SETS in parallel
  */
-#define MAX_CTXT_SETS   2
+#define MAX_CTXT_SETS   1
 /**
  * Maximum number of contexts
  * Kept as twice the number of threads, to make it easier to initialize the contexts
@@ -165,6 +201,7 @@
 #define DEFAULT_RC                      IVE_RC_STORAGE
 #define DEFAULT_MAX_FRAMERATE           120000
 #define DEFAULT_MAX_BITRATE             20000000
+#define DEFAULT_MAX_NUM_BFRAMES         0
 #define DEFAULT_MAX_SRCH_RANGE_X        256
 #define DEFAULT_MAX_SRCH_RANGE_Y        256
 #define DEFAULT_SLICE_PARAM             256
@@ -206,6 +243,7 @@
 #define DEFAULT_ENC_SPEED_PRESET        IVE_USER_DEFINED
 #define DEFAULT_PRE_ENC_ME              0
 #define DEFAULT_PRE_ENC_IPE             0
+#define DEFAULT_ENTROPY_CODING_MODE     0
 
 /** Maximum number of entries in input buffer list */
 #define MAX_INP_BUF_LIST_ENTRIES         32
@@ -217,7 +255,10 @@
 #define MAX_REC_LIST_ENTRIES             16
 
 /** Number of buffers created to hold half-pel planes for every reference buffer */
-    #define HPEL_PLANES_CNT                 1
+#define HPEL_PLANES_CNT                 1
+
+/** Number of buffers Needed for SUBPEL and BIPRED computation */
+#define SUBPEL_BUFF_CNT                 4
 
 /**
  *****************************************************************************
@@ -262,6 +303,16 @@ enum
     MEM_REC_CODEC,
 
     /**
+     * Cabac context
+     */
+    MEM_REC_CABAC,
+
+    /**
+     * Cabac context_mb_info
+     */
+    MEM_REC_CABAC_MB_INFO,
+
+    /**
      * entropy context
      */
     MEM_REC_ENTROPY,
@@ -483,8 +534,6 @@ enum
 #define MIN_RAW_BUFS_RGBA8888_COMP   1
 #define MIN_RAW_BUFS_420SP_COMP      2
 
-#define MAX_NMB 120
-
 /** Maximum number of active config paramter sets */
 #define MAX_ACTIVE_CONFIG_PARAMS 32
 
@@ -525,9 +574,9 @@ enum
 /* [0 - 00 - 00110] */
 #define NAL_SEI_FIRST_BYTE 0x06
 
-#define H264_ALLOC_INTER_FRM_INTV        1
+#define H264_ALLOC_INTER_FRM_INTV        2
 
-#define H264_MPEG_QP_MAP    191
+#define H264_MPEG_QP_MAP    255
 
 #define MPEG2_QP_ELEM       (H264_MPEG_QP_MAP + 1)
 #define H264_QP_ELEM        (MAX_H264_QP + 1)
diff --git a/encoder/ih264e_encode.c b/encoder/ih264e_encode.c
index ffc6fb7..c027321 100644
--- a/encoder/ih264e_encode.c
+++ b/encoder/ih264e_encode.c
@@ -48,7 +48,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
-
+#include <limits.h>
 /* User Include files */
 #include "ih264e_config.h"
 #include "ih264_typedefs.h"
@@ -63,26 +63,25 @@
 #include "ih264_platform_macros.h"
 #include "ih264_error.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
-#include "ih264_error.h"
-#include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
 #include "ih264_inter_pred_filters.h"
 #include "ih264_mem_fns.h"
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "ih264_list.h"
 #include "ih264e_error.h"
 #include "ih264e_defs.h"
-#include "ih264_padding.h"
 #include "ih264e_bitstream.h"
 #include "irc_mem_req_and_acq.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
 #include "ih264e_time_stamp.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_master.h"
 #include "ih264e_process.h"
@@ -90,7 +89,6 @@
 #include "ih264_dpb_mgr.h"
 #include "ih264e_utils.h"
 #include "ih264e_fmt_conv.h"
-#include "ih264e_config.h"
 #include "ih264e_statistics.h"
 #include "ih264e_trace.h"
 #include "ih264e_debug.h"
@@ -217,7 +215,7 @@ WORD32 ih264e_encode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
     out_buf_t s_out_buf;
 
     /* temp var */
-    WORD32 ctxt_sel = 0, i;
+    WORD32 ctxt_sel = 0, i, i4_rc_pre_enc_skip;
 
     /********************************************************************/
     /*                            BEGIN INIT                            */
@@ -228,30 +226,27 @@ WORD32 ih264e_encode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
     ps_video_encode_op->s_ive_op.dump_recon = 0;
     ps_video_encode_op->s_ive_op.u4_encoded_frame_type = IV_NA_FRAME;
 
-    /* copy input info. to internal structure */
-    s_inp_buf.s_raw_buf = ps_video_encode_ip->s_ive_ip.s_inp_buf;
-    s_inp_buf.u4_timestamp_low = ps_video_encode_ip->s_ive_ip.u4_timestamp_low;
-    s_inp_buf.u4_timestamp_high = ps_video_encode_ip->s_ive_ip.u4_timestamp_high;
-    s_inp_buf.u4_is_last = ps_video_encode_ip->s_ive_ip.u4_is_last;
-    s_inp_buf.pv_mb_info = ps_video_encode_ip->s_ive_ip.pv_mb_info;
-    s_inp_buf.u4_mb_info_type = ps_video_encode_ip->s_ive_ip.u4_mb_info_type;
-    s_inp_buf.pv_pic_info = ps_video_encode_ip->s_ive_ip.pv_pic_info;
-    s_inp_buf.u4_pic_info_type = ps_video_encode_ip->s_ive_ip.u4_pic_info_type;
+    /* Check for output memory allocation size */
+    if (ps_video_encode_ip->s_ive_ip.s_out_buf.u4_bufsize < MIN_STREAM_SIZE)
+    {
+        error_status |= IH264E_INSUFFICIENT_OUTPUT_BUFFER;
+        SET_ERROR_ON_RETURN(error_status,
+                            IVE_UNSUPPORTEDPARAM,
+                            ps_video_encode_op->s_ive_op.u4_error_code,
+                            IV_FAIL);
+    }
 
     /* copy output info. to internal structure */
     s_out_buf.s_bits_buf = ps_video_encode_ip->s_ive_ip.s_out_buf;
-    s_out_buf.u4_is_last = ps_video_encode_ip->s_ive_ip.u4_is_last;
+    s_out_buf.u4_is_last = 0;
     s_out_buf.u4_timestamp_low = ps_video_encode_ip->s_ive_ip.u4_timestamp_low;
     s_out_buf.u4_timestamp_high = ps_video_encode_ip->s_ive_ip.u4_timestamp_high;
 
     /* api call cnt */
     ps_codec->i4_encode_api_call_cnt += 1;
 
-    /* curr pic cnt */
-    ps_codec->i4_pic_cnt += 1;
-
     /* codec context selector */
-    ctxt_sel = ps_codec->i4_encode_api_call_cnt & 1;
+    ctxt_sel = ps_codec->i4_encode_api_call_cnt % MAX_CTXT_SETS;
 
     /* reset status flags */
     ps_codec->ai4_pic_cnt[ctxt_sel] = -1;
@@ -274,8 +269,8 @@ WORD32 ih264e_encode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
 
         if (1 == ps_cfg->u4_is_valid)
         {
-            if ( ((ps_cfg->u4_timestamp_high == s_inp_buf.u4_timestamp_high) &&
-                            (ps_cfg->u4_timestamp_low == s_inp_buf.u4_timestamp_low)) ||
+            if ( ((ps_cfg->u4_timestamp_high == ps_video_encode_ip->s_ive_ip.u4_timestamp_high) &&
+                            (ps_cfg->u4_timestamp_low == ps_video_encode_ip->s_ive_ip.u4_timestamp_low)) ||
                             ((WORD32)ps_cfg->u4_timestamp_high == -1) ||
                             ((WORD32)ps_cfg->u4_timestamp_low == -1) )
             {
@@ -309,6 +304,12 @@ WORD32 ih264e_encode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
     }
 #endif /*LOGO_EN*/
 
+    /* In case of alt ref and B pics we will have non reference frame in stream */
+    if (ps_codec->s_cfg.u4_enable_alt_ref || ps_codec->s_cfg.u4_num_bframes)
+    {
+        ps_codec->i4_non_ref_frames_in_stream = 1;
+    }
+
     if (ps_codec->i4_encode_api_call_cnt == 0)
     {
         /********************************************************************/
@@ -355,11 +356,9 @@ WORD32 ih264e_encode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
         /* api call cnt */
         ps_codec->i4_encode_api_call_cnt --;
 
-        /* curr pic cnt */
-        ps_codec->i4_pic_cnt --;
-
         /* header mode tag is not sticky */
         ps_codec->i4_header_mode = 0;
+        ps_codec->i4_gen_header = 0;
 
         /* send the input to app */
         ps_video_encode_op->s_ive_op.s_inp_buf = s_inp_buf.s_raw_buf;
@@ -381,8 +380,18 @@ WORD32 ih264e_encode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
         return IV_SUCCESS;
     }
 
+    /* curr pic cnt */
+     ps_codec->i4_pic_cnt += 1;
+
+    i4_rc_pre_enc_skip = 0;
+    i4_rc_pre_enc_skip = ih264e_input_queue_update(
+                    ps_codec, &ps_video_encode_ip->s_ive_ip, &s_inp_buf);
+
+    s_out_buf.u4_is_last = s_inp_buf.u4_is_last;
+    ps_video_encode_op->s_ive_op.u4_is_last = s_inp_buf.u4_is_last;
 
-    if (s_inp_buf.s_raw_buf.apv_bufs[0] != NULL)
+    /* Only encode if the current frame is not pre-encode skip */
+    if (!i4_rc_pre_enc_skip && s_inp_buf.s_raw_buf.apv_bufs[0])
     {
         /* array giving pic cnt that is being processed in curr context set */
         ps_codec->ai4_pic_cnt[ctxt_sel] = ps_codec->i4_pic_cnt;
@@ -394,172 +403,282 @@ WORD32 ih264e_encode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
                             ps_video_encode_op->s_ive_op.u4_error_code,
                             IV_FAIL);
 
-        if (ps_codec->s_rate_control.pre_encode_skip[ctxt_sel] == 0)
-        {
-            /* proc ctxt base idx */
-            WORD32 proc_ctxt_select = ctxt_sel * MAX_PROCESS_THREADS;
+        /* proc ctxt base idx */
+        WORD32 proc_ctxt_select = ctxt_sel * MAX_PROCESS_THREADS;
 
-            /* proc ctxt */
-            process_ctxt_t *ps_proc = &ps_codec->as_process[proc_ctxt_select];
+        /* proc ctxt */
+        process_ctxt_t *ps_proc = &ps_codec->as_process[proc_ctxt_select];
 
-            WORD32 ret = 0;
+        WORD32 ret = 0;
 
-            /* number of addl. threads to be created */
-            WORD32 num_thread_cnt = ps_codec->s_cfg.u4_num_cores - 1;
+        /* number of addl. threads to be created */
+        WORD32 num_thread_cnt = ps_codec->s_cfg.u4_num_cores - 1;
 
-            for (i = 0; i < num_thread_cnt; i++)
+        for (i = 0; i < num_thread_cnt; i++)
+        {
+            ret = ithread_create(ps_codec->apv_proc_thread_handle[i],
+                                 NULL,
+                                 (void *)ih264e_process_thread,
+                                 &ps_codec->as_process[i + 1]);
+            if (ret != 0)
             {
-                ret = ithread_create(ps_codec->apv_proc_thread_handle[i],
-                                     NULL,
-                                     (void*)ih264e_process_thread,
-                                     &ps_codec->as_process[i + 1]);
-                if (ret != 0)
-                {
-                    printf("pthread Create Failed");
-                    assert(0);
-                }
+                printf("pthread Create Failed");
+                assert(0);
+            }
 
-                ps_codec->ai4_process_thread_created[i] = 1;
+            ps_codec->ai4_process_thread_created[i] = 1;
 
-                ps_codec->i4_proc_thread_cnt++;
-            }
+            ps_codec->i4_proc_thread_cnt++;
+        }
 
 
-            /* launch job */
-            ih264e_process_thread(ps_proc);
+        /* launch job */
+        ih264e_process_thread(ps_proc);
 
-            /* Join threads at the end of encoding a frame */
-            ih264e_join_threads(ps_codec);
+        /* Join threads at the end of encoding a frame */
+        ih264e_join_threads(ps_codec);
 
-            ih264_list_reset(ps_codec->pv_proc_jobq);
+        ih264_list_reset(ps_codec->pv_proc_jobq);
 
-            ih264_list_reset(ps_codec->pv_entropy_jobq);
-        }
+        ih264_list_reset(ps_codec->pv_entropy_jobq);
     }
 
-    if (-1 != ps_codec->ai4_pic_cnt[ctxt_sel])
-    {
-        /* proc ctxt base idx */
-        WORD32 proc_ctxt_select = ctxt_sel * MAX_PROCESS_THREADS;
 
-        /* proc ctxt */
-        process_ctxt_t *ps_proc = &ps_codec->as_process[proc_ctxt_select];
+   /****************************************************************************
+   * RECON
+   *    Since we have forward dependent frames, we cannot return recon in encoding
+   *    order. It must be in poc order, or input pic order. To achieve this we
+   *    introduce a delay of 1 to the recon wrt encode. Now since we have that
+   *    delay, at any point minimum of pic_cnt in our ref buffer will be the
+   *    correct frame. For ex let our GOP be IBBP [1 2 3 4] . The encode order
+   *    will be [1 4 2 3] .Now since we have a delay of 1, when we are done with
+   *    encoding 4, the min in the list will be 1. After encoding 2, it will be
+   *    2, 3 after 3 and 4 after 4. Hence we can return in sequence. Note
+   *    that the 1 delay is critical. Hence if we have post enc skip, we must
+   *    skip here too. Note that since post enc skip already frees the recon
+   *    buffer we need not do any thing here
+   *
+   *    We need to return a recon whenever we consume an input buffer. This
+   *    consumption includes a pre or post enc skip. Thus dump recon is set for
+   *    all cases except when
+   *    1) We are still waiting, i.e. ps_codec->i4_frame_num > 1 does not hold
+   *    2) The input buffer is null [i.e. we are not consuming any input]
+   *        An exception needs to be made for the case when we have the last buffer
+   *        since we need to flush out the remaining recon.
+   ****************************************************************************/
 
-        /* receive output back from codec */
-        s_out_buf = ps_codec->as_out_buf[ctxt_sel];
+    ps_video_encode_op->s_ive_op.dump_recon = 0;
 
-        /* send the output to app */
-        ps_video_encode_op->s_ive_op.output_present  = 1;
-        ps_video_encode_op->s_ive_op.dump_recon = 1;
-        ps_video_encode_op->s_ive_op.s_out_buf = s_out_buf.s_bits_buf;
-        ps_video_encode_op->s_ive_op.u4_error_code = IV_SUCCESS;
+    if (ps_codec->s_cfg.u4_enable_recon && (ps_codec->i4_frame_num > 1)
+                    && (s_inp_buf.s_raw_buf.apv_bufs[0] || s_inp_buf.u4_is_last))
+    {
+        /* error status */
+        IH264_ERROR_T ret = IH264_SUCCESS;
+        pic_buf_t *ps_pic_buf = NULL;
+        WORD32 i4_buf_status, i4_curr_poc = 32768;
 
-        /* receive input back from codec */
-        s_inp_buf = ps_proc->s_inp_buf;
+        /* In case of skips we return recon, but indicate that buffer is zero size */
+        if (ps_codec->s_rate_control.post_encode_skip[ctxt_sel]
+                        || i4_rc_pre_enc_skip)
+        {
 
-        /* send the input to app */
-        ps_video_encode_op->s_ive_op.s_inp_buf = s_inp_buf.s_raw_buf;
+            ps_video_encode_op->s_ive_op.dump_recon = 1;
+            ps_video_encode_op->s_ive_op.s_recon_buf.au4_wd[0] = 0;
+            ps_video_encode_op->s_ive_op.s_recon_buf.au4_wd[1] = 0;
 
-        if (ps_codec->s_cfg.u4_enable_recon &&
-                        ps_codec->s_rate_control.pre_encode_skip[ctxt_sel] == 0)
+        }
+        else
         {
-            /* error status */
-            IH264_ERROR_T ret = IH264_SUCCESS;
-
-            /* recon buffer */
-            rec_buf_t *ps_rec_buf = &ps_codec->as_rec_buf[ctxt_sel];
-
-            ps_video_encode_op->s_ive_op.s_recon_buf = ps_video_encode_ip->s_ive_ip.s_recon_buf;
-
-            /* copy/convert the recon buffer and return */
-            ih264e_fmt_conv(ps_codec, &ps_rec_buf->s_pic_buf,
-                            ps_video_encode_ip->s_ive_ip.s_recon_buf.apv_bufs[0],
-                            ps_video_encode_ip->s_ive_ip.s_recon_buf.apv_bufs[1],
-                            ps_video_encode_ip->s_ive_ip.s_recon_buf.apv_bufs[2],
-                            ps_video_encode_ip->s_ive_ip.s_recon_buf.au4_wd[0],
-                            ps_video_encode_ip->s_ive_ip.s_recon_buf.au4_wd[1],
-                            0,
-                            ps_codec->s_cfg.u4_disp_ht);
-
-            ret = ih264_buf_mgr_release(ps_codec->pv_ref_buf_mgr, ps_rec_buf->s_pic_buf.i4_buf_id, BUF_MGR_IO);
-            if (IH264_SUCCESS != ret)
+            for (i = 0; i < ps_codec->i4_ref_buf_cnt; i++)
             {
-                SET_ERROR_ON_RETURN((IH264E_ERROR_T)ret,
-                                    IVE_FATALERROR,
+                if (ps_codec->as_ref_set[i].i4_pic_cnt == -1)
+                    continue;
+
+                i4_buf_status = ih264_buf_mgr_get_status(
+                                ps_codec->pv_ref_buf_mgr,
+                                ps_codec->as_ref_set[i].ps_pic_buf->i4_buf_id);
+
+                if ((i4_buf_status & BUF_MGR_IO)
+                                && (ps_codec->as_ref_set[i].i4_poc < i4_curr_poc))
+                {
+                    ps_pic_buf = ps_codec->as_ref_set[i].ps_pic_buf;
+                    i4_curr_poc = ps_codec->as_ref_set[i].i4_poc;
+                }
+            }
+
+            ps_video_encode_op->s_ive_op.s_recon_buf =
+                            ps_video_encode_ip->s_ive_ip.s_recon_buf;
+
+            /*
+             * If we get a valid buffer, output and free recon.
+             *
+             * We may get an invalid buffer if num_b_frames is 0. This is because
+             * we assume that there will be a ref frame in ref list after encoding
+             * the last frame. With B frames this is correct since its forward ref
+             * pic will be in the ref list. But if num_b_frames is 0, we will not
+             * have a forward ref pic.
+             */
+
+            if (ps_pic_buf)
+            {
+                /* copy/convert the recon buffer and return */
+                ih264e_fmt_conv(ps_codec,
+                                ps_pic_buf,
+                                ps_video_encode_ip->s_ive_ip.s_recon_buf.apv_bufs[0],
+                                ps_video_encode_ip->s_ive_ip.s_recon_buf.apv_bufs[1],
+                                ps_video_encode_ip->s_ive_ip.s_recon_buf.apv_bufs[2],
+                                ps_video_encode_ip->s_ive_ip.s_recon_buf.au4_wd[0],
+                                ps_video_encode_ip->s_ive_ip.s_recon_buf.au4_wd[1],
+                                0, ps_codec->s_cfg.u4_disp_ht);
+
+                ps_video_encode_op->s_ive_op.dump_recon = 1;
+
+                ret = ih264_buf_mgr_release(ps_codec->pv_ref_buf_mgr,
+                                            ps_pic_buf->i4_buf_id, BUF_MGR_IO);
+
+                if (IH264_SUCCESS != ret)
+                {
+                    SET_ERROR_ON_RETURN(
+                                    (IH264E_ERROR_T)ret, IVE_FATALERROR,
                                     ps_video_encode_op->s_ive_op.u4_error_code,
                                     IV_FAIL);
+                }
             }
         }
+    }
 
-        /* release buffers from ref list */
-        if (ps_codec->s_rate_control.post_encode_skip[ctxt_sel] == 1)
-        {
-            /* pic info */
-            pic_buf_t *ps_cur_pic;
 
-            /* mv info */
-            mv_buf_t *ps_cur_mv_buf;
+    /***************************************************************************
+     * Free reference buffers:
+     * In case of a post enc skip, we have to ensure that those pics will not
+     * be used as reference anymore. In all other cases we will not even mark
+     * the ref buffers
+     ***************************************************************************/
+    if (ps_codec->s_rate_control.post_encode_skip[ctxt_sel])
+    {
+        /* pic info */
+        pic_buf_t *ps_cur_pic;
+
+        /* mv info */
+        mv_buf_t *ps_cur_mv_buf;
 
-            /* error status */
-            IH264_ERROR_T ret = IH264_SUCCESS;
+        /* error status */
+        IH264_ERROR_T ret = IH264_SUCCESS;
 
-            /* Decrement coded pic count */
-            ps_codec->i4_coded_pic_cnt--;
+        /* Decrement POC, since the skipped picture is not coded */
+        ps_codec->i4_poc--;
 
-            /* loop through to get the min pic cnt among the list of pics stored in ref list */
-            /* since the skipped frame may not be on reference list, we may not have an MV bank
-             * hence free only if we have allocated */
-            for (i = 0; i < ps_codec->i4_ref_buf_cnt; i++)
+        /* loop through to get the min pic cnt among the list of pics stored in ref list */
+        /* since the skipped frame may not be on reference list, we may not have an MV bank
+         * hence free only if we have allocated */
+        for (i = 0; i < ps_codec->i4_ref_buf_cnt; i++)
+        {
+            if (ps_codec->i4_pic_cnt == ps_codec->as_ref_set[i].i4_pic_cnt)
             {
-                if (ps_codec->i4_pic_cnt == ps_codec->as_ref_set[i].i4_pic_cnt)
-                {
-                    ps_codec->as_ref_set[i].i4_pic_cnt = -1;
-                    ps_codec->as_ref_set[i].i4_poc = -1;
-
-                    ps_cur_pic = ps_codec->as_ref_set[i].ps_pic_buf;
-
-                    ps_cur_mv_buf = ps_codec->as_ref_set[i].ps_mv_buf;
-
-                    /* release this frame from reference list */
-                    ret = ih264_buf_mgr_release(ps_codec->pv_mv_buf_mgr, ps_cur_mv_buf->i4_buf_id , BUF_MGR_REF);
-                    SET_ERROR_ON_RETURN((IH264E_ERROR_T)ret,
-                                        IVE_FATALERROR,
-                                        ps_video_encode_op->s_ive_op.u4_error_code,
-                                        IV_FAIL);
-
-                    ret = ih264_buf_mgr_release(ps_codec->pv_ref_buf_mgr, ps_cur_pic->i4_buf_id , BUF_MGR_REF);
-                    SET_ERROR_ON_RETURN((IH264E_ERROR_T)ret,
-                                        IVE_FATALERROR,
-                                        ps_video_encode_op->s_ive_op.u4_error_code,
-                                        IV_FAIL);
-                    break;
-                }
+
+                ps_cur_pic = ps_codec->as_ref_set[i].ps_pic_buf;
+
+                ps_cur_mv_buf = ps_codec->as_ref_set[i].ps_mv_buf;
+
+                /* release this frame from reference list and recon list */
+                ret = ih264_buf_mgr_release(ps_codec->pv_mv_buf_mgr, ps_cur_mv_buf->i4_buf_id , BUF_MGR_REF);
+                ret |= ih264_buf_mgr_release(ps_codec->pv_mv_buf_mgr, ps_cur_mv_buf->i4_buf_id , BUF_MGR_IO);
+                SET_ERROR_ON_RETURN((IH264E_ERROR_T)ret,
+                                    IVE_FATALERROR,
+                                    ps_video_encode_op->s_ive_op.u4_error_code,
+                                    IV_FAIL);
+
+                ret = ih264_buf_mgr_release(ps_codec->pv_ref_buf_mgr, ps_cur_pic->i4_buf_id , BUF_MGR_REF);
+                ret |= ih264_buf_mgr_release(ps_codec->pv_ref_buf_mgr, ps_cur_pic->i4_buf_id , BUF_MGR_IO);
+                SET_ERROR_ON_RETURN((IH264E_ERROR_T)ret,
+                                    IVE_FATALERROR,
+                                    ps_video_encode_op->s_ive_op.u4_error_code,
+                                    IV_FAIL);
+                break;
             }
         }
+    }
+
+    /*
+     * Since recon is not in sync with output, i.e. there can be a frame to be
+     * given back as recon even after the last output, we need to mark that
+     * the output is not the last.
+     * Hence search through the ref list and mark appropriately
+     */
+    if (ps_codec->s_cfg.u4_enable_recon)
+    {
+        WORD32 i4_buf_status = 0;
 
-        if ((ps_codec->s_rate_control.post_encode_skip[ctxt_sel] == 1) ||
-                        (ps_codec->s_rate_control.pre_encode_skip[ctxt_sel] == 1))
+        for (i = 0; i < ps_codec->i4_ref_buf_cnt; i++)
         {
-            ps_video_encode_op->s_ive_op.dump_recon = 0;
+            if (ps_codec->as_ref_set[i].i4_pic_cnt == -1)
+                continue;
+
+            i4_buf_status |= ih264_buf_mgr_get_status(
+                            ps_codec->pv_ref_buf_mgr,
+                            ps_codec->as_ref_set[i].ps_pic_buf->i4_buf_id);
         }
-        else
+
+        if (i4_buf_status & BUF_MGR_IO)
         {
-            /* set output pic type */
-            if (ps_codec->i4_slice_type == PSLICE)
-            {
-                ps_video_encode_op->s_ive_op.u4_encoded_frame_type = IV_P_FRAME;
-            }
-            else if (ps_codec->i4_slice_type == ISLICE && ps_codec->u4_is_idr != 1)
-            {
+            s_out_buf.u4_is_last = 0;
+            ps_video_encode_op->s_ive_op.u4_is_last = 0;
+        }
+    }
+
+
+    /**************************************************************************
+     * Signaling to APP
+     *  1) If we have a valid output, mark it so
+     *  2) Set the codec output ps_video_encode_op
+     *  3) Set the error status
+     *  4) Set the return Pic type
+     *      Note that we have already marked recon properly
+     *  5)Send the consumed input back to app so that it can free it if possible
+     *
+     *  We will have to return the output and input buffers unconditionally
+     *  so that app can release them
+     **************************************************************************/
+    if (!i4_rc_pre_enc_skip
+                    && !ps_codec->s_rate_control.post_encode_skip[ctxt_sel]
+                    && s_inp_buf.s_raw_buf.apv_bufs[0])
+    {
+
+        /* receive output back from codec */
+        s_out_buf = ps_codec->as_out_buf[ctxt_sel];
+
+        /* send the output to app */
+        ps_video_encode_op->s_ive_op.output_present  = 1;
+        ps_video_encode_op->s_ive_op.u4_error_code = IV_SUCCESS;
+
+        /* Set the time stamps of the encoded input */
+        ps_video_encode_op->s_ive_op.u4_timestamp_low = s_inp_buf.u4_timestamp_low;
+        ps_video_encode_op->s_ive_op.u4_timestamp_high = s_inp_buf.u4_timestamp_high;
+
+
+        switch (ps_codec->pic_type)
+        {
+            case PIC_IDR:
+                ps_video_encode_op->s_ive_op.u4_encoded_frame_type =IV_IDR_FRAME;
+                break;
+
+            case PIC_I:
                 ps_video_encode_op->s_ive_op.u4_encoded_frame_type = IV_I_FRAME;
-            }
-            else
-            {
-                ps_video_encode_op->s_ive_op.u4_encoded_frame_type = IV_IDR_FRAME;
-            }
+                break;
+
+            case PIC_P:
+                ps_video_encode_op->s_ive_op.u4_encoded_frame_type = IV_P_FRAME;
+                break;
+
+            case PIC_B:
+                ps_video_encode_op->s_ive_op.u4_encoded_frame_type = IV_B_FRAME;
+                break;
+
+            default:
+                ps_video_encode_op->s_ive_op.u4_encoded_frame_type = IV_NA_FRAME;
+                break;
         }
 
-        /* loop through to get the error status */
         for (i = 0; i < (WORD32)ps_codec->s_cfg.u4_num_cores; i++)
         {
             error_status |= ps_codec->as_process[ctxt_sel + i].i4_error_code;
@@ -569,6 +688,36 @@ WORD32 ih264e_encode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
                             ps_video_encode_op->s_ive_op.u4_error_code,
                             IV_FAIL);
     }
+    else
+    {
+        /* proc ctxt base idx */
+        WORD32 proc_ctxt_select = ctxt_sel * MAX_PROCESS_THREADS;
+
+        /* proc ctxt */
+        process_ctxt_t *ps_proc = &ps_codec->as_process[proc_ctxt_select];
+
+        /* receive output back from codec */
+        s_out_buf = ps_codec->as_out_buf[ctxt_sel];
+
+        ps_video_encode_op->s_ive_op.output_present = 0;
+        ps_video_encode_op->s_ive_op.u4_error_code = IV_SUCCESS;
+
+        /* No encoded output: clear the time stamps */
+        ps_video_encode_op->s_ive_op.u4_timestamp_low = 0;
+        ps_video_encode_op->s_ive_op.u4_timestamp_high = 0;
+
+        /* receive input back from codec and send it to app */
+        s_inp_buf = ps_proc->s_inp_buf;
+        ps_video_encode_op->s_ive_op.s_inp_buf = s_inp_buf.s_raw_buf;
+
+        ps_video_encode_op->s_ive_op.u4_encoded_frame_type =  IV_NA_FRAME;
+
+    }
+
+    /* Return the output and input buffers to the app so that it can free them if possible */
+    ps_video_encode_op->s_ive_op.s_out_buf = s_out_buf.s_bits_buf;
+    ps_video_encode_op->s_ive_op.s_inp_buf = s_inp_buf.s_raw_buf;
+
 
     if (1 == s_inp_buf.u4_is_last)
     {
diff --git a/encoder/ih264e_encode_header.c b/encoder/ih264e_encode_header.c
index 67e5409..cc81e1b 100644
--- a/encoder/ih264e_encode_header.c
+++ b/encoder/ih264e_encode_header.c
@@ -65,14 +65,13 @@
 #include "ithread.h"
 #include "ih264e_config.h"
 #include "ih264e_trace.h"
-#include "ih264_typedefs.h"
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ih264_debug.h"
 #include "ih264_defs.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -81,14 +80,17 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "ih264e_defs.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_encode_header.h"
 #include "ih264_common_tables.h"
 #include "ih264_macros.h"
+#include "ih264e_utils.h"
 
 
 /*****************************************************************************/
@@ -523,11 +525,12 @@ WORD32 ih264e_generate_slice_header(bitstrm_t *ps_bitstrm,
         {
             /* num_ref_idx_l0_active_minus1 */
             PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->i1_num_ref_idx_l0_active - 1, return_status, "num_ref_idx_l0_active_minus1");
-        }
-        if (ps_slice_hdr->u1_slice_type == BSLICE)
-        {
-            /* num_ref_idx_l1_active_minus1 */
-            PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->i1_num_ref_idx_l1_active - 1, return_status, "num_ref_idx_l1_active_minus1");
+
+            if (ps_slice_hdr->u1_slice_type == BSLICE)
+            {
+                /* num_ref_idx_l1_active_minus1 */
+                PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->i1_num_ref_idx_l1_active - 1, return_status, "num_ref_idx_l1_active_minus1");
+            }
         }
     }
 
@@ -544,9 +547,20 @@ WORD32 ih264e_generate_slice_header(bitstrm_t *ps_bitstrm,
         }
     }
 
+    if (ps_slice_hdr->u1_slice_type == BSLICE)
+    {
+        /* ref_pic_list_reordering_flag_l1 */
+        PUT_BITS(ps_bitstrm, ps_slice_hdr->u1_ref_idx_reordering_flag_l1, 1, return_status, "ref_pic_list_reordering_flag_l1");
+
+        if (ps_slice_hdr->u1_ref_idx_reordering_flag_l1)
+        {
+
+        }
+    }
+
     if ((ps_pps->i1_weighted_pred_flag &&
                     (ps_slice_hdr->u1_slice_type == PSLICE || ps_slice_hdr->u1_slice_type == SPSLICE)) ||
-                    (ps_slice_hdr->u1_weighted_bipred_idc == 1 && ps_slice_hdr->u1_slice_type == BSLICE))
+                    (ps_slice_hdr->u1_slice_type == BSLICE && ps_pps->i1_weighted_bipred_idc == 1))
     {
         /* TODO_LATER: Currently there is no support for weighted prediction.
          This needs to be updated when the support is added */
@@ -662,8 +676,8 @@ IH264E_ERROR_T ih264e_populate_sps(codec_t *ps_codec, sps_t *ps_sps)
      * To the constrained baseline profile if we add support for B slices, support for encoding interlaced frames,
      * support for weighted prediction and introduce CABAC entropy coding then we have Main Profile.
      */
-    if ((ps_cfg->u4_num_b_frames) || (ps_cfg->e_content_type != IV_PROGRESSIVE) ||
-         (ps_cfg->u4_entropy_coding_mode == CABAC) || (ps_cfg->u4_weighted_prediction))
+    if ((ps_cfg->u4_num_bframes) || (ps_cfg->e_content_type != IV_PROGRESSIVE) ||
+        (ps_cfg->u4_entropy_coding_mode == CABAC) || (ps_cfg->u4_weighted_prediction))
     {
         ps_sps->u1_profile_idc = IH264_PROFILE_MAIN;
     }
@@ -673,17 +687,8 @@ IH264E_ERROR_T ih264e_populate_sps(codec_t *ps_codec, sps_t *ps_sps)
     }
 
     /* level */
-    ps_sps->u1_level_idc = ps_cfg->u4_max_level;
-//    i4_err_code = ih264e_get_level(ps_cfg, &level_idc);
-//    if (i4_err_code == IH264E_SUCCESS)
-//    {
-//        ps_sps->u1_level_idc = level_idc;
-//
-//    }
-//    else
-//    {
-//        return i4_err_code;
-//    }
+    ps_sps->u1_level_idc = MAX(ps_cfg->u4_max_level,
+                               (UWORD32)ih264e_get_min_level(ps_cfg->u4_max_wd, ps_cfg->u4_max_ht));
 
     /* constrained flags */
     /*
@@ -748,8 +753,10 @@ IH264E_ERROR_T ih264e_populate_sps(codec_t *ps_codec, sps_t *ps_sps)
     /* pic_order_cnt_type */
     ps_sps->i1_pic_order_cnt_type = 2;
 
-    if(ps_cfg->u4_enable_alt_ref)
+    if (ps_codec->i4_non_ref_frames_in_stream)
+    {
         ps_sps->i1_pic_order_cnt_type = 0;
+    }
 
     /* log2_max_pic_order_cnt_lsb_minus4 */
     ps_sps->i1_log2_max_pic_order_cnt_lsb = 8;
@@ -765,8 +772,15 @@ IH264E_ERROR_T ih264e_populate_sps(codec_t *ps_codec, sps_t *ps_sps)
     }
 
     /* num_ref_frames */
-    /* FIXME : Fix this hard coding */
-    ps_sps->u1_max_num_ref_frames = 1;
+    /* TODO : Should we have a flexible num ref frames */
+    if (ps_codec->s_cfg.u4_num_bframes > 0)
+    {
+        ps_sps->u1_max_num_ref_frames = 2;
+    }
+    else
+    {
+        ps_sps->u1_max_num_ref_frames = 1;
+    }
 
     /* gaps_in_frame_num_value_allowed_flag */
     ps_sps->i1_gaps_in_frame_num_value_allowed_flag = 0;
@@ -852,7 +866,7 @@ IH264E_ERROR_T ih264e_populate_pps(codec_t *ps_codec, pps_t *ps_pps)
     /* entropy_coding_mode */
     ps_pps->u1_entropy_coding_mode_flag = ps_cfg->u4_entropy_coding_mode;
 
-    /* pic_order_present_flag is unset for POC type 2 */
+    /* pic_order_present_flag is unset if we don't have fields */
     ps_pps->u1_pic_order_present_flag = 0;
 
     /* Currently number of slice groups supported are 1 */
@@ -980,18 +994,17 @@ WORD32 ih264e_populate_slice_header(process_ctxt_t *ps_proc,
     if (ps_sps->i1_pic_order_cnt_type == 0)
     {
 
-        WORD32 val;
-        val = ps_codec->i4_coded_pic_cnt;
-        val %= (1 << ps_sps->i1_log2_max_pic_order_cnt_lsb);
-        ps_slice_hdr->i4_pic_order_cnt_lsb = val;
+        WORD32 i4_poc;
+        i4_poc = ps_codec->i4_poc;
+        i4_poc %= (1 << ps_sps->i1_log2_max_pic_order_cnt_lsb);
+        ps_slice_hdr->i4_pic_order_cnt_lsb = i4_poc;
     }
+    /* TODO add support for poc type 1 */
     else if (ps_sps->i1_pic_order_cnt_type == 1)
     {
 
     }
 
-    if(0 == ps_slice_hdr->u2_first_mb_in_slice)
-        ps_codec->i4_coded_pic_cnt++;
 
     /*
      * redundant slices are not currently supported.
@@ -1005,7 +1018,7 @@ WORD32 ih264e_populate_slice_header(process_ctxt_t *ps_proc,
     /* direct spatial mv pred flag */
     if (ps_proc->i4_slice_type == BSLICE)
     {
-
+        ps_slice_hdr->u1_direct_spatial_mv_pred_flag = 1;
     }
 
     if (ps_proc->i4_slice_type == PSLICE || ps_proc->i4_slice_type == SPSLICE || ps_proc->i4_slice_type == BSLICE)
@@ -1036,11 +1049,23 @@ WORD32 ih264e_populate_slice_header(process_ctxt_t *ps_proc,
         {
 
         }
+
+        /* ref_pic_list_reordering_flag_l1 */
+        ps_slice_hdr->u1_ref_idx_reordering_flag_l1 = 0;
+
+        if (ps_slice_hdr->u1_ref_idx_reordering_flag_l1)
+        {
+
+        }
     }
 
+
+    /* Currently we do not support weighted pred */
+    /* ps_slice_hdr->u1_weighted_bipred_idc = 0; */
+
     if ((ps_pps->i1_weighted_pred_flag &&
                     (ps_proc->i4_slice_type == PSLICE || ps_proc->i4_slice_type == SPSLICE)) ||
-                    (ps_slice_hdr->u1_weighted_bipred_idc == 1 && ps_proc->i4_slice_type == BSLICE))
+                    (ps_proc->i4_slice_type == BSLICE && ps_pps->i1_weighted_bipred_idc == 1))
     {
         /* TODO_LATER: Currently there is no support for weighted prediction.
              This needs to be updated when the support is added */
@@ -1114,6 +1139,8 @@ WORD32 ih264e_populate_slice_header(process_ctxt_t *ps_proc,
          * If this is not the case, we have to add Slice group map type to the bit stream */
     }
 
+    ps_slice_hdr->i1_cabac_init_idc = CABAC_INIT_IDC;
+
     return IH264E_SUCCESS;
 }
 
diff --git a/encoder/ih264e_error.h b/encoder/ih264e_error.h
index 8fe9dac..1eba46c 100644
--- a/encoder/ih264e_error.h
+++ b/encoder/ih264e_error.h
@@ -218,7 +218,10 @@ typedef enum
     IH264E_INVALID_ALT_REF_OPTION                                   = IH264E_CODEC_ERROR_START + 0x2E,
 
     /**No free picture buffer available to store recon pic */
-    IH264E_NO_FREE_RECONBUF                                           = IH264E_CODEC_ERROR_START + 0x2F,
+    IH264E_NO_FREE_RECONBUF                                         = IH264E_CODEC_ERROR_START + 0x2F,
+
+    /**Not enough memory allocated as output buffer */
+    IH264E_INSUFFICIENT_OUTPUT_BUFFER                               = IH264E_CODEC_ERROR_START + 0x30,
 
     /**max failure error code to ensure enum is 32 bits wide */
     IH264E_FAIL                                                     = -1,
diff --git a/encoder/ih264e_fmt_conv.c b/encoder/ih264e_fmt_conv.c
index 393d6ca..e06aea1 100644
--- a/encoder/ih264e_fmt_conv.c
+++ b/encoder/ih264e_fmt_conv.c
@@ -65,8 +65,8 @@
 #include "ih264_defs.h"
 #include "ih264_debug.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -75,9 +75,9 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "ih264_macros.h"
 #include "ih264_platform_macros.h"
-#include "ih264_error.h"
 #include "ih264_buf_mgr.h"
 #include "ih264e_defs.h"
 #include "ih264e_error.h"
@@ -85,6 +85,7 @@
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_fmt_conv.h"
 
diff --git a/encoder/ih264e_function_selector_generic.c b/encoder/ih264e_function_selector_generic.c
index 65f943a..8305fd2 100644
--- a/encoder/ih264e_function_selector_generic.c
+++ b/encoder/ih264e_function_selector_generic.c
@@ -60,8 +60,8 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -70,24 +70,21 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_platform_macros.h"
-#include "ih264_intra_pred_filters.h"
-#include "ih264_trans_quant_itrans_iquant.h"
-#include "ih264e_defs.h"
-#include "ih264e_structs.h"
-#include "ih264_deblk_edge_filters.h"
+#include "ih264e_cabac.h"
 #include "ih264e_core_coding.h"
 #include "ih264_cavlc_tables.h"
 #include "ih264e_cavlc.h"
-#include "ih264_padding.h"
 #include "ih264e_intra_modes_eval.h"
-#include "ih264_mem_fns.h"
 #include "ih264e_fmt_conv.h"
 #include "ih264e_half_pel.h"
+#include "ih264e_me.h"
 
 
 /*****************************************************************************/
@@ -197,8 +194,12 @@ void ih264e_init_function_ptr_generic(codec_t *ps_codec)
     ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4;
 
     /* write mb syntax layer */
-    ps_codec->pf_write_mb_syntax_layer[ISLICE] = ih264e_write_islice_mb;
-    ps_codec->pf_write_mb_syntax_layer[PSLICE] = ih264e_write_pslice_mb;
+    ps_codec->pf_write_mb_syntax_layer[CAVLC][ISLICE] = ih264e_write_islice_mb_cavlc;
+    ps_codec->pf_write_mb_syntax_layer[CAVLC][PSLICE] = ih264e_write_pslice_mb_cavlc;
+    ps_codec->pf_write_mb_syntax_layer[CAVLC][BSLICE] = ih264e_write_bslice_mb_cavlc;
+    ps_codec->pf_write_mb_syntax_layer[CABAC][ISLICE] = ih264e_write_islice_mb_cabac;
+    ps_codec->pf_write_mb_syntax_layer[CABAC][PSLICE] = ih264e_write_pslice_mb_cabac;
+    ps_codec->pf_write_mb_syntax_layer[CABAC][BSLICE] = ih264e_write_bslice_mb_cabac;
 
     /* Padding Functions */
     ps_codec->pf_pad_top = ih264_pad_top;
@@ -255,5 +256,14 @@ void ih264e_init_function_ptr_generic(codec_t *ps_codec)
     ps_codec->pf_ih264e_sixtapfilter_horz = ih264e_sixtapfilter_horz;
     ps_codec->pf_ih264e_sixtap_filter_2dvh_vert = ih264e_sixtap_filter_2dvh_vert;
 
+    /* ME compute */
+    ps_codec->apf_compute_me[PSLICE] = &ih264e_compute_me_single_reflist;
+    ps_codec->apf_compute_me[BSLICE] = &ih264e_compute_me_multi_reflist;
+
+    /* skip decision */
+    ps_codec->apf_find_skip_params_me[PSLICE] = &ih264e_find_pskip_params_me;
+    ps_codec->apf_find_skip_params_me[BSLICE] = &ih264e_find_bskip_params_me;
+
+
     return;
 }
diff --git a/encoder/ih264e_globals.c b/encoder/ih264e_globals.c
index e2b46a4..6719c5f 100644
--- a/encoder/ih264e_globals.c
+++ b/encoder/ih264e_globals.c
@@ -196,66 +196,138 @@ const WORD8 gi1_mv_pred_condition[8] =
      -1,    0,    1,    -1,    2,    -1,    -1,    -1
 };
 
-/**
-******************************************************************************
-* @brief  maps the h264 quantizer to the mpeg2 quantizer scale
-* input  : h264 qp
-* output : equivalent mpeg 2 qp
-* @remarks mpeg2qscale = 2 ^ [((h264qp - 12) / 6) + 1]
-******************************************************************************
-*/
+
+/*******************************************************************************
+ * Translation of MPEG QP to H264 QP
+ ******************************************************************************/
+/*
+ * Note : RC library models QP and bits assuming the QP to be MPEG2.
+ *        Since MPEG qp varies linearly, when the relationship is computed,
+ *        it learns that delta(qp) => delta(bits). Now what we are doing by the
+ *        translation of qp is that
+ *              QPrc = a + b*2^(QPen)
+ *        By not considering the weight matrix in both MPEG and H264 we are in
+ *        effect only changing the relation to
+ *              QPrc = c + d*2^(QPen)
+ *        This will only entail changing the RC model parameters, and this will
+ *        not affect the rc relation at all
+ *
+ *
+ * We have MPEG qp which varies from 0-224. The quantization factor has a linear
+ * relationship with the size of quantized values
+ *
+ * We also have H264 Qp, which varies such that for a change in QP of 6 , we
+ * double the corresponding scaling factor. Hence the scaling is linear in terms
+ * of 2^(QPh/6)
+ *
+ * Now we want to have translation between QPm and QPh. Hence we can write
+ *
+ * QPm = a + b*2^(QPh/6)
+ *
+ * Applying the boundary conditions that
+ *      1) QPm = 0.625 if QPh = 0
+ *      2) QPm =   224 if QPh = 51,
+ *
+ * we will have
+ *  a = 0.0063, b = 0.6187
+ *
+ * Hence the relationship is
+ *  QPm = a + b*2^(QPh/6)
+ *  QPh = 6*log2((QPm - a)/b)
+ *
+ *
+ * Unrounded values for gau1_h264_to_mpeg2_qmap[H264_QP_ELEM] =
+ *
+ *   0.625       0.70077     0.78581     0.88127     0.98843     1.10870
+ *   1.24370     1.39523     1.56533     1.75625     1.97055     2.21110
+ *   2.48110     2.78417     3.12435     3.50620     3.93480     4.41589
+ *   4.95590     5.56204     6.24241     7.00609     7.86330     8.82548
+ *   9.90550     11.11778    12.47851    14.00588    15.72030    17.64467
+ *   19.80470    22.22925    24.95072    28.00547    31.43430    35.28304
+ *   39.60310    44.45221    49.89514    56.00463    62.86230    70.55978
+ *   79.19990    88.89811    99.78398    112.00296   125.71830   141.11325
+ *   158.39350   177.78992   199.56167   223.99963
+ *
+ *
+ *
+ * Unrounded values for gau1_mpeg2_to_h264_qmap[MPEG2_QP_ELEM]
+ *
+ *   0         4.1014    10.1288   13.6477   16.1425   18.0768   19.6568
+ *   20.9925   22.1493   23.1696   24.0822   24.9078   25.6614   26.3546
+ *   26.9964   27.5938   28.1527   28.6777   29.1726   29.6408   30.0850
+ *   30.5074   30.9102   31.2951   31.6636   32.0171   32.3567   32.6834
+ *   32.9983   33.3021   33.5957   33.8795   34.1544   34.4208   34.6793
+ *   34.9303   35.1742   35.4114   35.6423   35.8671   36.0863   36.3001
+ *   36.5087   36.7124   36.9115   37.1060   37.2963   37.4825   37.6648
+ *   37.8433   38.0182   38.1896   38.3577   38.5226   38.6844   38.8433
+ *   38.9993   39.1525   39.3031   39.4511   39.5966   39.7397   39.8804
+ *   40.0189   40.1553   40.2895   40.4217   40.5518   40.6801   40.8065
+ *   40.9310   41.0538   41.1749   41.2943   41.4121   41.5283   41.6430
+ *   41.7561   41.8678   41.9781   42.0870   42.1946   42.3008   42.4057
+ *   42.5094   42.6118   42.7131   42.8132   42.9121   43.0099   43.1066
+ *   43.2023   43.2969   43.3905   43.4831   43.5747   43.6653   43.7550
+ *   43.8438   43.9317   44.0187   44.1049   44.1901   44.2746   44.3582
+ *   44.4411   44.5231   44.6044   44.6849   44.7647   44.8438   44.9221
+ *   44.9998   45.0767   45.1530   45.2286   45.3035   45.3779   45.4515
+ *   45.5246   45.5970   45.6689   45.7401   45.8108   45.8809   45.9504
+ *   46.0194   46.0878   46.1557   46.2231   46.2899   46.3563   46.4221
+ *   46.4874   46.5523   46.6166   46.6805   46.7439   46.8069   46.8694
+ *   46.9314   46.9930   47.0542   47.1150   47.1753   47.2352   47.2947
+ *   47.3538   47.4125   47.4708   47.5287   47.5862   47.6433   47.7001
+ *   47.7565   47.8125   47.8682   47.9235   47.9785   48.0331   48.0874
+ *   48.1413   48.1949   48.2482   48.3011   48.3537   48.4060   48.4580
+ *   48.5097   48.5611   48.6122   48.6629   48.7134   48.7636   48.8135
+ *   48.8631   48.9124   48.9615   49.0102   49.0587   49.1069   49.1549
+ *   49.2026   49.2500   49.2972   49.3441   49.3908   49.4372   49.4834
+ *   49.5293   49.5750   49.6204   49.6656   49.7106   49.7553   49.7998
+ *   49.8441   49.8882   49.9320   49.9756   50.0190   50.0622   50.1051
+ *   50.1479   50.1904   50.2327   50.2749   50.3168   50.3585   50.4000
+ *   50.4413   50.4825   50.5234   50.5641   50.6047   50.6450   50.6852
+ *   50.7252   50.7650   50.8046   50.8440   50.8833   50.9224   50.9613
+ *   51.0000
+ */
+
 const UWORD8 gau1_h264_to_mpeg2_qmap[H264_QP_ELEM] =
 {
-       1,      1,      1,      1,      1,      1,      1,      1,
-       2,      2,      2,      2,      3,      3,      3,      4,
-       4,      4,      5,      6,      6,      7,      8,      9,
-      10,     11,     13,     14,     16,     18,     20,     23,
-      25,     29,     32,     36,     40,     45,     51,     57,
-      64,     72,     81,     91,    102,    114,    128,    144,
-     161,    181,    203,    228,
+     1,    1,    1,    1,   1,    1,    1,   1,
+     2,    2,    2,    2,   2,    3,    3,   4,
+     4,    4,    5,    6,   6,    7,    8,   9,
+     10,   11,   12,   14,  16,   18,   20,  22,
+     25,   28,   31,   35,  40,   44,   50,  56,
+     63,   71,   79,   89,  100,  112,  126, 141,
+     158,  178,  200,  224
 };
 
-/**
-******************************************************************************
-* @brief  maps the mpeg2 quantizer to the h264 quantizer scale
-* input  : mpeg2 qp
-* output : equivalent h264qp
-* @remarks  MPEG-2 dequantization: (2*QFij + k)*Wij*qscale/32
-*      k = 0 (for intra)  k = sign(QFij)
-*   H.264 dequantization: (QFij*R(QP%6,i,j))>>(6 - QP/6)
-*
-*   Excluding the portion of R(QP%6,i,j) that is due to
-*   the DCT scale factors, the 6 entries after dividing by 64 (2^6)
-*   correspond to dequant values of
-*   2.5, 2.8125, 3.125, 3.5625, 3.9375, 4.4375.
-*   (a=0.5 b=sqrt(2/5) - refer to JVT-B038.doc)
-*
-*   Assuming that h264Qp=12 corresponds to MPEG2 qscale of 2
-*   (the actual mapping seems to be to MPEG2 qscale of 2.5),
-*   and the fact that the effective h264 quantizer changes by
-*   a factor of 2 for every 6 steps, the following mapping is
-*   obtained:
-*    h264qp = 6*(log2(mpeg2qscale/2)) + 12.
-*
-*   Note that the quant matrix entry assumed for the above
-*   equality is 16. Hence when the mpeg2 quant matrix entries
-*   are all 16, this lookup can be used as is (which is the
-*   default inter quant matrix in mpeg-2).
-******************************************************************************
-*/
 const UWORD8 gau1_mpeg2_to_h264_qmap[MPEG2_QP_ELEM] =
 {
-       0,      4,     10,     14,     16,     18,     20,     21,     22,     23,     24,     25,     26,     26,     27,     27,
-      28,     29,     29,     29,     30,     30,     31,     31,     32,     32,     32,     33,     33,     33,     33,     34,
-      34,     34,     35,     35,     35,     35,     35,     36,     36,     36,     36,     37,     37,     37,     37,     37,
-      38,     38,     38,     38,     38,     38,     39,     39,     39,     39,     39,     39,     39,     40,     40,     40,
-      40,     40,     40,     40,     41,     41,     41,     41,     41,     41,     41,     41,     41,     42,     42,     42,
-      42,     42,     42,     42,     42,     42,     43,     43,     43,     43,     43,     43,     43,     43,     43,     43,
-      44,     44,     44,     44,     44,     44,     44,     44,     44,     44,     44,     44,     45,     45,     45,     45,
-      45,     45,     45,     45,     45,     45,     45,     45,     45,     46,     46,     46,     46,     46,     46,     46,
-      46,     46,     46,     46,     46,     46,     46,     46,     47,     47,     47,     47,     47,     47,     47,     47,
-      47,     47,     47,     47,     47,     47,     47,     47,     47,     48,     48,     48,     48,     48,     48,     48,
-      48,     48,     48,     48,     48,     48,     48,     48,     48,     48,     48,     49,     49,     49,     49,     49,
-      49,     49,     49,     49,     49,     49,     49,     49,     49,     49,     49,     49,     49,     49,     49,     49,
+     0,    4,    10,  14,   16,   18,  20,  21,
+     22,   23,   24,  25,   26,   26,  27,  28,
+     28,   29,   29,  30,   30,   31,  31,  31,
+     32,   32,   32,  33,   33,   33,  34,  34,
+     34,   34,   35,  35,   35,   35,  36,  36,
+     36,   36,   37,  37,   37,   37,  37,  37,
+     38,   38,   38,  38,   38,   39,  39,  39,
+     39,   39,   39,  39,   40,   40,  40,  40,
+     40,   40,   40,  41,   41,   41,  41,  41,
+     41,   41,   41,  42,   42,   42,  42,  42,
+     42,   42,   42,  42,   43,   43,  43,  43,
+     43,   43,   43,  43,   43,   43,  43,  44,
+     44,   44,   44,  44,   44,   44,  44,  44,
+     44,   44,   45,  45,   45,   45,  45,  45,
+     45,   45,   45,  45,   45,   45,  45,  46,
+     46,   46,   46,  46,   46,   46,  46,  46,
+     46,   46,   46,  46,   46,   46,  47,  47,
+     47,   47,   47,  47,   47,   47,  47,  47,
+     47,   47,   47,  47,   47,   47,  48,  48,
+     48,   48,   48,  48,   48,   48,  48,  48,
+     48,   48,   48,  48,   48,   48,  48,  48,
+     49,   49,   49,  49,   49,   49,  49,  49,
+     49,   49,   49,  49,   49,   49,  49,  49,
+     49,   49,   49,  49,   49,   50,  50,  50,
+     50,   50,   50,  50,   50,   50,  50,  50,
+     50,   50,   50,  50,   50,   50,  50,  50,
+     50,   50,   50,  50,   51,   51,  51,  51,
+     51,   51,   51,  51,   51,   51,  51,  51,
+     51
 };
 
diff --git a/encoder/ih264e_half_pel.c b/encoder/ih264e_half_pel.c
index cb475a1..4871f40 100644
--- a/encoder/ih264e_half_pel.c
+++ b/encoder/ih264e_half_pel.c
@@ -55,7 +55,6 @@
 #include "ih264_defs.h"
 #include "ih264e_half_pel.h"
 #include "ih264_macros.h"
-#include "ih264e_half_pel.h"
 #include "ih264e_debug.h"
 #include "ih264_inter_pred_filters.h"
 #include "ih264_mem_fns.h"
diff --git a/encoder/ih264e_intra_modes_eval.c b/encoder/ih264e_intra_modes_eval.c
index b41d717..52b3034 100644
--- a/encoder/ih264e_intra_modes_eval.c
+++ b/encoder/ih264e_intra_modes_eval.c
@@ -74,15 +74,17 @@
 #include "ih264_inter_pred_filters.h"
 #include "ih264_mem_fns.h"
 #include "ih264_padding.h"
-#include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "ime_distortion_metrics.h"
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_intra_modes_eval.h"
 #include "ih264e_globals.h"
@@ -372,9 +374,10 @@ void ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t *ps
     UWORD32 i, u4_enable_fast_sad = 0, offset = 0;
 
     /* init temp var */
-    if (ps_proc->i4_slice_type == PSLICE)
+    if (ps_proc->i4_slice_type != ISLICE)
     {
-        offset = 5;
+        /* Offset for MBtype */
+        offset = (ps_proc->i4_slice_type == PSLICE) ? 5 : 23;
         u4_enable_fast_sad = ps_proc->s_me_ctxt.u4_enable_fast_sad;
     }
 
@@ -1315,7 +1318,7 @@ void ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_
     UWORD8 *pu1_pred_mb_plane = ps_proc->pu1_pred_mb_intra_chroma_plane;
 
     /* strides */
-    WORD32 i4_src_strd_c = ps_proc->i4_src_strd;
+    WORD32 i4_src_strd_c = ps_proc->i4_src_chroma_strd;
     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
     WORD32 i4_rec_strd_c = ps_proc->i4_rec_strd;
 
diff --git a/encoder/ih264e_mc.c b/encoder/ih264e_mc.c
index 2dd0974..2b19dd1 100644
--- a/encoder/ih264e_mc.c
+++ b/encoder/ih264e_mc.c
@@ -19,25 +19,25 @@
 */
 
 /**
-*******************************************************************************
-* @file
-*  ih264e_mc.c
-*
-* @brief
-*  Contains definition of functions for motion compensation
-*
-* @author
-*  ittiam
-*
-* @par List of Functions:
-*  - ih264e_motion_comp_luma()
-*  - ih264e_motion_comp_chroma()
-*
-* @remarks
-*  None
-*
-*******************************************************************************
-*/
+ *******************************************************************************
+ * @file
+ *  ih264e_mc.c
+ *
+ * @brief
+ *  Contains definition of functions for motion compensation
+ *
+ * @author
+ *  ittiam
+ *
+ * @par List of Functions:
+ *  - ih264e_motion_comp_luma()
+ *  - ih264e_motion_comp_chroma()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
 
 /*****************************************************************************/
 /* File Includes                                                             */
@@ -52,6 +52,7 @@
 #include "iv2.h"
 #include "ive2.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
 #include "ih264_structs.h"
 #include "ih264_inter_pred_filters.h"
@@ -60,57 +61,52 @@
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
 #include "ih264_trans_quant_itrans_iquant.h"
-#include "ih264_inter_pred_filters.h"
-#include "ih264_mem_fns.h"
-#include "ih264_padding.h"
-#include "ih264_intra_pred_filters.h"
-#include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "ih264e_defs.h"
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_mc.h"
 #include "ih264e_half_pel.h"
 
-
 /*****************************************************************************/
 /* Function Definitions                                                      */
 /*****************************************************************************/
 
 /**
-******************************************************************************
-*
-* @brief
-*  performs motion compensation for a luma mb for the given mv.
-*
-* @par Description
-*  This routine performs motion compensation of an inter mb. When the inter
-*  mb mode is P16x16, there is no need to copy 16x16 unit from reference buffer
-*  to pred buffer. In this case the function returns pointer and stride of the
-*  ref. buffer and this info is used in place of pred buffer else where.
-*  In other cases, the pred buffer is populated via copy / filtering + copy
-*  (q pel cases) and returned.
-*
-* @param[in] ps_proc
-*  pointer to current proc ctxt
-*
-* @param[out] pu1_pseudo_pred
-*  pseudo prediction buffer
-*
-* @param[out] u4_pseudo_pred_strd
-*  pseudo pred buffer stride
-*
-* @return  none
-*
-* @remarks Assumes half pel buffers for the entire frame are populated.
-*
-******************************************************************************
-*/
-void ih264e_motion_comp_luma(process_ctxt_t *ps_proc,
-                             UWORD8 **pu1_pseudo_pred,
+ ******************************************************************************
+ *
+ * @brief
+ *  performs motion compensation for a luma mb for the given mv.
+ *
+ * @par Description
+ *  This routine performs motion compensation of an inter mb. When the inter
+ *  mb mode is P16x16, there is no need to copy 16x16 unit from reference buffer
+ *  to pred buffer. In this case the function returns pointer and stride of the
+ *  ref. buffer and this info is used in place of pred buffer else where.
+ *  In other cases, the pred buffer is populated via copy / filtering + copy
+ *  (q pel cases) and returned.
+ *
+ * @param[in] ps_proc
+ *  pointer to current proc ctxt
+ *
+ * @param[out] pu1_pseudo_pred
+ *  pseudo prediction buffer
+ *
+ * @param[out] u4_pseudo_pred_strd
+ *  pseudo pred buffer stride
+ *
+ * @return  none
+ *
+ * @remarks Assumes half pel buffers for the entire frame are populated.
+ *
+ ******************************************************************************
+ */
+void ih264e_motion_comp_luma(process_ctxt_t *ps_proc, UWORD8 **pu1_pseudo_pred,
                              WORD32 *pi4_pseudo_pred_strd)
 {
     /* codec context */
@@ -152,51 +148,96 @@ void ih264e_motion_comp_luma(process_ctxt_t *ps_proc,
     /* half / qpel coefficient */
     UWORD32 u4_subpel_factor;
 
+    /* BIPRED Flag */
+    WORD32 i4_bipred_flag;
+
     /* temp var */
     UWORD32 u4_lkup_idx1;
 
     /* Init */
     i4_ref_strd[0] = ps_proc->i4_rec_strd;
 
-    i4_ref_strd[1] = i4_ref_strd[2] = i4_ref_strd[3] = ps_me_ctxt->u4_hp_buf_strd;
+    i4_ref_strd[1] = i4_ref_strd[2] = i4_ref_strd[3] =
+                    ps_me_ctxt->u4_subpel_buf_strd;
 
-    for (u4_num_prtn = 0; u4_num_prtn < ps_proc->u4_num_sub_partitions; u4_num_prtn++)
+    for (u4_num_prtn = 0; u4_num_prtn < ps_proc->u4_num_sub_partitions;
+                    u4_num_prtn++)
     {
+        mv_t *ps_curr_mv;
+
         /* update ptr to curr partition */
         ps_curr_pu = ps_proc->ps_pu + u4_num_prtn;
 
+        /* Default to no bipred */
+        i4_bipred_flag = 0;
+
+        switch (ps_curr_pu->b2_pred_mode)
+        {
+            case PRED_L0:
+                ps_curr_mv = &ps_curr_pu->s_me_info[0].s_mv;
+                pu1_ref[0] = ps_proc->apu1_ref_buf_luma[0];
+                break;
+
+            case PRED_L1:
+                ps_curr_mv = &ps_curr_pu->s_me_info[1].s_mv;
+                pu1_ref[0] = ps_proc->apu1_ref_buf_luma[1];
+                break;
+
+            case PRED_BI:
+                /*
+                 * In case of PRED_BI, we only need to ensure that
+                 * the reference buffer that gets selected is
+                 * ps_proc->pu1_best_subpel_buf
+                 */
+
+                /* Dummy */
+                ps_curr_mv = &ps_curr_pu->s_me_info[0].s_mv;
+                pu1_ref[0] = ps_proc->apu1_ref_buf_luma[0];
+
+                i4_bipred_flag = 1;
+                break;
+
+            default:
+                ps_curr_mv = &ps_curr_pu->s_me_info[0].s_mv;
+                pu1_ref[0] = ps_proc->apu1_ref_buf_luma[0];
+                break;
+
+        }
 
         /* get full pel mv's (full pel units) */
-        u4_mv_x_full = ps_curr_pu->s_l0_mv.i2_mvx >> 2;
-        u4_mv_y_full = ps_curr_pu->s_l0_mv.i2_mvy >> 2;
+        u4_mv_x_full = ps_curr_mv->i2_mvx >> 2;
+        u4_mv_y_full = ps_curr_mv->i2_mvy >> 2;
 
         /* get half pel mv's */
-        u4_mv_x_hpel = (ps_curr_pu->s_l0_mv.i2_mvx & 0x2) >> 1;
-        u4_mv_y_hpel = (ps_curr_pu->s_l0_mv.i2_mvy & 0x2) >> 1;
+        u4_mv_x_hpel = (ps_curr_mv->i2_mvx & 0x2) >> 1;
+        u4_mv_y_hpel = (ps_curr_mv->i2_mvy & 0x2) >> 1;
 
         /* get quarter pel mv's */
-        u4_mv_x_qpel = (ps_curr_pu->s_l0_mv.i2_mvx & 0x1);
-        u4_mv_y_qpel = (ps_curr_pu->s_l0_mv.i2_mvy & 0x1);
+        u4_mv_x_qpel = (ps_curr_mv->i2_mvx & 0x1);
+        u4_mv_y_qpel = (ps_curr_mv->i2_mvy & 0x1);
 
         /* width and height of partition */
         wd = (ps_curr_pu->b4_wd + 1) << 2;
         ht = (ps_curr_pu->b4_ht + 1) << 2;
 
         /* decision ? qpel/hpel, fpel */
-        u4_subpel_factor = (u4_mv_y_hpel << 3) + (u4_mv_x_hpel << 2) + (u4_mv_y_qpel << 1) + (u4_mv_x_qpel);
+        u4_subpel_factor = (u4_mv_y_hpel << 3) + (u4_mv_x_hpel << 2)
+                        + (u4_mv_y_qpel << 1) + (u4_mv_x_qpel);
 
-        /* update ref buffer ptrs */
-        pu1_ref[0] = ps_proc->pu1_ref_buf_luma + (u4_mv_y_full * i4_ref_strd[0]) + u4_mv_x_full;
+        /* Move ref to position given by MV */
+        pu1_ref[0] += ((u4_mv_y_full * i4_ref_strd[0]) + u4_mv_x_full);
 
-        pu1_ref[1] =  ps_proc->pu1_best_subpel_buf;
+        /* Sub pel ptrs / Bipred pointers init */
+        pu1_ref[1] = ps_proc->pu1_best_subpel_buf;
         i4_ref_strd[1] = ps_proc->u4_bst_spel_buf_strd;
 
-
         /* update pred buff ptr */
-        pu1_pred = ps_proc->pu1_pred_mb + 4 * ps_curr_pu->b4_pos_y * i4_pred_strd + 4 * ps_curr_pu->b4_pos_x;
+        pu1_pred = ps_proc->pu1_pred_mb
+                        + 4 * ps_curr_pu->b4_pos_y * i4_pred_strd
+                        + 4 * ps_curr_pu->b4_pos_x;
 
-        /*u4_lkup_idx1 will be non zero for half pel*/
-        u4_lkup_idx1 = (u4_subpel_factor >> 2 ) != 0 ;
+        /* u4_lkup_idx1 will be non zero for half pel and bipred */
+        u4_lkup_idx1 = ((u4_subpel_factor >> 2) != 0) || i4_bipred_flag;
 
         {
             /********************************************************************/
@@ -218,7 +259,11 @@ void ih264e_motion_comp_luma(process_ctxt_t *ps_proc,
              */
             else
             {
-                ps_codec->pf_inter_pred_luma_copy(pu1_ref[u4_lkup_idx1], pu1_pred, i4_ref_strd[u4_lkup_idx1], i4_pred_strd, ht, wd, NULL, 0);
+                ps_codec->pf_inter_pred_luma_copy(pu1_ref[u4_lkup_idx1],
+                                                  pu1_pred,
+                                                  i4_ref_strd[u4_lkup_idx1],
+                                                  i4_pred_strd, ht, wd, NULL,
+                                                  0);
             }
 
         }
@@ -226,24 +271,24 @@ void ih264e_motion_comp_luma(process_ctxt_t *ps_proc,
 }
 
 /**
-******************************************************************************
-*
-* @brief
-*  performs motion compensation for chroma mb
-*
-* @par   Description
-*  Copies a MB of data from the reference buffer (Full pel, half pel or q pel)
-*  according to the motion vectors given
-*
-* @param[in] ps_proc
-*  pointer to current proc ctxt
-*
-* @return  none
-*
-* @remarks Assumes half pel and quarter pel buffers for the entire frame are
-*  populated.
-******************************************************************************
-*/
+ ******************************************************************************
+ *
+ * @brief
+ *  performs motion compensation for chroma mb
+ *
+ * @par   Description
+ *  Copies a MB of data from the reference buffer (Full pel, half pel or q pel)
+ *  according to the motion vectors given
+ *
+ * @param[in] ps_proc
+ *  pointer to current proc ctxt
+ *
+ * @return  none
+ *
+ * @remarks Assumes half pel and quarter pel buffers for the entire frame are
+ *  populated.
+ ******************************************************************************
+ */
 void ih264e_motion_comp_chroma(process_ctxt_t *ps_proc)
 {
     /* codec context */
@@ -283,38 +328,122 @@ void ih264e_motion_comp_chroma(process_ctxt_t *ps_proc)
     WORD32 u4_mv_y;
     UWORD8 u1_dx, u1_dy;
 
-    for (u4_num_prtn = 0; u4_num_prtn < ps_proc->u4_num_sub_partitions; u4_num_prtn++)
+    for (u4_num_prtn = 0; u4_num_prtn < ps_proc->u4_num_sub_partitions;
+                    u4_num_prtn++)
     {
-        ps_curr_pu =ps_proc->ps_pu + u4_num_prtn;
+        mv_t *ps_curr_mv;
 
-        u4_mv_x = ps_curr_pu->s_l0_mv.i2_mvx >> 3;
-        u4_mv_y = ps_curr_pu->s_l0_mv.i2_mvy >> 3;
+        ps_curr_pu = ps_proc->ps_pu + u4_num_prtn;
 
-        /*  corresponds to full pel motion vector in luma, but in chroma corresponds to pel formed with dx, dy =4*/
-        u4_mv_x_full = (ps_curr_pu->s_l0_mv.i2_mvx & 0x4) >> 2;
-        u4_mv_y_full = (ps_curr_pu->s_l0_mv.i2_mvy & 0x4) >> 2;
+        if (ps_curr_pu->b2_pred_mode != PRED_BI)
+        {
+            ps_curr_mv = &ps_curr_pu->s_me_info[ps_curr_pu->b2_pred_mode].s_mv;
+            pu1_ref = ps_proc->apu1_ref_buf_chroma[ps_curr_pu->b2_pred_mode];
 
-        /* get half pel mv's */
-        u4_mv_x_hpel = (ps_curr_pu->s_l0_mv.i2_mvx & 0x2) >> 1;
-        u4_mv_y_hpel = (ps_curr_pu->s_l0_mv.i2_mvy & 0x2) >> 1;
+            u4_mv_x = ps_curr_mv->i2_mvx >> 3;
+            u4_mv_y = ps_curr_mv->i2_mvy >> 3;
 
-        /* get quarter pel mv's */
-        u4_mv_x_qpel = (ps_curr_pu->s_l0_mv.i2_mvx & 0x1);
-        u4_mv_y_qpel = (ps_curr_pu->s_l0_mv.i2_mvy & 0x1);
+            /* corresponds to full pel motion vector in luma, but in chroma corresponds to pel formed with dx, dy = 4 */
+            u4_mv_x_full = (ps_curr_mv->i2_mvx & 0x4) >> 2;
+            u4_mv_y_full = (ps_curr_mv->i2_mvy & 0x4) >> 2;
+
+            /* get half pel mv's */
+            u4_mv_x_hpel = (ps_curr_mv->i2_mvx & 0x2) >> 1;
+            u4_mv_y_hpel = (ps_curr_mv->i2_mvy & 0x2) >> 1;
+
+            /* get quarter pel mv's */
+            u4_mv_x_qpel = (ps_curr_mv->i2_mvx & 0x1);
+            u4_mv_y_qpel = (ps_curr_mv->i2_mvy & 0x1);
 
-        /* width and height of sub macro block */
-        wd = (ps_curr_pu->b4_wd + 1) << 1;
-        ht = (ps_curr_pu->b4_ht + 1) << 1;
+            /* width and height of sub macro block */
+            wd = (ps_curr_pu->b4_wd + 1) << 1;
+            ht = (ps_curr_pu->b4_ht + 1) << 1;
 
-        /* move the pointers so that they point to the motion compensated locations */
-        pu1_ref = ps_proc->pu1_ref_buf_chroma + (u4_mv_y * i4_ref_strd) + (u4_mv_x << 1);
+            /* move the pointers so that they point to the motion compensated locations */
+            pu1_ref += ((u4_mv_y * i4_ref_strd) + (u4_mv_x << 1));
 
-        pu1_pred = ps_proc->pu1_pred_mb + 4 * ps_curr_pu->b4_pos_y * i4_pred_strd + 2 * ps_curr_pu->b4_pos_x;
+            pu1_pred = ps_proc->pu1_pred_mb
+                            + 4 * ps_curr_pu->b4_pos_y * i4_pred_strd
+                            + 2 * ps_curr_pu->b4_pos_x;
 
-        u1_dx = (u4_mv_x_full << 2) + (u4_mv_x_hpel << 1) + (u4_mv_x_qpel);
-        u1_dy = (u4_mv_y_full << 2) + (u4_mv_y_hpel << 1) + (u4_mv_y_qpel);
+            u1_dx = (u4_mv_x_full << 2) + (u4_mv_x_hpel << 1) + (u4_mv_x_qpel);
+            u1_dy = (u4_mv_y_full << 2) + (u4_mv_y_hpel << 1) + (u4_mv_y_qpel);
 
-        ps_codec->pf_inter_pred_chroma(pu1_ref, pu1_pred, i4_ref_strd, i4_pred_strd,
-                                   u1_dx, u1_dy, ht, wd);
+            /* cases where u1_dx = 0 or u1_dy = 0 are dealt separately in neon with
+             * separate functions for better performance
+             *
+             * ih264_inter_pred_chroma_dx_zero_a9q
+             * and
+             * ih264_inter_pred_chroma_dy_zero_a9q
+             */
+
+            ps_codec->pf_inter_pred_chroma(pu1_ref, pu1_pred, i4_ref_strd,
+                                           i4_pred_strd, u1_dx, u1_dy, ht, wd);
+        }
+        else /* If the pred mode is PRED_BI */
+        {
+            /*
+             * We need to interpolate the L0 and L1 ref pics with the chroma MV,
+             * then average them for bilinear inter prediction
+             */
+            WORD32 i4_predmode;
+            UWORD8 *pu1_ref_buf[2];
+
+            /* Temporary buffers to store the interpolated value from L0 and L1 */
+            pu1_ref_buf[PRED_L0] = ps_proc->apu1_subpel_buffs[0];
+            pu1_ref_buf[PRED_L1] = ps_proc->apu1_subpel_buffs[1];
+
+
+            for (i4_predmode = 0; i4_predmode < PRED_BI; i4_predmode++)
+            {
+                ps_curr_mv = &ps_curr_pu->s_me_info[i4_predmode].s_mv;
+                pu1_ref = ps_proc->apu1_ref_buf_chroma[i4_predmode];
+
+                u4_mv_x = ps_curr_mv->i2_mvx >> 3;
+                u4_mv_y = ps_curr_mv->i2_mvy >> 3;
+
+                /*
+                 * corresponds to full pel motion vector in luma, but in chroma
+                 * corresponds to pel formed with dx, dy = 4
+                 */
+                u4_mv_x_full = (ps_curr_mv->i2_mvx & 0x4) >> 2;
+                u4_mv_y_full = (ps_curr_mv->i2_mvy & 0x4) >> 2;
+
+                /* get half pel mv's */
+                u4_mv_x_hpel = (ps_curr_mv->i2_mvx & 0x2) >> 1;
+                u4_mv_y_hpel = (ps_curr_mv->i2_mvy & 0x2) >> 1;
+
+                /* get quarter pel mv's */
+                u4_mv_x_qpel = (ps_curr_mv->i2_mvx & 0x1);
+                u4_mv_y_qpel = (ps_curr_mv->i2_mvy & 0x1);
+
+                /* width and height of sub macro block */
+                wd = (ps_curr_pu->b4_wd + 1) << 1;
+                ht = (ps_curr_pu->b4_ht + 1) << 1;
+
+                /* move the pointers so that they point to the motion compensated locations */
+                pu1_ref += ((u4_mv_y * i4_ref_strd) + (u4_mv_x << 1));
+
+                pu1_pred = ps_proc->pu1_pred_mb
+                                + 4 * ps_curr_pu->b4_pos_y * i4_pred_strd
+                                + 2 * ps_curr_pu->b4_pos_x;
+
+                u1_dx = (u4_mv_x_full << 2) + (u4_mv_x_hpel << 1)
+                                + (u4_mv_x_qpel);
+                u1_dy = (u4_mv_y_full << 2) + (u4_mv_y_hpel << 1)
+                                + (u4_mv_y_qpel);
+
+                ps_codec->pf_inter_pred_chroma(pu1_ref,
+                                               pu1_ref_buf[i4_predmode],
+                                               i4_ref_strd, MB_SIZE, u1_dx,
+                                               u1_dy, ht, wd);
+            }
+
+            ps_codec->pf_inter_pred_luma_bilinear(pu1_ref_buf[PRED_L0],
+                                                  pu1_ref_buf[PRED_L1], pu1_pred,
+                                                  MB_SIZE, MB_SIZE,
+                                                  i4_pred_strd, MB_SIZE >> 1,
+                                                  MB_SIZE);
+        }
     }
 }
diff --git a/encoder/ih264e_me.c b/encoder/ih264e_me.c
index 9e8d7a3..68bdea6 100644
--- a/encoder/ih264e_me.c
+++ b/encoder/ih264e_me.c
@@ -75,20 +75,20 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "ih264e_defs.h"
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_globals.h"
 #include "ih264_macros.h"
 #include "ih264e_me.h"
 #include "ime.h"
-#include "ime_distortion_metrics.h"
 #include "ih264_debug.h"
-#include "ithread.h"
 #include "ih264e_intra_modes_eval.h"
 #include "ih264e_core_coding.h"
 #include "ih264e_mc.h"
@@ -164,6 +164,8 @@ void ih264e_init_mv_bits(me_ctxt_t *ps_me_ctxt)
     }
 }
 
+
+
 /**
 *******************************************************************************
 *
@@ -204,37 +206,25 @@ void ih264e_init_mv_bits(me_ctxt_t *ps_me_ctxt)
 * number of such MVs
 *
 * @remarks
-*   Assumptions : 1. Assumes Single reference frame
-*                 2. Assumes Only partition of size 16x16
+*   Assumptions : 1. Assumes Only partition of size 16x16
 *
 *******************************************************************************
 */
 static void ih264e_get_search_candidates(process_ctxt_t *ps_proc,
-                                         me_ctxt_t *ps_me_ctxt)
+                                         me_ctxt_t *ps_me_ctxt,
+                                         WORD32 i4_reflist)
 {
     /* curr mb indices */
     WORD32 i4_mb_x = ps_proc->i4_mb_x;
 
-    /* left mb motion vector */
-    mv_t *ps_left_mv;
-
-    /* top left mb motion vector */
-    mv_t *ps_top_mv;
-
-    /* top left mb motion vector */
-    mv_t *ps_top_left_mv;
+    /* Motion vector */
+    mv_t *ps_left_mv, *ps_top_mv, *ps_top_left_mv, *ps_top_right_mv;
 
-    /* top left mb motion vector */
-    mv_t *ps_top_right_mv;
-
-    /* skip mv */
-    mv_t *ps_skip_mv = ps_proc->ps_skip_mv;
+    /* Pred modes */
+    WORD32 i4_left_mode, i4_top_mode, i4_top_left_mode, i4_top_right_mode;
 
     /* mb part info */
-    mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part;
-
-    /* num of candidate search candidates */
-    UWORD32 u4_num_candidates = 0;
+    mb_part_ctxt *ps_mb_part = &ps_me_ctxt->as_mb_part[i4_reflist];
 
     /* mvs */
     WORD32 mvx, mvy;
@@ -242,29 +232,36 @@ static void ih264e_get_search_candidates(process_ctxt_t *ps_proc,
     /* ngbr availability */
     block_neighbors_t *ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
 
+    /* Complementary pred mode (the list that is NOT being searched) */
+    WORD32 i4_cmpl_predmode = (i4_reflist == 0) ? PRED_L1 : PRED_L0;
+
     /* srch range*/
     WORD32 i4_srch_range_n = ps_me_ctxt->i4_srch_range_n;
     WORD32 i4_srch_range_s = ps_me_ctxt->i4_srch_range_s;
     WORD32 i4_srch_range_e = ps_me_ctxt->i4_srch_range_e;
     WORD32 i4_srch_range_w = ps_me_ctxt->i4_srch_range_w;
 
-    ps_left_mv = &ps_proc->s_left_mb_pu_ME.s_l0_mv;
-    ps_top_mv = &(ps_proc->ps_top_row_pu_ME + i4_mb_x)->s_l0_mv;
-    ps_top_left_mv = &ps_proc->s_top_left_mb_pu_ME.s_l0_mv;
-    ps_top_right_mv = &(ps_proc->ps_top_row_pu_ME + i4_mb_x + 1)->s_l0_mv;
+    ps_left_mv = &ps_proc->s_left_mb_pu_ME.s_me_info[i4_reflist].s_mv;
+    ps_top_mv = &(ps_proc->ps_top_row_pu_ME + i4_mb_x)->s_me_info[i4_reflist].s_mv;
+    ps_top_left_mv = &ps_proc->s_top_left_mb_pu_ME.s_me_info[i4_reflist].s_mv;
+    ps_top_right_mv = &(ps_proc->ps_top_row_pu_ME + i4_mb_x + 1)->s_me_info[i4_reflist].s_mv;
+
+    i4_left_mode = ps_proc->s_left_mb_pu_ME.b2_pred_mode != i4_cmpl_predmode;
+    i4_top_mode = (ps_proc->ps_top_row_pu_ME + i4_mb_x)->b2_pred_mode != i4_cmpl_predmode;
+    i4_top_left_mode = ps_proc->s_top_left_mb_pu_ME.b2_pred_mode != i4_cmpl_predmode;
+    i4_top_right_mode = (ps_proc->ps_top_row_pu_ME + i4_mb_x + 1)->b2_pred_mode != i4_cmpl_predmode;
+
+    /* number of search candidates */
+    UWORD32 u4_num_candidates =0 ;
 
-    /************************************************************/
     /* Taking the Zero motion vector as one of the candidates   */
-    /************************************************************/
-    ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = 0;
-    ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = 0;
+    ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvx = 0;
+    ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvy = 0;
 
     u4_num_candidates++;
 
-    /************************************************************/
     /* Taking the Left MV Predictor as one of the candidates    */
-    /************************************************************/
-    if (ps_ngbr_avbl->u1_mb_a)
+    if (ps_ngbr_avbl->u1_mb_a && i4_left_mode)
     {
         mvx      = (ps_left_mv->i2_mvx + 2) >> 2;
         mvy      = (ps_left_mv->i2_mvy + 2) >> 2;
@@ -272,21 +269,14 @@ static void ih264e_get_search_candidates(process_ctxt_t *ps_proc,
         mvx = CLIP3(i4_srch_range_w, i4_srch_range_e, mvx);
         mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy);
 
-        ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = mvx;
-        ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = mvy;
+        ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvx = mvx;
+        ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvy = mvy;
 
         u4_num_candidates ++;
     }
-    /*else
-    {
-        ps_me_ctxt->as_mv_init_search[LEFT_CAND].i2_mvx = 0;
-        ps_me_ctxt->as_mv_init_search[LEFT_CAND].i2_mvy = 0;
-    }*/
 
-    /************************************************************/
     /* Taking the Top MV Predictor as one of the candidates     */
-    /************************************************************/
-    if (ps_ngbr_avbl->u1_mb_b)
+    if (ps_ngbr_avbl->u1_mb_b && i4_top_mode)
     {
         mvx      = (ps_top_mv->i2_mvx + 2) >> 2;
         mvy      = (ps_top_mv->i2_mvy + 2) >> 2;
@@ -294,15 +284,13 @@ static void ih264e_get_search_candidates(process_ctxt_t *ps_proc,
         mvx = CLIP3(i4_srch_range_w, i4_srch_range_e, mvx);
         mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy);
 
-        ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = mvx;
-        ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = mvy;
+        ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvx = mvx;
+        ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvy = mvy;
 
         u4_num_candidates ++;
 
-        /************************************************************/
         /* Taking the TopRt MV Predictor as one of the candidates   */
-        /************************************************************/
-        if (ps_ngbr_avbl->u1_mb_c)
+        if (ps_ngbr_avbl->u1_mb_c && i4_top_right_mode)
         {
             mvx      = (ps_top_right_mv->i2_mvx + 2) >> 2;
             mvy      = (ps_top_right_mv->i2_mvy + 2)>> 2;
@@ -310,15 +298,13 @@ static void ih264e_get_search_candidates(process_ctxt_t *ps_proc,
             mvx = CLIP3(i4_srch_range_w, i4_srch_range_e, mvx);
             mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy);
 
-            ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = mvx;
-            ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = mvy;
+            ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvx = mvx;
+            ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvy = mvy;
 
             u4_num_candidates ++;
         }
-        /************************************************************/
         /* Taking the TopLt MV Predictor as one of the candidates   */
-        /************************************************************/
-        else if (ps_ngbr_avbl->u1_mb_d)
+        else if(ps_ngbr_avbl->u1_mb_d && i4_top_left_mode)
         {
             mvx      = (ps_top_left_mv->i2_mvx + 2) >> 2;
             mvy      = (ps_top_left_mv->i2_mvy + 2) >> 2;
@@ -326,84 +312,84 @@ static void ih264e_get_search_candidates(process_ctxt_t *ps_proc,
             mvx = CLIP3(i4_srch_range_w, i4_srch_range_e, mvx);
             mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy);
 
-            ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = mvx;
-            ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = mvy;
+            ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvx = mvx;
+            ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvy = mvy;
 
             u4_num_candidates ++;
         }
-        /*else
-        {
-            ps_me_ctxt->as_mv_init_search[TOPR_CAND].i2_mvx = 0;
-            ps_me_ctxt->as_mv_init_search[TOPR_CAND].i2_mvy = 0;
-        }*/
     }
-    /*else
-    {
-        ps_me_ctxt->as_mv_init_search[TOP_CAND].i2_mvx = 0;
-        ps_me_ctxt->as_mv_init_search[TOP_CAND].i2_mvy = 0;
-
-        ps_me_ctxt->as_mv_init_search[TOPR_CAND].i2_mvx = 0;
-        ps_me_ctxt->as_mv_init_search[TOPR_CAND].i2_mvy = 0;
-    }*/
 
 
     /********************************************************************/
     /*                            MV Prediction                         */
     /********************************************************************/
-    ih264e_mv_pred_me(ps_proc);
+    ih264e_mv_pred_me(ps_proc, i4_reflist);
 
-    ps_mb_part->s_mv_pred.i2_mvx = ps_proc->ps_pred_mv->i2_mvx;
-    ps_mb_part->s_mv_pred.i2_mvy = ps_proc->ps_pred_mv->i2_mvy;
+    ps_mb_part->s_mv_pred.i2_mvx = ps_proc->ps_pred_mv[i4_reflist].s_mv.i2_mvx;
+    ps_mb_part->s_mv_pred.i2_mvy = ps_proc->ps_pred_mv[i4_reflist].s_mv.i2_mvy;
 
-    /************************************************************/
     /* Get the skip motion vector                               */
-    /************************************************************/
-    ih264e_find_skip_motion_vector(ps_proc, 1);
+    {
+        ps_me_ctxt->i4_skip_type = ps_proc->ps_codec->apf_find_skip_params_me
+                                    [ps_proc->i4_slice_type](ps_proc, i4_reflist);
 
-    /************************************************************/
-    /* Taking the Skip motion vector as one of the candidates   */
-    /************************************************************/
-    mvx = (ps_skip_mv->i2_mvx + 2) >> 2;
-    mvy = (ps_skip_mv->i2_mvy + 2) >> 2;
+        /* Taking the Skip motion vector as one of the candidates   */
+        mvx = (ps_proc->ps_skip_mv[i4_reflist].s_mv.i2_mvx + 2) >> 2;
+        mvy = (ps_proc->ps_skip_mv[i4_reflist].s_mv.i2_mvy + 2) >> 2;
 
-    mvx = CLIP3(i4_srch_range_w, i4_srch_range_e, mvx);
-    mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy);
+        mvx = CLIP3(i4_srch_range_w, i4_srch_range_e, mvx);
+        mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy);
 
-    ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = mvx;
-    ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = mvy;
+        ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvx = mvx;
+        ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvy = mvy;
+        u4_num_candidates++;
 
-    u4_num_candidates++;
+        if (ps_proc->i4_slice_type == BSLICE)
+        {
+            /* Taking the temporal Skip motion vector as one of the candidates   */
+            mvx = (ps_proc->ps_skip_mv[i4_reflist + 2].s_mv.i2_mvx + 2) >> 2;
+            mvy = (ps_proc->ps_skip_mv[i4_reflist + 2].s_mv.i2_mvy + 2) >> 2;
+
+            mvx = CLIP3(i4_srch_range_w, i4_srch_range_e, mvx);
+            mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy);
 
-    ASSERT(u4_num_candidates <= 5);
+            ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvx = mvx;
+            ps_me_ctxt->as_mv_init_search[i4_reflist][u4_num_candidates].i2_mvy = mvy;
+            u4_num_candidates++;
+        }
+    }
 
-    ps_me_ctxt->u4_num_candidates = u4_num_candidates;
+    ASSERT(u4_num_candidates <= 6);
+
+    ps_me_ctxt->u4_num_candidates[i4_reflist] = u4_num_candidates;
 }
 
 /**
 *******************************************************************************
 *
-* @brief The function gives the skip motion vector
+* @brief The function computes parameters for a PSKIP MB
 *
 * @par Description:
-*  The function gives the skip motion vector
+*  The function updates the skip motion vector and checks whether the
+*  current MB can be coded as a PSKIP MB
 *
-* @param[in] ps_left_mb_pu
-*  pointer to left mb motion vector info
+* @param[in] ps_proc
+*  Pointer to process context
 *
-* @param[in] ps_top_row_pu
-*  pointer to top & top right mb motion vector info
+* @param[in] i4_reflist
+*  Reference list index; unused here, PRED_L0 is always used
 *
-* @param[out] ps_pred_mv
-*  pointer to candidate predictors for the current block
+* @par Side effects:
+*  Writes the computed skip motion vector to ps_proc->ps_skip_mv[PRED_L0]
 *
-* @returns The x & y components of the MV predictor.
+* @returns Flag indicating if the current MB can be marked as skip
 *
-* @remarks The code implements the logic as described in sec 8.4.1.1 in H264
+* @remarks The code implements the logic as described in sec 8.4.1.1 in H264
 *   specification.
 *
 *******************************************************************************
 */
-void ih264e_find_skip_motion_vector(process_ctxt_t *ps_proc, UWORD32 u4_for_me)
+WORD32 ih264e_find_pskip_params(process_ctxt_t *ps_proc, WORD32 i4_reflist)
 {
     /* left mb motion vector */
     enc_pu_t *ps_left_mb_pu ;
@@ -411,35 +397,116 @@ void ih264e_find_skip_motion_vector(process_ctxt_t *ps_proc, UWORD32 u4_for_me)
     /* top mb motion vector */
     enc_pu_t *ps_top_mb_pu ;
 
-    /* skip mv */
-    mv_t *ps_skip_mv = ps_proc->ps_skip_mv;
+    /* Skip mv */
+    mv_t *ps_skip_mv = &ps_proc->ps_skip_mv[PRED_L0].s_mv;
+
+    UNUSED(i4_reflist);
+
+    ps_left_mb_pu = &ps_proc->s_left_mb_pu ;
+    ps_top_mb_pu = ps_proc->ps_top_row_pu + ps_proc->i4_mb_x;
+
+    if ((!ps_proc->ps_ngbr_avbl->u1_mb_a) ||
+        (!ps_proc->ps_ngbr_avbl->u1_mb_b) ||
+        (
+          (ps_left_mb_pu->s_me_info[PRED_L0].i1_ref_idx == -1) &&
+          (ps_left_mb_pu->s_me_info[PRED_L0].s_mv.i2_mvx == 0) &&
+          (ps_left_mb_pu->s_me_info[PRED_L0].s_mv.i2_mvy == 0)
+       ) ||
+       (
+          (ps_top_mb_pu->s_me_info[PRED_L0].i1_ref_idx == -1) &&
+          (ps_top_mb_pu->s_me_info[PRED_L0].s_mv.i2_mvx == 0) &&
+          (ps_top_mb_pu->s_me_info[PRED_L0].s_mv.i2_mvy == 0)
+       )
+     )
 
-    if (u4_for_me == 1)
     {
-        ps_left_mb_pu = &ps_proc->s_left_mb_pu_ME;
-        ps_top_mb_pu = ps_proc->ps_top_row_pu_ME + ps_proc->i4_mb_x;
+        ps_skip_mv->i2_mvx = 0;
+        ps_skip_mv->i2_mvy = 0;
     }
     else
     {
-        ps_left_mb_pu = &ps_proc->s_left_mb_pu ;
-        ps_top_mb_pu = ps_proc->ps_top_row_pu + ps_proc->i4_mb_x;
+        ps_skip_mv->i2_mvx = ps_proc->ps_pred_mv[PRED_L0].s_mv.i2_mvx;
+        ps_skip_mv->i2_mvy = ps_proc->ps_pred_mv[PRED_L0].s_mv.i2_mvy;
+    }
+
+    if ( (ps_proc->ps_pu->s_me_info[PRED_L0].s_mv.i2_mvx == ps_skip_mv->i2_mvx)
+     && (ps_proc->ps_pu->s_me_info[PRED_L0].s_mv.i2_mvy == ps_skip_mv->i2_mvy))
+    {
+        return 1;
     }
 
-    if (  (!ps_proc->ps_ngbr_avbl->u1_mb_a) ||
-          (!ps_proc->ps_ngbr_avbl->u1_mb_b) ||
-          ((ps_left_mb_pu->i1_l0_ref_idx | ps_left_mb_pu->s_l0_mv.i2_mvx | ps_left_mb_pu->s_l0_mv.i2_mvy) == 0) ||
-          ((ps_top_mb_pu->i1_l0_ref_idx | ps_top_mb_pu->s_l0_mv.i2_mvx | ps_top_mb_pu->s_l0_mv.i2_mvy) == 0) )
+    return 0;
+}
+
+/**
+*******************************************************************************
+*
+* @brief The function computes the PSKIP motion vector for use during ME
+*
+* @par Description:
+*  The function updates the skip motion vector using the ME copies of the
+*  left and top MB info; it does not check whether the current MB is a skip
+*
+* @param[in] ps_proc
+*  Pointer to process context
+*
+* @param[in] i4_reflist
+*  Reference list index; unused here, PRED_L0 is always used
+*
+* @par Side effects:
+*  Writes the computed skip motion vector to ps_proc->ps_skip_mv[PRED_L0]
+*
+* @returns Always PRED_L0
+*
+* @remarks The code implements the logic as described in sec 8.4.1.1 in H264
+*   specification.
+*
+*******************************************************************************
+*/
+WORD32 ih264e_find_pskip_params_me(process_ctxt_t *ps_proc, WORD32 i4_reflist)
+{
+    /* left mb motion vector */
+    enc_pu_t *ps_left_mb_pu ;
+
+    /* top mb motion vector */
+    enc_pu_t *ps_top_mb_pu ;
+
+    /* Skip mv */
+    mv_t *ps_skip_mv = &ps_proc->ps_skip_mv[PRED_L0].s_mv;
+
+    UNUSED(i4_reflist);
+
+    ps_left_mb_pu = &ps_proc->s_left_mb_pu_ME;
+    ps_top_mb_pu = ps_proc->ps_top_row_pu_ME + ps_proc->i4_mb_x;
+
+    if ((!ps_proc->ps_ngbr_avbl->u1_mb_a) ||
+        (!ps_proc->ps_ngbr_avbl->u1_mb_b) ||
+        (
+          (ps_left_mb_pu->s_me_info[PRED_L0].i1_ref_idx == -1) &&
+          (ps_left_mb_pu->s_me_info[PRED_L0].s_mv.i2_mvx == 0) &&
+          (ps_left_mb_pu->s_me_info[PRED_L0].s_mv.i2_mvy == 0)
+        ) ||
+        (
+          (ps_top_mb_pu->s_me_info[PRED_L0].i1_ref_idx == -1) &&
+          (ps_top_mb_pu->s_me_info[PRED_L0].s_mv.i2_mvx == 0) &&
+          (ps_top_mb_pu->s_me_info[PRED_L0].s_mv.i2_mvy == 0)
+        )
+     )
+
     {
         ps_skip_mv->i2_mvx = 0;
         ps_skip_mv->i2_mvy = 0;
     }
     else
     {
-        ps_skip_mv->i2_mvx = ps_proc->ps_pred_mv->i2_mvx;
-        ps_skip_mv->i2_mvy = ps_proc->ps_pred_mv->i2_mvy;
+        ps_skip_mv->i2_mvx = ps_proc->ps_pred_mv[PRED_L0].s_mv.i2_mvx;
+        ps_skip_mv->i2_mvy = ps_proc->ps_pred_mv[PRED_L0].s_mv.i2_mvy;
     }
+
+    return PRED_L0;
 }
 
+
 /**
 *******************************************************************************
 *
@@ -469,61 +536,64 @@ void ih264e_find_skip_motion_vector(process_ctxt_t *ps_proc, UWORD32 u4_for_me)
 */
 void ih264e_get_mv_predictor(enc_pu_t *ps_left_mb_pu,
                              enc_pu_t *ps_top_row_pu,
-                             mv_t *ps_pred_mv)
+                             enc_pu_mv_t *ps_pred_mv,
+                             WORD32 i4_ref_list)
 {
-    /* curr frame ref idx */
-    /* we are assuming that we are operating on single reference frame
-     * hence the ref idx is insignificant during mv prediction.
-     */
-    WORD32 u4_ref_idx = 0;
 
-    /* temp var */
-    WORD32 pred_algo = 3, a, b, c;
-
-    /* If only one of the candidate blocks has a reference frame equal to
-     * the current block then use the same block as the final predictor */
-    a = (ps_left_mb_pu->i1_l0_ref_idx == u4_ref_idx)? 0:-1;
-    b = (ps_top_row_pu[0].i1_l0_ref_idx == u4_ref_idx)? 0:-1;
-    c = (ps_top_row_pu[1].i1_l0_ref_idx == u4_ref_idx)? 0:-1;
-
-    if (a == 0 && b == -1 && c == -1)
-        pred_algo = 0; /* LEFT */
-    else if (a == -1 && b == 0 && c == -1)
-        pred_algo = 1; /* TOP */
-    else if (a == -1 && b == -1 && c == 0)
-        pred_algo = 2; /* TOP RIGHT */
-
-    switch (pred_algo)
-    {
-        case 0:
-            /* left */
-            ps_pred_mv->i2_mvx = ps_left_mb_pu->s_l0_mv.i2_mvx;
-            ps_pred_mv->i2_mvy = ps_left_mb_pu->s_l0_mv.i2_mvy;
-            break;
-        case 1:
-            /* top */
-            ps_pred_mv->i2_mvx = ps_top_row_pu[0].s_l0_mv.i2_mvx;
-            ps_pred_mv->i2_mvy = ps_top_row_pu[0].s_l0_mv.i2_mvy;
-            break;
-        case 2:
-            /* top right */
-            ps_pred_mv->i2_mvx = ps_top_row_pu[1].s_l0_mv.i2_mvx;
-            ps_pred_mv->i2_mvy = ps_top_row_pu[1].s_l0_mv.i2_mvy;
-            break;
-        case 3:
-            /* median */
-            MEDIAN(ps_left_mb_pu->s_l0_mv.i2_mvx,
-                   ps_top_row_pu[0].s_l0_mv.i2_mvx,
-                   ps_top_row_pu[1].s_l0_mv.i2_mvx,
-                   ps_pred_mv->i2_mvx);
-            MEDIAN(ps_left_mb_pu->s_l0_mv.i2_mvy,
-                   ps_top_row_pu[0].s_l0_mv.i2_mvy,
-                   ps_top_row_pu[1].s_l0_mv.i2_mvy,
-                   ps_pred_mv->i2_mvy);
+    /* Reference index value the neighbours are compared against */
+    WORD8 i1_ref_idx;
 
-            break;
-        default:
-            break;
+    /* Value matched against the neighbours' ref idx in list i4_ref_list */
+    i1_ref_idx = -1;
+    {
+        /* temp var */
+        WORD32 pred_algo = 3, a, b, c;
+
+        /* If only one of the candidate blocks has a reference frame equal to
+         * the current block then use the same block as the final predictor */
+        a = (ps_left_mb_pu->s_me_info[i4_ref_list].i1_ref_idx == i1_ref_idx) ? 0 : -1;
+        b = (ps_top_row_pu[0].s_me_info[i4_ref_list].i1_ref_idx == i1_ref_idx) ? 0 : -1;
+        c = (ps_top_row_pu[1].s_me_info[i4_ref_list].i1_ref_idx == i1_ref_idx) ? 0 : -1;
+
+        if (a == 0 && b == -1 && c == -1)
+            pred_algo = 0; /* LEFT */
+        else if(a == -1 && b == 0 && c == -1)
+            pred_algo = 1; /* TOP */
+        else if(a == -1 && b == -1 && c == 0)
+            pred_algo = 2; /* TOP RIGHT */
+
+        switch (pred_algo)
+        {
+            case 0:
+                /* left */
+                ps_pred_mv->s_mv.i2_mvx = ps_left_mb_pu->s_me_info[i4_ref_list].s_mv.i2_mvx;
+                ps_pred_mv->s_mv.i2_mvy = ps_left_mb_pu->s_me_info[i4_ref_list].s_mv.i2_mvy;
+                break;
+            case 1:
+                /* top */
+                ps_pred_mv->s_mv.i2_mvx = ps_top_row_pu[0].s_me_info[i4_ref_list].s_mv.i2_mvx;
+                ps_pred_mv->s_mv.i2_mvy = ps_top_row_pu[0].s_me_info[i4_ref_list].s_mv.i2_mvy;
+                break;
+            case 2:
+                /* top right */
+                ps_pred_mv->s_mv.i2_mvx = ps_top_row_pu[1].s_me_info[i4_ref_list].s_mv.i2_mvx;
+                ps_pred_mv->s_mv.i2_mvy = ps_top_row_pu[1].s_me_info[i4_ref_list].s_mv.i2_mvy;
+                break;
+            case 3:
+                /* median */
+                MEDIAN(ps_left_mb_pu->s_me_info[i4_ref_list].s_mv.i2_mvx,
+                       ps_top_row_pu[0].s_me_info[i4_ref_list].s_mv.i2_mvx,
+                       ps_top_row_pu[1].s_me_info[i4_ref_list].s_mv.i2_mvx,
+                       ps_pred_mv->s_mv.i2_mvx);
+                MEDIAN(ps_left_mb_pu->s_me_info[i4_ref_list].s_mv.i2_mvy,
+                       ps_top_row_pu[0].s_me_info[i4_ref_list].s_mv.i2_mvy,
+                       ps_top_row_pu[1].s_me_info[i4_ref_list].s_mv.i2_mvy,
+                       ps_pred_mv->s_mv.i2_mvy);
+
+                break;
+            default:
+                break;
+        }
     }
 }
 
@@ -545,31 +615,34 @@ void ih264e_get_mv_predictor(enc_pu_t *ps_left_mb_pu,
 *
 *******************************************************************************
 */
-void ih264e_mv_pred(process_ctxt_t *ps_proc)
+void ih264e_mv_pred(process_ctxt_t *ps_proc, WORD32 i4_slice_type)
 {
 
     /* left mb motion vector */
-    enc_pu_t *ps_left_mb_pu ;
+    enc_pu_t *ps_left_mb_pu;
 
     /* top left mb motion vector */
-    enc_pu_t *ps_top_left_mb_pu ;
+    enc_pu_t *ps_top_left_mb_pu;
 
     /* top row motion vector info */
     enc_pu_t *ps_top_row_pu;
 
     /* predicted motion vector */
-    mv_t *ps_pred_mv = ps_proc->ps_pred_mv;
+    enc_pu_mv_t *ps_pred_mv = ps_proc->ps_pred_mv;
 
     /* zero mv */
-    mv_t zero_mv = {0, 0};
+    mv_t zero_mv = { 0, 0 };
 
     /*  mb neighbor availability */
     block_neighbors_t *ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
 
     /* mb syntax elements of neighbors */
-    mb_info_t   *ps_top_syn = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
-    mb_info_t   *ps_top_left_syn;
-    UWORD32     u4_left_is_intra;
+    mb_info_t *ps_top_syn = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
+    mb_info_t *ps_top_left_syn;
+    UWORD32 u4_left_is_intra;
+
+    /* Temp var */
+    WORD32 i4_reflist, max_reflist, i4_cmpl_predmode;
 
     ps_top_left_syn = &(ps_proc->s_top_left_mb_syntax_ele);
     u4_left_is_intra = ps_proc->s_left_mb_syntax_ele.u2_is_intra;
@@ -577,44 +650,58 @@ void ih264e_mv_pred(process_ctxt_t *ps_proc)
     ps_top_left_mb_pu = &ps_proc->s_top_left_mb_pu;
     ps_top_row_pu = (ps_proc->ps_top_row_pu + ps_proc->i4_mb_x);
 
-    /* Before performing mv prediction prepare the ngbr information and
-     * reset motion vectors basing on their availability */
-    if (!ps_ngbr_avbl->u1_mb_a || (u4_left_is_intra == 1) )
-    {
-        /* left mv */
-        ps_left_mb_pu->i1_l0_ref_idx = -1;
-        ps_left_mb_pu->s_l0_mv = zero_mv;
-    }
-    if (!ps_ngbr_avbl->u1_mb_b || ps_top_syn->u2_is_intra)
-    {
-        /* top mv */
-        ps_top_row_pu[0].i1_l0_ref_idx = -1;
-        ps_top_row_pu[0].s_l0_mv = zero_mv;
-    }
-    if (!ps_ngbr_avbl->u1_mb_c)
+    /* Number of ref lists to process */
+    max_reflist = (i4_slice_type == PSLICE) ? 1 : 2;
+
+    for (i4_reflist = 0; i4_reflist < max_reflist; i4_reflist++)
     {
-        /* top right mv - When top right partition is not available for
-         * prediction if top left is available use it for prediction else
-         * set the mv information to -1 and (0, 0)
-         * */
-        if (!ps_ngbr_avbl->u1_mb_d || ps_top_left_syn->u2_is_intra)
+        i4_cmpl_predmode = (i4_reflist == 0) ? PRED_L1 : PRED_L0;
+
+        /* Before performing mv prediction prepare the ngbr information and
+         * reset motion vectors basing on their availability */
+        if (!ps_ngbr_avbl->u1_mb_a || (u4_left_is_intra == 1)
+                        || (ps_left_mb_pu->b2_pred_mode == i4_cmpl_predmode))
         {
-            ps_top_row_pu[1].i1_l0_ref_idx = -1;
-            ps_top_row_pu[1].s_l0_mv = zero_mv;
+            /* left mv */
+            ps_left_mb_pu->s_me_info[i4_reflist].i1_ref_idx = 0;
+            ps_left_mb_pu->s_me_info[i4_reflist].s_mv = zero_mv;
         }
-        else
+        if (!ps_ngbr_avbl->u1_mb_b || ps_top_syn->u2_is_intra
+                        || (ps_top_row_pu[0].b2_pred_mode == i4_cmpl_predmode))
         {
-            ps_top_row_pu[1].i1_l0_ref_idx = ps_top_left_mb_pu->i1_l0_ref_idx;
-            ps_top_row_pu[1].s_l0_mv = ps_top_left_mb_pu->s_l0_mv;
+            /* top mv */
+            ps_top_row_pu[0].s_me_info[i4_reflist].i1_ref_idx = 0;
+            ps_top_row_pu[0].s_me_info[i4_reflist].s_mv = zero_mv;
         }
-    }
-    else if (ps_top_syn[1].u2_is_intra)
-    {
-        ps_top_row_pu[1].i1_l0_ref_idx = -1;
-        ps_top_row_pu[1].s_l0_mv = zero_mv;
+
+        if (!ps_ngbr_avbl->u1_mb_c)
+        {
+            /* top right mv - When top right partition is not available for
+             * prediction if top left is available use it for prediction else
+             * set the ref idx to 0 and the mv to (0, 0)
+             * */
+            if (!ps_ngbr_avbl->u1_mb_d || ps_top_left_syn->u2_is_intra
+                            || (ps_top_left_mb_pu->b2_pred_mode == i4_cmpl_predmode))
+            {
+                ps_top_row_pu[1].s_me_info[i4_reflist].i1_ref_idx = 0;
+                ps_top_row_pu[1].s_me_info[i4_reflist].s_mv = zero_mv;
+            }
+            else
+            {
+                ps_top_row_pu[1].s_me_info[i4_reflist].i1_ref_idx = ps_top_left_mb_pu->s_me_info[i4_reflist].i1_ref_idx;
+                ps_top_row_pu[1].s_me_info[i4_reflist].s_mv = ps_top_left_mb_pu->s_me_info[i4_reflist].s_mv;
+            }
+        }
+        else if(ps_top_syn[1].u2_is_intra
+                        || (ps_top_row_pu[1].b2_pred_mode == i4_cmpl_predmode))
+        {
+            ps_top_row_pu[1].s_me_info[i4_reflist].i1_ref_idx = 0;
+            ps_top_row_pu[1].s_me_info[i4_reflist].s_mv = zero_mv;
+        }
+
+        ih264e_get_mv_predictor(ps_left_mb_pu, ps_top_row_pu, &ps_pred_mv[i4_reflist], i4_reflist);
     }
 
-    ih264e_get_mv_predictor(ps_left_mb_pu, ps_top_row_pu, ps_pred_mv);
 }
 
 /**
@@ -635,7 +722,7 @@ void ih264e_mv_pred(process_ctxt_t *ps_proc)
 *
 *******************************************************************************
 */
-void ih264e_mv_pred_me(process_ctxt_t *ps_proc)
+void ih264e_mv_pred_me(process_ctxt_t *ps_proc, WORD32 i4_ref_list)
 {
     /* left mb motion vector */
     enc_pu_t *ps_left_mb_pu ;
@@ -649,11 +736,14 @@ void ih264e_mv_pred_me(process_ctxt_t *ps_proc)
     enc_pu_t s_top_row_pu[2];
 
     /* predicted motion vector */
-    mv_t *ps_pred_mv = ps_proc->ps_pred_mv;
+    enc_pu_mv_t *ps_pred_mv = ps_proc->ps_pred_mv;
 
     /* zero mv */
     mv_t zero_mv = {0, 0};
 
+    /* Complementary pred mode */
+    WORD32 i4_cmpl_predmode = (i4_ref_list == 0) ? PRED_L1 : PRED_L0;
+
     /*  mb neighbor availability */
     block_neighbors_t *ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
 
@@ -664,19 +754,23 @@ void ih264e_mv_pred_me(process_ctxt_t *ps_proc)
     s_top_row_pu[0] = ps_top_row_pu[0];
     s_top_row_pu[1] = ps_top_row_pu[1];
 
-    /* Before performing mv prediction prepare the ngbr information and
-     * reset motion vectors basing on their availability */
-    if (!ps_ngbr_avbl->u1_mb_a  )
+    /*
+     * Before performing mv prediction prepare the ngbr information and
+     * reset motion vectors basing on their availability
+     */
+
+    if (!ps_ngbr_avbl->u1_mb_a || (ps_left_mb_pu->b2_pred_mode == i4_cmpl_predmode))
     {
         /* left mv */
-        ps_left_mb_pu->i1_l0_ref_idx = -1;
-        ps_left_mb_pu->s_l0_mv = zero_mv;
+        ps_left_mb_pu->s_me_info[i4_ref_list].i1_ref_idx = 0;
+        ps_left_mb_pu->s_me_info[i4_ref_list].s_mv = zero_mv;
     }
-    if (!ps_ngbr_avbl->u1_mb_b )
+    if (!ps_ngbr_avbl->u1_mb_b || (s_top_row_pu[0].b2_pred_mode == i4_cmpl_predmode))
     {
         /* top mv */
-        s_top_row_pu[0].i1_l0_ref_idx = -1;
-        s_top_row_pu[0].s_l0_mv = zero_mv;
+        s_top_row_pu[0].s_me_info[i4_ref_list].i1_ref_idx = 0;
+        s_top_row_pu[0].s_me_info[i4_ref_list].s_mv = zero_mv;
+
     }
     if (!ps_ngbr_avbl->u1_mb_c)
     {
@@ -684,19 +778,28 @@ void ih264e_mv_pred_me(process_ctxt_t *ps_proc)
          * prediction if top left is available use it for prediction else
          * set the mv information to -1 and (0, 0)
          * */
-        if (!ps_ngbr_avbl->u1_mb_d)
+        if (!ps_ngbr_avbl->u1_mb_d || (ps_top_left_mb_pu->b2_pred_mode == i4_cmpl_predmode))
         {
-            s_top_row_pu[1].i1_l0_ref_idx = -1;
-            s_top_row_pu[1].s_l0_mv = zero_mv;
+            s_top_row_pu[1].s_me_info[i4_ref_list].i1_ref_idx = 0;
+            s_top_row_pu[1].s_me_info[i4_ref_list].s_mv = zero_mv;
+
+            s_top_row_pu[1].s_me_info[i4_ref_list].i1_ref_idx = 0;
+            s_top_row_pu[1].s_me_info[i4_ref_list].s_mv = zero_mv;
         }
         else
         {
-            s_top_row_pu[1].i1_l0_ref_idx = ps_top_left_mb_pu->i1_l0_ref_idx;
-            s_top_row_pu[1].s_l0_mv = ps_top_left_mb_pu->s_l0_mv;
+            s_top_row_pu[1].s_me_info[i4_ref_list].i1_ref_idx = ps_top_left_mb_pu->s_me_info[0].i1_ref_idx;
+            s_top_row_pu[1].s_me_info[i4_ref_list].s_mv = ps_top_left_mb_pu->s_me_info[0].s_mv;
         }
     }
+    else if (ps_top_row_pu[1].b2_pred_mode == i4_cmpl_predmode)
+    {
+        ps_top_row_pu[1].s_me_info[i4_ref_list].i1_ref_idx = 0;
+        ps_top_row_pu[1].s_me_info[i4_ref_list].s_mv = zero_mv;
+    }
 
-    ih264e_get_mv_predictor(ps_left_mb_pu, &(s_top_row_pu[0]), ps_pred_mv);
+    ih264e_get_mv_predictor(ps_left_mb_pu, &(s_top_row_pu[0]),
+                            &ps_pred_mv[i4_ref_list], i4_ref_list);
 }
 
 /**
@@ -722,20 +825,40 @@ void ih264e_init_me(process_ctxt_t *ps_proc)
     /* me ctxt */
     me_ctxt_t *ps_me_ctxt = &ps_proc->s_me_ctxt;
 
+    /* codec context */
+    codec_t *ps_codec = ps_proc->ps_codec;
+
+    ps_me_ctxt->i4_skip_bias[BSLICE] = SKIP_BIAS_B;
+
+    if (ps_codec->s_cfg.u4_num_bframes == 0)
+    {
+       ps_me_ctxt->i4_skip_bias[PSLICE] = 4 * SKIP_BIAS_P;
+    }
+    else
+    {
+       ps_me_ctxt->i4_skip_bias[PSLICE] =  SKIP_BIAS_P;
+    }
+
     /* src ptr */
     ps_me_ctxt->pu1_src_buf_luma = ps_proc->pu1_src_buf_luma;
+    /* src stride */
+    ps_me_ctxt->i4_src_strd = ps_proc->i4_src_strd;
 
-    /* ref ptr */
-    ps_me_ctxt->pu1_ref_buf_luma = ps_proc->pu1_ref_buf_luma;
+    /* ref ptrs and corresponding lagrange params */
+    ps_me_ctxt->apu1_ref_buf_luma[0] = ps_proc->apu1_ref_buf_luma[0];
+    ps_me_ctxt->apu1_ref_buf_luma[1] = ps_proc->apu1_ref_buf_luma[1];
 
-    /* lagrange param */
     ps_me_ctxt->u4_lambda_motion = gu1_qp0[ps_me_ctxt->u1_mb_qp];
+
+
 }
 
+
 /**
 *******************************************************************************
 *
-* @brief This function performs motion estimation for the current mb
+* @brief This function performs motion estimation for the current mb using
+*   single reference list
 *
 * @par Description:
 *  The current mb is compared with a list of mb's in the reference frame for
@@ -753,7 +876,7 @@ void ih264e_init_me(process_ctxt_t *ps_proc)
 *
 *******************************************************************************
 */
-void ih264e_compute_me(process_ctxt_t *ps_proc)
+void ih264e_compute_me_single_reflist(process_ctxt_t *ps_proc)
 {
     /* me ctxt */
     me_ctxt_t *ps_me_ctxt = &ps_proc->s_me_ctxt;
@@ -761,20 +884,6 @@ void ih264e_compute_me(process_ctxt_t *ps_proc)
     /* codec context */
     codec_t *ps_codec = ps_proc->ps_codec;
 
-//    /* mb syntax elements of neighbors */
-//    mb_info_t *ps_top_syn = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
-//    mb_info_t *ps_top_left_syn = &(ps_proc->s_top_left_mb_syntax_ME);
-
-    /* mb part info */
-    mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part;
-    mb_part_ctxt skip_mb_part_info;
-
-    /* temp var */
-    WORD32 rows_above, rows_below, columns_left, columns_right,u4_use_stat_sad;
-
-    /* Motion vectors in full-pel units */
-    WORD16 mv_x, mv_y;
-
     /* recon stride */
     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
 
@@ -787,118 +896,104 @@ void ih264e_compute_me(process_ctxt_t *ps_proc)
     /* Sad therholds */
     ps_me_ctxt->pu2_sad_thrsh = ps_qp_params->pu2_sad_thrsh;
 
-    /*Best half pel buffer*/
-    UWORD8 *pu1_best_subpel_buf = ps_proc->pu1_best_subpel_buf;
-    UWORD32 u4_bst_spel_strd = ps_proc->u4_bst_spel_buf_strd;
-
-    /* During evaluation for motion vectors do not search through padded regions */
-    /* Obtain number of rows and columns that are effective for computing for me evaluation */
-    rows_above = MB_SIZE + ps_proc->i4_mb_y * MB_SIZE;
-    rows_below = (ps_proc->i4_ht_mbs - ps_proc->i4_mb_y) * MB_SIZE;
-    columns_left = MB_SIZE + ps_proc->i4_mb_x * MB_SIZE;
-    columns_right = (ps_proc->i4_wd_mbs - ps_proc->i4_mb_x) * MB_SIZE;
+    /* Mb part ctxts for SKIP */
+    mb_part_ctxt s_skip_mbpart;
 
-    /* init srch range */
-    /* NOTE : For now, lets limit the search range by DEFAULT_MAX_SRCH_RANGE_X / 2
-     * on all sides.
-     */
-//    ps_me_ctxt->i4_srch_range_w = -MIN(columns_left, ps_me_ctxt->ai2_srch_boundaries[0]);
-//    ps_me_ctxt->i4_srch_range_e = MIN(columns_right, ps_me_ctxt->ai2_srch_boundaries[0]);
-//    ps_me_ctxt->i4_srch_range_n = -MIN(rows_above, ps_me_ctxt->ai2_srch_boundaries[1]);
-//    ps_me_ctxt->i4_srch_range_s = MIN(rows_below, ps_me_ctxt->ai2_srch_boundaries[1]);
-
-    ps_me_ctxt->i4_srch_range_w = -MIN(columns_left, DEFAULT_MAX_SRCH_RANGE_X >> 1);
-    ps_me_ctxt->i4_srch_range_e = MIN(columns_right, DEFAULT_MAX_SRCH_RANGE_X >> 1);
-    ps_me_ctxt->i4_srch_range_n = -MIN(rows_above, DEFAULT_MAX_SRCH_RANGE_Y >> 1);
-    ps_me_ctxt->i4_srch_range_s = MIN(rows_below, DEFAULT_MAX_SRCH_RANGE_Y >> 1);
-
-    /* this is to facilitate fast sub pel computation with minimal loads */
-    if (ps_me_ctxt->u4_enable_hpel)
     {
+        WORD32 rows_above, rows_below, columns_left, columns_right;
+
+        /* During evaluation for motion vectors do not search through padded regions */
+        /* Obtain number of rows and columns that are effective for computing for me evaluation */
+        rows_above = MB_SIZE + ps_proc->i4_mb_y * MB_SIZE;
+        rows_below = (ps_proc->i4_ht_mbs - ps_proc->i4_mb_y) * MB_SIZE;
+        columns_left = MB_SIZE + ps_proc->i4_mb_x * MB_SIZE;
+        columns_right = (ps_proc->i4_wd_mbs - ps_proc->i4_mb_x) * MB_SIZE;
+
+        /* init srch range */
+        /* NOTE : For now, lets limit the search range by DEFAULT_MAX_SRCH_RANGE_X / 2
+         * on all sides.
+         */
+        ps_me_ctxt->i4_srch_range_w = -MIN(columns_left, DEFAULT_MAX_SRCH_RANGE_X >> 1);
+        ps_me_ctxt->i4_srch_range_e = MIN(columns_right, DEFAULT_MAX_SRCH_RANGE_X >> 1);
+        ps_me_ctxt->i4_srch_range_n = -MIN(rows_above, DEFAULT_MAX_SRCH_RANGE_Y >> 1);
+        ps_me_ctxt->i4_srch_range_s = MIN(rows_below, DEFAULT_MAX_SRCH_RANGE_Y >> 1);
+
+        /* this is to facilitate fast sub pel computation with minimal loads */
         ps_me_ctxt->i4_srch_range_w += 1;
         ps_me_ctxt->i4_srch_range_e -= 1;
         ps_me_ctxt->i4_srch_range_n += 1;
         ps_me_ctxt->i4_srch_range_s -= 1;
     }
 
-    /*Initialize the min sad option*/
-    ps_me_ctxt->u4_min_sad_reached  = 0;    /*Not yet found min sad*/
-    ps_me_ctxt->i4_min_sad          = ps_proc->ps_cur_mb->u4_min_sad;
+    /* Compute ME and store the MVs */
 
-    /************************************************************/
-    /* Get the seed motion vector candidates                    */
-    /************************************************************/
-    ih264e_get_search_candidates(ps_proc, ps_me_ctxt);
-
-    /************************************************************/
-    /* Init the MB part ctxt structure                          */
-    /************************************************************/
-    ps_mb_part->s_mv_curr.i2_mvx = 0;
-    ps_mb_part->s_mv_curr.i2_mvy = 0;
-    ps_mb_part->i4_mb_cost = INT_MAX;
-    ps_mb_part->i4_mb_distortion = INT_MAX;
-
-    /* With NMB changes this logic will not work as we cannot exit NME in between*/
-    /********************************************************************/
-    /*                  Analyse skip                                    */
-    /********************************************************************/
-//    if (ps_proc->ps_codec->s_cfg.u4_enable_satqd == 0
-//                    && u4_frame_level_me == 0)
-//    {
-//        if ( (ps_proc->ps_ngbr_avbl->u1_mb_a && (ps_me_ctxt->u4_left_is_skip == 1)) ||
-//                        (ps_proc->ps_ngbr_avbl->u1_mb_b && ps_top_syn->u2_mb_type == PSKIP) ||
-//                        (ps_proc->ps_ngbr_avbl->u1_mb_d && ps_top_left_syn->u2_mb_type == PSKIP) )
-//        {
-//            if ( 0 == ih264e_analyse_skip(ps_proc, ps_me_ctxt) )
-//            {
-//                return;
-//            }
-//        }
-//    }
-
-    /********************************************************************/
-    /*                  compute skip cost                               */
-    /********************************************************************/
-    /* See if we need to use modified sad */
-    u4_use_stat_sad = (ps_proc->ps_codec->s_cfg.u4_enable_satqd == 1);
-
-    /* init the cost of skip MB */
-    skip_mb_part_info.i4_mb_cost = INT_MAX;
-    ime_compute_skip_cost(ps_me_ctxt, ps_proc->ps_skip_mv, &skip_mb_part_info, u4_use_stat_sad);
+    /***********************************************************************
+     * Compute ME for list L0
+     ***********************************************************************/
 
+    /* Init SATQD for the current list */
+    ps_me_ctxt->u4_min_sad_reached  = 0;
+    ps_me_ctxt->i4_min_sad = ps_proc->ps_cur_mb->u4_min_sad;
 
-    if (ps_me_ctxt->u4_min_sad_reached == 0)
+    /* Get the seed motion vector candidates                    */
+    ih264e_get_search_candidates(ps_proc, ps_me_ctxt, PRED_L0);
+
+    /* ****************************************************************
+     *Evaluate the SKIP for current list
+     * ****************************************************************/
+    s_skip_mbpart.s_mv_curr.i2_mvx = 0;
+    s_skip_mbpart.s_mv_curr.i2_mvy = 0;
+    s_skip_mbpart.i4_mb_cost = INT_MAX;
+    s_skip_mbpart.i4_mb_distortion = INT_MAX;
+
+    ime_compute_skip_cost( ps_me_ctxt,
+                           (ime_mv_t *)(&ps_proc->ps_skip_mv[PRED_L0].s_mv),
+                           &s_skip_mbpart,
+                           ps_proc->ps_codec->s_cfg.u4_enable_satqd,
+                           PRED_L0,
+                           0 /* Not a Bslice */ );
+
+    s_skip_mbpart.s_mv_curr.i2_mvx <<= 2;
+    s_skip_mbpart.s_mv_curr.i2_mvy <<= 2;
+
+    /******************************************************************
+     * Evaluate ME For current list
+     *****************************************************************/
+    ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr.i2_mvx = 0;
+    ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr.i2_mvy = 0;
+    ps_me_ctxt->as_mb_part[PRED_L0].i4_mb_cost = INT_MAX;
+    ps_me_ctxt->as_mb_part[PRED_L0].i4_mb_distortion = INT_MAX;
+
+    /* Init Hpel */
+    ps_me_ctxt->as_mb_part[PRED_L0].pu1_best_hpel_buf = NULL;
+
+    /* In case we found out the minimum SAD, exit the ME eval */
+    if (!ps_me_ctxt->u4_min_sad_reached)
     {
-        /************************************************************/
-        /* Evaluate search candidates for initial mv pt.            */
-        /************************************************************/
-        ime_evaluate_init_srchposn_16x16(ps_me_ctxt);
+        /* Evaluate search candidates for initial mv pt */
+        ime_evaluate_init_srchposn_16x16(ps_me_ctxt, PRED_L0);
 
         /********************************************************************/
         /*                  full pel motion estimation                      */
         /********************************************************************/
-        ime_full_pel_motion_estimation_16x16(ps_me_ctxt);
+        ime_full_pel_motion_estimation_16x16(ps_me_ctxt, PRED_L0);
 
-        DEBUG_MV_HISTOGRAM_ADD((ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvx >> 2),
-                               (ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvy >> 2));
+        /* Scale the MV to qpel resolution */
+        ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr.i2_mvx <<= 2;
+        ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr.i2_mvy <<= 2;
 
-        DEBUG_SAD_HISTOGRAM_ADD(ps_me_ctxt->s_mb_part.i4_mb_distortion, 1);
-        /********************************************************************/
-        /*                   sub pel motion estimation                      */
-        /********************************************************************/
         if (ps_me_ctxt->u4_enable_hpel)
         {
-            /* motion vectors in terms of full pel values */
-            mv_x = ps_mb_part->s_mv_curr.i2_mvx >> 2;
-            mv_y = ps_mb_part->s_mv_curr.i2_mvy >> 2;
-
             /* moving src pointer to the converged motion vector location*/
-            pu1_hpel_src = ps_me_ctxt->pu1_ref_buf_luma + mv_x + (mv_y * i4_rec_strd);
+            pu1_hpel_src =   ps_me_ctxt->apu1_ref_buf_luma[PRED_L0]
+                             + (ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr.i2_mvx >> 2)
+                             + (ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr.i2_mvy >> 2)* i4_rec_strd;
 
-            ps_me_ctxt->pu1_half_x = ps_proc->pu1_half_x;
-            ps_me_ctxt->pu1_half_y = ps_proc->pu1_half_y;
-            ps_me_ctxt->pu1_half_xy = ps_proc->pu1_half_xy;
-            ps_me_ctxt->u4_hp_buf_strd = HP_BUFF_WD;
+            ps_me_ctxt->apu1_subpel_buffs[0] = ps_proc->apu1_subpel_buffs[0];
+            ps_me_ctxt->apu1_subpel_buffs[1] = ps_proc->apu1_subpel_buffs[1];
+            ps_me_ctxt->apu1_subpel_buffs[2] = ps_proc->apu1_subpel_buffs[2];
+
+            ps_me_ctxt->u4_subpel_buf_strd = HP_BUFF_WD;
 
             /* half  pel search is done for both sides of full pel,
              * hence half_x of width x height = 17x16 is created
@@ -907,9 +1002,9 @@ void ih264e_compute_me(process_ctxt_t *ps_proc)
 
             /* computing half_x */
             ps_codec->pf_ih264e_sixtapfilter_horz(pu1_hpel_src,
-                                                  ps_proc->pu1_half_x,
+                                                  ps_me_ctxt->apu1_subpel_buffs[0],
                                                   i4_rec_strd,
-                                                  ps_me_ctxt->u4_hp_buf_strd);
+                                                  ps_me_ctxt->u4_subpel_buf_strd);
 
             /*
              * Halfpel search is done for both sides of full pel,
@@ -918,61 +1013,57 @@ void ih264e_compute_me(process_ctxt_t *ps_proc)
              * for half_xy top_left is required
              * hence it starts from pu1_hpel_src = full_pel_converged_point - i4_rec_strd - 1
              */
-
             pu1_hpel_src -= i4_rec_strd;
 
             /* computing half_y , and half_xy*/
             ps_codec->pf_ih264e_sixtap_filter_2dvh_vert(
-                            pu1_hpel_src, ps_proc->pu1_half_y,
-                            ps_proc->pu1_half_xy, i4_rec_strd,
-                            ps_me_ctxt->u4_hp_buf_strd, ps_proc->ai16_pred1 + 3,
-                            ps_me_ctxt->u4_hp_buf_strd);
+                            pu1_hpel_src, ps_me_ctxt->apu1_subpel_buffs[1],
+                            ps_me_ctxt->apu1_subpel_buffs[2], i4_rec_strd,
+                            ps_me_ctxt->u4_subpel_buf_strd, ps_proc->ai16_pred1 + 3,
+                            ps_me_ctxt->u4_subpel_buf_strd);
 
-            ime_sub_pel_motion_estimation_16x16(ps_me_ctxt);
+            ime_sub_pel_motion_estimation_16x16(ps_me_ctxt, PRED_L0);
         }
     }
 
-    {
 
-        /* if skip gives a better cost than other search, copy the cost accordingly*/
-        if (skip_mb_part_info.i4_mb_cost < ps_mb_part->i4_mb_cost)
-        {
-            ps_mb_part->i4_mb_cost = skip_mb_part_info.i4_mb_cost;
-            ps_mb_part->i4_mb_distortion = skip_mb_part_info.i4_mb_distortion;
-            ps_mb_part->s_mv_curr.i2_mvx = skip_mb_part_info.s_mv_curr.i2_mvx;
-            ps_mb_part->s_mv_curr.i2_mvy = skip_mb_part_info.s_mv_curr.i2_mvy;
-        }
-        else
-        {
-            /*
-             * If the current MB has a sub pel component,
-             * we need to copy that to the best subpel buffer
-             */
-            if (ps_me_ctxt->u4_enable_hpel && ps_mb_part->pu1_best_hpel_buf)
-            {
-                ps_codec->pf_inter_pred_luma_copy(ps_mb_part->pu1_best_hpel_buf,
-                                                  pu1_best_subpel_buf,
-                                                  ps_me_ctxt->u4_hp_buf_strd,
-                                                  u4_bst_spel_strd, MB_SIZE,
-                                                  MB_SIZE, NULL, 0);
-            }
-        }
+    /***********************************************************************
+     * If a particular skip Mv is giving better sad, copy to the corresponding
+     * MBPART
+     * In B slices this loop should go only to PREDL1: If we found min sad
+     * we will go to the skip ref list only
+     * Have to find a way to make it without too much change or new vars
+     **********************************************************************/
+    if (s_skip_mbpart.i4_mb_cost < ps_me_ctxt->as_mb_part[PRED_L0].i4_mb_cost)
+    {
+        ps_me_ctxt->as_mb_part[PRED_L0].i4_mb_cost = s_skip_mbpart.i4_mb_cost;
+        ps_me_ctxt->as_mb_part[PRED_L0].i4_mb_distortion = s_skip_mbpart.i4_mb_distortion;
+        ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr = s_skip_mbpart.s_mv_curr;
     }
-
-    DEBUG_SAD_HISTOGRAM_ADD(ps_me_ctxt->s_mb_part.i4_mb_distortion, 0);
-
-    /* update the type of the mb if necessary */
-    if (ps_me_ctxt->s_mb_part.i4_mb_cost < ps_proc->ps_cur_mb->i4_mb_cost)
+    else if (ps_me_ctxt->as_mb_part[PRED_L0].pu1_best_hpel_buf)
     {
-        /* mb cost */
-        ps_proc->ps_cur_mb->i4_mb_cost = ps_me_ctxt->s_mb_part.i4_mb_cost;
+        /* Now we have to copy the buffers */
+        ps_codec->pf_inter_pred_luma_copy(
+                        ps_me_ctxt->as_mb_part[PRED_L0].pu1_best_hpel_buf,
+                        ps_proc->pu1_best_subpel_buf,
+                        ps_me_ctxt->u4_subpel_buf_strd,
+                        ps_proc->u4_bst_spel_buf_strd, MB_SIZE, MB_SIZE,
+                        NULL, 0);
+    }
 
-        /* mb distortion */
-        ps_proc->ps_cur_mb->i4_mb_distortion = ps_me_ctxt->s_mb_part.i4_mb_distortion;
+    /**********************************************************************
+     * Now get the minimum of MB part sads by searching over all ref lists
+     **********************************************************************/
+    ps_proc->ps_pu->s_me_info[PRED_L0].s_mv.i2_mvx = ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr.i2_mvx;
+    ps_proc->ps_pu->s_me_info[PRED_L0].s_mv.i2_mvy = ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr.i2_mvy;
+    ps_proc->ps_cur_mb->i4_mb_cost = ps_me_ctxt->as_mb_part[PRED_L0].i4_mb_cost;
+    ps_proc->ps_cur_mb->i4_mb_distortion = ps_me_ctxt->as_mb_part[PRED_L0].i4_mb_distortion;
+    ps_proc->ps_cur_mb->u4_mb_type = P16x16;
+    ps_proc->ps_pu->b2_pred_mode = PRED_L0 ;
 
-        /* mb type */
-        ps_proc->ps_cur_mb->u4_mb_type  = P16x16;
-    }
+    /* Mark the reflists */
+    ps_proc->ps_pu->s_me_info[0].i1_ref_idx = -1;
+    ps_proc->ps_pu->s_me_info[1].i1_ref_idx =  0;
 
     /* number of partitions */
     ps_proc->u4_num_sub_partitions = 1;
@@ -986,19 +1077,13 @@ void ih264e_compute_me(process_ctxt_t *ps_proc)
     ps_proc->ps_pu->b4_wd = 3;
     ps_proc->ps_pu->b4_ht = 3;
 
-    /* ref idx */
-    ps_proc->ps_pu->i1_l0_ref_idx = 0;
-
-    /* motion vector L0 */
-    ps_proc->ps_pu->s_l0_mv.i2_mvx = ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvx;
-    ps_proc->ps_pu->s_l0_mv.i2_mvy = ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvy;
-
     /* Update min sad conditions */
     if (ps_me_ctxt->u4_min_sad_reached == 1)
     {
         ps_proc->ps_cur_mb->u4_min_sad_reached = 1;
         ps_proc->ps_cur_mb->u4_min_sad = ps_me_ctxt->i4_min_sad;
     }
+
 }
 
 /**
@@ -1054,9 +1139,9 @@ void ih264e_compute_me_nmb(process_ctxt_t *ps_proc, UWORD32 u4_nmb_count)
             }
         }
 
-        ps_proc->ps_skip_mv = &(ps_proc->ps_nmb_info[u4_i].s_skip_mv);
+        ps_proc->ps_skip_mv = &(ps_proc->ps_nmb_info[u4_i].as_skip_mv[0]);
         ps_proc->ps_ngbr_avbl = &(ps_proc->ps_nmb_info[u4_i].s_ngbr_avbl);
-        ps_proc->ps_pred_mv = &(ps_proc->ps_nmb_info[u4_i].s_pred_mv);
+        ps_proc->ps_pred_mv = &(ps_proc->ps_nmb_info[u4_i].as_pred_mv[0]);
 
         ps_proc->ps_cur_mb = &(ps_proc->ps_nmb_info[u4_i]);
 
@@ -1080,7 +1165,8 @@ void ih264e_compute_me_nmb(process_ctxt_t *ps_proc, UWORD32 u4_nmb_count)
         /* init me */
         ih264e_init_me(ps_proc);
 
-        ih264e_compute_me(ps_proc);
+        /* Compute ME according to slice type */
+        ps_proc->ps_codec->apf_compute_me[ps_proc->i4_slice_type](ps_proc);
 
         /* update top and left structs */
         {
@@ -1119,7 +1205,8 @@ void ih264e_compute_me_nmb(process_ctxt_t *ps_proc, UWORD32 u4_nmb_count)
         /* update buffers pointers */
         ps_proc->pu1_src_buf_luma += MB_SIZE;
         ps_proc->pu1_rec_buf_luma += MB_SIZE;
-        ps_proc->pu1_ref_buf_luma += MB_SIZE;
+        ps_proc->apu1_ref_buf_luma[0] += MB_SIZE;
+        ps_proc->apu1_ref_buf_luma[1] += MB_SIZE;
 
         /*
          * Note: Although chroma mb size is 8, as the chroma buffers are interleaved,
@@ -1127,7 +1214,9 @@ void ih264e_compute_me_nmb(process_ctxt_t *ps_proc, UWORD32 u4_nmb_count)
          */
         ps_proc->pu1_src_buf_chroma += MB_SIZE;
         ps_proc->pu1_rec_buf_chroma += MB_SIZE;
-        ps_proc->pu1_ref_buf_chroma += MB_SIZE;
+        ps_proc->apu1_ref_buf_chroma[0] += MB_SIZE;
+        ps_proc->apu1_ref_buf_chroma[1] += MB_SIZE;
+
 
         ps_proc->pu4_mb_pu_cnt += 1;
     }
@@ -1139,7 +1228,8 @@ void ih264e_compute_me_nmb(process_ctxt_t *ps_proc, UWORD32 u4_nmb_count)
     /* update buffers pointers */
     ps_proc->pu1_src_buf_luma -= MB_SIZE * u4_nmb_count;
     ps_proc->pu1_rec_buf_luma -= MB_SIZE * u4_nmb_count;
-    ps_proc->pu1_ref_buf_luma -= MB_SIZE * u4_nmb_count;
+    ps_proc->apu1_ref_buf_luma[0] -= MB_SIZE * u4_nmb_count;
+    ps_proc->apu1_ref_buf_luma[1] -= MB_SIZE * u4_nmb_count;
 
     /*
      * Note: Although chroma mb size is 8, as the chroma buffers are interleaved,
@@ -1147,7 +1237,892 @@ void ih264e_compute_me_nmb(process_ctxt_t *ps_proc, UWORD32 u4_nmb_count)
      */
     ps_proc->pu1_src_buf_chroma -= MB_SIZE * u4_nmb_count;
     ps_proc->pu1_rec_buf_chroma -= MB_SIZE * u4_nmb_count;
-    ps_proc->pu1_ref_buf_chroma -= MB_SIZE * u4_nmb_count;
+    ps_proc->apu1_ref_buf_chroma[0] -= MB_SIZE * u4_nmb_count;
+    ps_proc->apu1_ref_buf_chroma[1] -= MB_SIZE * u4_nmb_count;
+
 
     ps_proc->pu4_mb_pu_cnt -= u4_nmb_count;
 }
+
+
+/**
+*******************************************************************************
+*
+* @brief The function computes parameters for a BSKIP MB
+*
+* @par Description:
+*  The function updates the spatial and temporal skip motion vectors for a B Mb
+*  and returns the skip prediction type derived from the neighbouring Mbs
+*
+* @param[in] ps_proc
+*  Pointer to process context
+*
+* @param[in] u4_for_me
+*  Dummy
+*
+* @param[in] i4_reflist
+*  Dummy
+*
+* @returns Skip prediction type for the current Mb: PRED_L0, PRED_L1 or PRED_BI
+*
+* @remarks
+*   The code implements the logic as described in sec 8.4.1.2.2
+*   It also computes co-located MB params according to sec 8.4.1.2.1
+*
+*   Need to add condition for this function to be used in ME
+*
+*******************************************************************************/
+WORD32 ih264e_find_bskip_params_me(process_ctxt_t *ps_proc, WORD32 i4_reflist)
+{
+    /* Colzero for co-located MB */
+    WORD32 i4_colzeroflag;
+
+    /* motion vectors for neighbouring MBs */
+    enc_pu_t *ps_a_pu, *ps_c_pu, *ps_b_pu;
+
+    /* Variables to check if a particular mB is available */
+    WORD32 i4_a, i4_b, i4_c, i4_c_avail;;
+
+    /* Mode availability, init to no modes available     */
+    WORD32 i4_mode_avail;
+
+    /*  mb neighbor availability */
+    block_neighbors_t *ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
+
+    /* Temp var */
+    WORD32 i, i4_cmpl_mode, i4_skip_type = -1;
+
+    /*
+     * Colocated motion vector
+     */
+    mv_t s_mvcol;
+
+    /*
+     * Colocated picture idx
+     */
+    WORD32 i4_refidxcol;
+
+    UNUSED(i4_reflist);
+
+    /**************************************************************************
+     *Find co-located MB parameters
+     *      See sec 8.4.1.2.1  for reference
+     **************************************************************************/
+    {
+        /*
+         * Find the co-located Mb and update the skip and pred appropriately
+         * 1) Default colpic is forward ref : Table 8-6
+         * 2) Default mb col is current MB : Table 8-8
+         */
+
+        if (ps_proc->ps_colpu->b1_intra_flag)
+        {
+            s_mvcol.i2_mvx = 0;
+            s_mvcol.i2_mvy = 0;
+            i4_refidxcol = -1;
+        }
+        else
+        {
+            if (ps_proc->ps_colpu->b2_pred_mode != PRED_L1)
+            {
+                s_mvcol = ps_proc->ps_colpu->s_me_info[PRED_L0].s_mv;
+                i4_refidxcol = 0;
+            }
+            else // if(ps_proc->ps_colpu->b2_pred_mode != PRED_L0)
+            {
+                s_mvcol = ps_proc->ps_colpu->s_me_info[PRED_L1].s_mv;
+                i4_refidxcol = 0;
+            }
+        }
+
+        /* RefPicList1[ 0 ]  is marked as  "used for short-term reference", as default */
+        i4_colzeroflag = (!i4_refidxcol && (ABS(s_mvcol.i2_mvx) <= 1)
+                        && (ABS(s_mvcol.i2_mvy) <= 1));
+
+    }
+
+    /***************************************************************************
+     * Evaluating skip params : Spatial Skip
+     **************************************************************************/
+    {
+    /* Get the neighbouring MBS according to Section 8.4.1.2.2 */
+    ps_a_pu = &ps_proc->s_left_mb_pu_ME;
+    ps_b_pu = (ps_proc->ps_top_row_pu_ME + ps_proc->i4_mb_x);
+
+    i4_c_avail = 0;
+    if (ps_ngbr_avbl->u1_mb_c)
+    {
+        ps_c_pu = &((ps_proc->ps_top_row_pu_ME + ps_proc->i4_mb_x)[1]);
+        i4_c_avail = 1;
+    }
+    else
+    {
+        ps_c_pu = &ps_proc->s_top_left_mb_pu_ME;
+        i4_c_avail = ps_ngbr_avbl->u1_mb_d;
+    }
+
+    i4_a = ps_ngbr_avbl->u1_mb_a;
+    i4_b = ps_ngbr_avbl->u1_mb_b;
+    i4_c = i4_c_avail;
+
+    /* Init to no mode avail */
+    i4_mode_avail = 0;
+    for (i = 0; i < 2; i++)
+    {
+        i4_cmpl_mode = (i == 0) ? PRED_L1 : PRED_L0;
+
+        i4_mode_avail |= (i4_a && (ps_a_pu->b2_pred_mode != i4_cmpl_mode) && (ps_a_pu->s_me_info[i].i1_ref_idx != 0))<<i;
+        i4_mode_avail |= (i4_b && (ps_b_pu->b2_pred_mode != i4_cmpl_mode) && (ps_b_pu->s_me_info[i].i1_ref_idx != 0))<<i;
+        i4_mode_avail |= (i4_c && (ps_c_pu->b2_pred_mode != i4_cmpl_mode) && (ps_c_pu->s_me_info[i].i1_ref_idx != 0))<<i;
+    }
+
+    if (i4_mode_avail == 0x3 || i4_mode_avail == 0x0)
+    {
+        i4_skip_type= PRED_BI;
+    }
+    else if(i4_mode_avail == 0x1)
+    {
+        i4_skip_type = PRED_L0;
+    }
+    else if(i4_mode_avail == 0x2)
+    {
+        i4_skip_type = PRED_L1;
+    }
+
+    /* Update skip MV for L0 */
+    if ((i4_mode_avail & 0x1) && (!i4_colzeroflag))
+    {
+        ps_proc->ps_skip_mv[0].s_mv.i2_mvx = ps_proc->ps_pred_mv[0].s_mv.i2_mvx;
+        ps_proc->ps_skip_mv[0].s_mv.i2_mvy = ps_proc->ps_pred_mv[0].s_mv.i2_mvy;
+    }
+    else
+    {
+        ps_proc->ps_skip_mv[0].s_mv.i2_mvx = 0;
+        ps_proc->ps_skip_mv[0].s_mv.i2_mvy = 0;
+    }
+
+    /* Update skip MV for L1 */
+    if ((i4_mode_avail & 0x2) && (!i4_colzeroflag))
+    {
+        ps_proc->ps_skip_mv[1].s_mv.i2_mvx = ps_proc->ps_pred_mv[1].s_mv.i2_mvx;
+        ps_proc->ps_skip_mv[1].s_mv.i2_mvy = ps_proc->ps_pred_mv[1].s_mv.i2_mvy;
+    }
+    else
+    {
+        ps_proc->ps_skip_mv[1].s_mv.i2_mvx = 0;
+        ps_proc->ps_skip_mv[1].s_mv.i2_mvy = 0;
+    }
+
+    }
+
+    /***************************************************************************
+     * Evaluating skip params : Temporal skip
+     **************************************************************************/
+    {
+        pic_buf_t *  ps_ref_pic[MAX_REF_PIC_CNT];
+        WORD32 i4_td, i4_tx, i4_tb, i4_dist_scale_factor;
+        enc_pu_mv_t *ps_skip_mv = &ps_proc->ps_skip_mv[2];
+
+        ps_ref_pic[PRED_L0] = ps_proc->aps_ref_pic[PRED_L0];
+        ps_ref_pic[PRED_L1] = ps_proc->aps_ref_pic[PRED_L1];
+
+        i4_tb = ps_proc->ps_codec->i4_poc - ps_ref_pic[PRED_L0]->i4_abs_poc;
+        i4_td = ps_ref_pic[PRED_L1]->i4_abs_poc - ps_ref_pic[PRED_L0]->i4_abs_poc;
+
+        i4_tb = CLIP3(-128, 127, i4_tb);
+        i4_td = CLIP3(-128, 127, i4_td);
+
+        i4_tx = ( 16384 + ABS( i4_td / 2 ) ) / i4_td ;
+        i4_dist_scale_factor =  CLIP3( -1024, 1023, ( i4_tb * i4_tx + 32 ) >> 6 );
+
+        /* Motion vectors taken in full pel resolution , hence  -> (& 0xfffc) operation */
+        ps_skip_mv[PRED_L0].s_mv.i2_mvx = (( i4_dist_scale_factor * s_mvcol.i2_mvx + 128 ) >> 8) & 0xfffc;
+        ps_skip_mv[PRED_L0].s_mv.i2_mvy = (( i4_dist_scale_factor * s_mvcol.i2_mvy + 128 ) >> 8) & 0xfffc;
+
+        ps_skip_mv[PRED_L1].s_mv.i2_mvx = (ps_skip_mv[PRED_L0].s_mv.i2_mvx - s_mvcol.i2_mvx) & 0xfffc;
+        ps_skip_mv[PRED_L1].s_mv.i2_mvy = (ps_skip_mv[PRED_L0].s_mv.i2_mvy - s_mvcol.i2_mvy) & 0xfffc;
+
+    }
+
+    return i4_skip_type;
+}
+
+/**
+*******************************************************************************
+*
+* @brief The function computes the skip motion vector for B mb
+*
+* @par Description:
+*  The function gives the skip motion vector for B Mb, check if the Mb can be
+*  marked as skip
+*
+* @param[in] ps_proc
+*  Pointer to process context
+*
+* @param[in] u4_for_me
+*  Dummy
+*
+* @param[in] i4_reflist
+*  Dummy
+*
+* @returns Flag indicating if the current Mb can be skip or not
+*
+* @remarks The code implements the logic as described in sec 8.4.1.2.2 in H264
+*   specification. It also computes co-located MB params according to sec 8.4.1.2.1
+*
+*******************************************************************************/
+WORD32 ih264e_find_bskip_params(process_ctxt_t *ps_proc, WORD32 i4_reflist)
+{
+    WORD32 i4_colzeroflag;
+
+    /* motion vectors */
+    enc_pu_t *ps_a_pu, *ps_c_pu, *ps_b_pu;
+
+    /* Syntax elem */
+    mb_info_t *ps_a_syn, *ps_b_syn, *ps_c_syn;
+
+    /* Variables to check if a particular mB is available */
+    WORD32 i4_a, i4_b, i4_c, i4_c_avail;
+
+    /* Mode availability, init to no modes available     */
+    WORD32 i4_mode_avail;
+
+    /*  mb neighbor availability */
+    block_neighbors_t *ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
+
+    /* Temp var */
+    WORD32 i, i4_cmpl_mode;
+
+    UNUSED(i4_reflist);
+
+    /**************************************************************************
+     *Find co-locates parameters
+     *      See sec 8.4.1.2.1  for reference
+     **************************************************************************/
+    {
+        /*
+         * Find the co-located Mb and update the skip and pred appropriately
+         * 1) Default colpic is forward ref : Table 8-6
+         * 2) Default mb col is current MB : Table 8-8
+         */
+
+        mv_t s_mvcol;
+        WORD32 i4_refidxcol;
+
+        if (ps_proc->ps_colpu->b1_intra_flag)
+        {
+            s_mvcol.i2_mvx = 0;
+            s_mvcol.i2_mvy = 0;
+            i4_refidxcol = -1;
+        }
+        else
+        {
+            if (ps_proc->ps_colpu->b2_pred_mode != PRED_L1)
+            {
+                s_mvcol = ps_proc->ps_colpu->s_me_info[PRED_L0].s_mv;
+                i4_refidxcol = 0;
+            }
+            else // if(ps_proc->ps_colpu->b2_pred_mode != PRED_L0)
+            {
+                s_mvcol = ps_proc->ps_colpu->s_me_info[PRED_L1].s_mv;
+                i4_refidxcol = 0;
+            }
+        }
+
+        /* RefPicList1[ 0 ]  is marked as  "used for short-term reference", as default */
+        i4_colzeroflag = (!i4_refidxcol && (ABS(s_mvcol.i2_mvx) <= 1)
+                        && (ABS(s_mvcol.i2_mvy) <= 1));
+
+    }
+
+    /***************************************************************************
+     * Evaluating skip params
+     **************************************************************************/
+    /* Section 8.4.1.2.2 */
+    ps_a_syn = &ps_proc->s_left_mb_syntax_ele;
+    ps_a_pu = &ps_proc->s_left_mb_pu;
+
+    ps_b_syn = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
+    ps_b_pu = (ps_proc->ps_top_row_pu + ps_proc->i4_mb_x);
+
+    i4_c_avail = 0;
+    if (ps_ngbr_avbl->u1_mb_c)
+    {
+        ps_c_syn = &((ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x)[1]);
+        ps_c_pu = &((ps_proc->ps_top_row_pu + ps_proc->i4_mb_x)[1]);
+        i4_c_avail = 1;
+    }
+    else
+    {
+        ps_c_syn = &(ps_proc->s_top_left_mb_syntax_ele);
+        ps_c_pu = &ps_proc->s_top_left_mb_pu;
+        i4_c_avail = ps_ngbr_avbl->u1_mb_d;
+    }
+
+
+    i4_a = ps_ngbr_avbl->u1_mb_a;
+    i4_a &= !ps_a_syn->u2_is_intra;
+
+    i4_b = ps_ngbr_avbl->u1_mb_b;
+    i4_b &= !ps_b_syn->u2_is_intra;
+
+    i4_c = i4_c_avail;
+    i4_c &= !ps_c_syn->u2_is_intra;
+
+    /* Init to no mode avail */
+    i4_mode_avail = 0;
+    for (i = 0; i < 2; i++)
+    {
+        i4_cmpl_mode = (i == 0) ? PRED_L1 : PRED_L0;
+
+        i4_mode_avail |= (i4_a && (ps_a_pu->b2_pred_mode != i4_cmpl_mode) && (ps_a_pu->s_me_info[i].i1_ref_idx != 0))<<i;
+        i4_mode_avail |= (i4_b && (ps_b_pu->b2_pred_mode != i4_cmpl_mode) && (ps_b_pu->s_me_info[i].i1_ref_idx != 0))<<i;
+        i4_mode_avail |= (i4_c && (ps_c_pu->b2_pred_mode != i4_cmpl_mode) && (ps_c_pu->s_me_info[i].i1_ref_idx != 0))<<i;
+    }
+
+    /* Update skip MV for L0 */
+    if ((i4_mode_avail & 0x1) && (!i4_colzeroflag))
+    {
+        ps_proc->ps_skip_mv[0].s_mv.i2_mvx = ps_proc->ps_pred_mv[0].s_mv.i2_mvx;
+        ps_proc->ps_skip_mv[0].s_mv.i2_mvy = ps_proc->ps_pred_mv[0].s_mv.i2_mvy;
+    }
+    else
+    {
+        ps_proc->ps_skip_mv[0].s_mv.i2_mvx = 0;
+        ps_proc->ps_skip_mv[0].s_mv.i2_mvy = 0;
+    }
+
+    /* Update skip MV for L1 */
+    if ((i4_mode_avail & 0x2) && (!i4_colzeroflag))
+    {
+        ps_proc->ps_skip_mv[1].s_mv.i2_mvx = ps_proc->ps_pred_mv[1].s_mv.i2_mvx;
+        ps_proc->ps_skip_mv[1].s_mv.i2_mvy = ps_proc->ps_pred_mv[1].s_mv.i2_mvy;
+    }
+    else
+    {
+        ps_proc->ps_skip_mv[1].s_mv.i2_mvx = 0;
+        ps_proc->ps_skip_mv[1].s_mv.i2_mvy = 0;
+    }
+
+    /* Now see if the ME information matches the SKIP information */
+    switch (ps_proc->ps_pu->b2_pred_mode)
+    {
+        case PRED_BI:
+            if (  (ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx == ps_proc->ps_skip_mv[0].s_mv.i2_mvx)
+               && (ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy == ps_proc->ps_skip_mv[0].s_mv.i2_mvy)
+               && (ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvx == ps_proc->ps_skip_mv[1].s_mv.i2_mvx)
+               && (ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvy == ps_proc->ps_skip_mv[1].s_mv.i2_mvy)
+               && (i4_mode_avail ==  0x3 || i4_mode_avail == 0x0))
+            {
+                return 1;
+            }
+            break;
+
+        case PRED_L0:
+            if ( (ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx == ps_proc->ps_skip_mv[0].s_mv.i2_mvx)
+              && (ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy == ps_proc->ps_skip_mv[0].s_mv.i2_mvy)
+              && (i4_mode_avail == 0x1))
+            {
+                return 1;
+            }
+            break;
+
+        case PRED_L1:
+            if (  (ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvx == ps_proc->ps_skip_mv[1].s_mv.i2_mvx)
+               && (ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvy == ps_proc->ps_skip_mv[1].s_mv.i2_mvy)
+               && (i4_mode_avail == 0x2))
+            {
+                return 1;
+            }
+            break;
+    }
+
+    return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief This function evaluates the bi-predictive (PRED_BI) candidate mv pairs
+* and records the pair that gives the least cost.
+*
+* @par Description:
+*  Candidates in as_mv_init_search[PRED_BI] are consumed as (L0, L1) pairs. For
+*  each pair the two reference blocks are combined, and the resulting sad plus
+*  the lambda weighted mv bits is compared with the best cost found so far.
+*
+* @param[in] ps_me_ctxt
+*  me ctxt holding the candidate list, reference buffers and scratch buffers
+*
+* @param[in] ps_proc
+*  process ctxt supplying the predicted mvs and the bilinear function pointer
+*
+* @param[in,out] ps_mb_ctxt_bi
+*  mb partition ctxt for PRED_BI; updated when a cheaper candidate is found
+*
+* @returns  none; best search index, cost, distortion and buffer are written
+*  to ps_mb_ctxt_bi
+*
+*******************************************************************************
+*/
+void ih264e_evaluate_bipred(me_ctxt_t *ps_me_ctxt,
+                            process_ctxt_t *ps_proc,
+                            mb_part_ctxt *ps_mb_ctxt_bi)
+{
+    /* candidate index (advances one L0/L1 pair at a time), sad fn selector */
+    UWORD32 i, u4_fast_sad;
+    /* index of the scratch buffer that receives the next bi-pred block */
+    WORD32 i4_dest_buff;
+    /* predicted mvs of each list, candidate mvs reduced to full pel */
+    mv_t *ps_l0_pred_mv, *ps_l1_pred_mv, s_l0_mv, s_l1_mv;
+    /* reference blocks of the two lists */
+    UWORD8 *pu1_ref_mb_l0, *pu1_ref_mb_l1;
+    /* destination of the combined prediction */
+    UWORD8 *pu1_dst_buf;
+    /* strides of the two reference blocks */
+    WORD32 i4_ref_l0_stride, i4_ref_l1_stride;
+    /* distortion and total cost of the candidate under evaluation */
+    WORD32 i4_mb_distortion, i4_mb_cost;
+
+    u4_fast_sad = ps_me_ctxt->u4_enable_fast_sad;
+
+    i4_dest_buff = 0;
+    for (i = 0; i < ps_me_ctxt->u4_num_candidates[PRED_BI]; i += 2)
+    {
+        pu1_dst_buf = ps_me_ctxt->apu1_subpel_buffs[i4_dest_buff];
+
+        s_l0_mv.i2_mvx = ps_me_ctxt->as_mv_init_search[PRED_BI][i].i2_mvx >> 2;
+        s_l0_mv.i2_mvy = ps_me_ctxt->as_mv_init_search[PRED_BI][i].i2_mvy >> 2;
+        s_l1_mv.i2_mvx = ps_me_ctxt->as_mv_init_search[PRED_BI][i + 1].i2_mvx >> 2;
+        s_l1_mv.i2_mvy = ps_me_ctxt->as_mv_init_search[PRED_BI][i + 1].i2_mvy >> 2;
+
+        ps_l0_pred_mv = &ps_proc->ps_pred_mv[PRED_L0].s_mv;
+        ps_l1_pred_mv = &ps_proc->ps_pred_mv[PRED_L1].s_mv;
+        /* L0: a fractional mv uses the best hpel buffer, else the full pel ref */
+        if ((ps_me_ctxt->as_mv_init_search[PRED_BI][i].i2_mvx & 0x3)||
+                        (ps_me_ctxt->as_mv_init_search[PRED_BI][i].i2_mvy & 0x3))
+        {
+            pu1_ref_mb_l0 = ps_me_ctxt->as_mb_part[PRED_L0].pu1_best_hpel_buf;
+            i4_ref_l0_stride = ps_me_ctxt->u4_subpel_buf_strd;
+        }
+        else
+        {
+            pu1_ref_mb_l0 = ps_me_ctxt->apu1_ref_buf_luma[PRED_L0] + (s_l0_mv.i2_mvx) + ((s_l0_mv.i2_mvy) * ps_me_ctxt->i4_rec_strd);
+            i4_ref_l0_stride = ps_me_ctxt->i4_rec_strd;
+        }
+
+        /* L1: a fractional mv uses the best hpel buffer, else the full pel ref */
+        if ((ps_me_ctxt->as_mv_init_search[PRED_BI][i + 1].i2_mvx & 0x3) ||
+                        (ps_me_ctxt->as_mv_init_search[PRED_BI][i + 1].i2_mvy & 0x3))
+        {
+            pu1_ref_mb_l1 = ps_me_ctxt->as_mb_part[PRED_L1].pu1_best_hpel_buf;
+            i4_ref_l1_stride = ps_me_ctxt->u4_subpel_buf_strd;
+        }
+        else
+        {
+            pu1_ref_mb_l1 = ps_me_ctxt->apu1_ref_buf_luma[PRED_L1] + (s_l1_mv.i2_mvx) + ((s_l1_mv.i2_mvy) * ps_me_ctxt->i4_rec_strd);
+            i4_ref_l1_stride = ps_me_ctxt->i4_rec_strd;
+        }
+        /* combine the two reference blocks into the scratch buffer */
+        ps_proc->ps_codec->pf_inter_pred_luma_bilinear(
+                        pu1_ref_mb_l0, pu1_ref_mb_l1, pu1_dst_buf,
+                        i4_ref_l0_stride, i4_ref_l1_stride,
+                        ps_me_ctxt->u4_subpel_buf_strd, MB_SIZE, MB_SIZE);
+        /* distortion of the combined prediction against the source mb */
+        ps_me_ctxt->pf_ime_compute_sad_16x16[u4_fast_sad](
+                        ps_me_ctxt->pu1_src_buf_luma, pu1_dst_buf,
+                        ps_me_ctxt->i4_src_strd, ps_me_ctxt->u4_subpel_buf_strd,
+                        ps_mb_ctxt_bi->i4_mb_distortion, &i4_mb_distortion);
+        /* NOTE(review): costs use the full pel mv << 2; fractional part is ignored */
+        /* compute cost: bits for each mv component relative to its predictor */
+        i4_mb_cost =  ps_me_ctxt->pu1_mv_bits[( s_l0_mv.i2_mvx << 2 ) - ps_l0_pred_mv->i2_mvx];
+        i4_mb_cost += ps_me_ctxt->pu1_mv_bits[( s_l0_mv.i2_mvy << 2 ) - ps_l0_pred_mv->i2_mvy];
+        i4_mb_cost += ps_me_ctxt->pu1_mv_bits[( s_l1_mv.i2_mvx << 2 ) - ps_l1_pred_mv->i2_mvx];
+        i4_mb_cost += ps_me_ctxt->pu1_mv_bits[( s_l1_mv.i2_mvy << 2 ) - ps_l1_pred_mv->i2_mvy];
+        /* the skip bias applies to the first pair only, when skip type is PRED_BI */
+        i4_mb_cost -= (ps_me_ctxt->i4_skip_bias[BSLICE]) * (ps_me_ctxt->i4_skip_type == PRED_BI) * (i == 0);
+
+        /* weigh the bits by lambda and add the distortion */
+        i4_mb_cost *= ps_me_ctxt->u4_lambda_motion;
+        i4_mb_cost += i4_mb_distortion;
+        /* keep the winner and switch scratch buffers so it is not overwritten */
+        if (i4_mb_cost < ps_mb_ctxt_bi->i4_mb_cost)
+        {
+            ps_mb_ctxt_bi->i4_srch_pos_idx = (i>>1);
+            ps_mb_ctxt_bi->i4_mb_cost = i4_mb_cost;
+            ps_mb_ctxt_bi->i4_mb_distortion = i4_mb_distortion;
+            ps_mb_ctxt_bi->pu1_best_hpel_buf = pu1_dst_buf;
+            i4_dest_buff = (i4_dest_buff + 1) % 2;
+        }
+    }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief This function performs motion estimation for the current mb
+*
+* @par Description:
+*  The current mb is compared with a list of mb's in the reference frame for
+*  least cost. The mb that offers least cost is chosen as predicted mb and the
+*  displacement of the predicted mb from index location of the current mb is
+*  signaled as mv. The list of the mb's that are chosen in the reference frame
+*  are dependent on the speed of the ME configured.
+*
+* @param[in] ps_proc
+*  Process context corresponding to the job
+*
+* @returns  none; the chosen pred mode, mvs, distortion and cost are written to ps_proc
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264e_compute_me_multi_reflist(process_ctxt_t *ps_proc)
+{
+    /* me ctxt */
+    me_ctxt_t *ps_me_ctxt = &ps_proc->s_me_ctxt;
+
+    /* codec context */
+    codec_t *ps_codec = ps_proc->ps_codec;
+
+    /* Temp variables for looping over ref lists */
+    WORD32 i4_reflist, i4_max_reflist;
+
+    /* recon stride */
+    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
+
+    /* source buffer for half pel generation functions */
+    UWORD8 *pu1_hpel_src;
+
+    /* quantization parameters */
+    quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
+
+    /* SAD thresholds */
+    ps_me_ctxt->pu2_sad_thrsh = ps_qp_params->pu2_sad_thrsh;
+
+    /* Mb part ctxts for SKIP */
+    mb_part_ctxt as_skip_mbpart[2];
+
+    {
+        WORD32 rows_above, rows_below, columns_left, columns_right;
+
+        /* During evaluation for motion vectors do not search through padded regions */
+        /* Obtain number of rows and columns that are effective for computing for me evaluation */
+        rows_above = MB_SIZE + ps_proc->i4_mb_y * MB_SIZE;
+        rows_below = (ps_proc->i4_ht_mbs - ps_proc->i4_mb_y) * MB_SIZE;
+        columns_left = MB_SIZE + ps_proc->i4_mb_x * MB_SIZE;
+        columns_right = (ps_proc->i4_wd_mbs - ps_proc->i4_mb_x) * MB_SIZE;
+
+        /* init srch range */
+        /* NOTE : For now, let's limit the search range by DEFAULT_MAX_SRCH_RANGE_X / 2
+         * on all sides.
+         */
+        ps_me_ctxt->i4_srch_range_w = -MIN(columns_left, DEFAULT_MAX_SRCH_RANGE_X >> 1);
+        ps_me_ctxt->i4_srch_range_e = MIN(columns_right, DEFAULT_MAX_SRCH_RANGE_X >> 1);
+        ps_me_ctxt->i4_srch_range_n = -MIN(rows_above, DEFAULT_MAX_SRCH_RANGE_Y >> 1);
+        ps_me_ctxt->i4_srch_range_s = MIN(rows_below, DEFAULT_MAX_SRCH_RANGE_Y >> 1);
+
+        /* this is to facilitate fast sub pel computation with minimal loads */
+        if (ps_me_ctxt->u4_enable_hpel)
+        {
+            ps_me_ctxt->i4_srch_range_w += 1;
+            ps_me_ctxt->i4_srch_range_e -= 1;
+            ps_me_ctxt->i4_srch_range_n += 1;
+            ps_me_ctxt->i4_srch_range_s -= 1;
+        }
+    }
+
+    /* Compute ME and store the MVs */
+    {
+        /***********************************************************************
+         * Compute ME for lists L0 and L1
+         *  For L0 -> L0 skip + L0
+         *  for L1 -> L0 skip + L0 + L1 skip + L1
+         ***********************************************************************/
+        i4_max_reflist = (ps_proc->i4_slice_type == PSLICE) ? PRED_L0 : PRED_L1;
+
+        /* Init the min SAD tracking before the lists are evaluated */
+        ps_me_ctxt->u4_min_sad_reached  = 0;
+        ps_me_ctxt->i4_min_sad = ps_proc->ps_cur_mb->u4_min_sad;
+
+        for (i4_reflist = PRED_L0; i4_reflist <= i4_max_reflist; i4_reflist++)
+        {
+
+            /* Get the seed motion vector candidates                    */
+            ih264e_get_search_candidates(ps_proc, ps_me_ctxt, i4_reflist);
+
+            /* ****************************************************************
+             * Evaluate the SKIP for current list
+             * ****************************************************************/
+            as_skip_mbpart[i4_reflist].s_mv_curr.i2_mvx = 0;
+            as_skip_mbpart[i4_reflist].s_mv_curr.i2_mvy = 0;
+            as_skip_mbpart[i4_reflist].i4_mb_cost = INT_MAX;
+            as_skip_mbpart[i4_reflist].i4_mb_distortion = INT_MAX;
+
+            if (ps_me_ctxt->i4_skip_type == i4_reflist)
+            {
+                ime_compute_skip_cost( ps_me_ctxt,
+                                       (ime_mv_t *)(&ps_proc->ps_skip_mv[i4_reflist].s_mv),
+                                       &as_skip_mbpart[i4_reflist],
+                                       ps_proc->ps_codec->s_cfg.u4_enable_satqd,
+                                       i4_reflist,
+                                       (ps_proc->i4_slice_type == BSLICE) );
+            }
+
+            as_skip_mbpart[i4_reflist].s_mv_curr.i2_mvx <<= 2;
+            as_skip_mbpart[i4_reflist].s_mv_curr.i2_mvy <<= 2;
+
+            /******************************************************************
+             * Evaluate ME For current list
+             *****************************************************************/
+            ps_me_ctxt->as_mb_part[i4_reflist].s_mv_curr.i2_mvx = 0;
+            ps_me_ctxt->as_mb_part[i4_reflist].s_mv_curr.i2_mvy = 0;
+            ps_me_ctxt->as_mb_part[i4_reflist].i4_mb_cost = INT_MAX;
+            ps_me_ctxt->as_mb_part[i4_reflist].i4_mb_distortion = INT_MAX;
+
+            /* Init Hpel */
+            ps_me_ctxt->as_mb_part[i4_reflist].pu1_best_hpel_buf = NULL;
+
+            /* In case we found out the minimum SAD, exit the ME eval */
+            if (ps_me_ctxt->u4_min_sad_reached)
+            {
+                i4_max_reflist = i4_reflist;
+                break;
+            }
+
+
+            /* Evaluate search candidates for initial mv pt */
+            ime_evaluate_init_srchposn_16x16(ps_me_ctxt, i4_reflist);
+
+            /********************************************************************/
+            /*                  full pel motion estimation                      */
+            /********************************************************************/
+            ime_full_pel_motion_estimation_16x16(ps_me_ctxt, i4_reflist);
+            /* NOTE(review): the debug macros below still name s_mb_part, not as_mb_part[i4_reflist]; verify when enabled */
+            DEBUG_MV_HISTOGRAM_ADD((ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvx >> 2),
+                                   (ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvy >> 2));
+
+            DEBUG_SAD_HISTOGRAM_ADD(ps_me_ctxt->s_mb_part.i4_mb_distortion, 1);
+
+            /* Scale the MV to qpel resolution */
+            ps_me_ctxt->as_mb_part[i4_reflist].s_mv_curr.i2_mvx <<= 2;
+            ps_me_ctxt->as_mb_part[i4_reflist].s_mv_curr.i2_mvy <<= 2;
+
+            if (ps_me_ctxt->u4_enable_hpel)
+            {
+                /* moving src pointer to the converged motion vector location */
+                pu1_hpel_src =   ps_me_ctxt->apu1_ref_buf_luma[i4_reflist]
+                               + (ps_me_ctxt->as_mb_part[i4_reflist].s_mv_curr.i2_mvx >> 2)
+                               + ((ps_me_ctxt->as_mb_part[i4_reflist].s_mv_curr.i2_mvy >> 2)* i4_rec_strd);
+
+                ps_me_ctxt->apu1_subpel_buffs[0] = ps_proc->apu1_subpel_buffs[0];
+                ps_me_ctxt->apu1_subpel_buffs[1] = ps_proc->apu1_subpel_buffs[1];
+                ps_me_ctxt->apu1_subpel_buffs[2] = ps_proc->apu1_subpel_buffs[2];
+
+                /* Init the search position to an invalid number */
+                ps_me_ctxt->as_mb_part[i4_reflist].i4_srch_pos_idx = 3;
+
+                /* In case a buffer is still in use by L0, replace it with the spare buffer */
+                ps_me_ctxt->apu1_subpel_buffs[ps_me_ctxt->as_mb_part[PRED_L0].i4_srch_pos_idx] =
+                                ps_proc->apu1_subpel_buffs[3];
+
+
+                ps_me_ctxt->u4_subpel_buf_strd = HP_BUFF_WD;
+
+                /* half  pel search is done for both sides of full pel,
+                 * hence half_x of width x height = 17x16 is created
+                 * starting from left half_x of converged full pel */
+                pu1_hpel_src -= 1;
+
+                /* computing half_x */
+                ps_codec->pf_ih264e_sixtapfilter_horz(pu1_hpel_src,
+                                                      ps_me_ctxt->apu1_subpel_buffs[0],
+                                                      i4_rec_strd,
+                                                      ps_me_ctxt->u4_subpel_buf_strd);
+
+                /*
+                 * Halfpel search is done for both sides of full pel,
+                 * hence half_y of width x height = 16x17 is created
+                 * starting from top half_y of converged full pel
+                 * for half_xy top_left is required
+                 * hence it starts from pu1_hpel_src = full_pel_converged_point - i4_rec_strd - 1
+                 */
+                pu1_hpel_src -= i4_rec_strd;
+
+                /* computing half_y and half_xy */
+                ps_codec->pf_ih264e_sixtap_filter_2dvh_vert(
+                                pu1_hpel_src, ps_me_ctxt->apu1_subpel_buffs[1],
+                                ps_me_ctxt->apu1_subpel_buffs[2], i4_rec_strd,
+                                ps_me_ctxt->u4_subpel_buf_strd, ps_proc->ai16_pred1 + 3,
+                                ps_me_ctxt->u4_subpel_buf_strd);
+
+                ime_sub_pel_motion_estimation_16x16(ps_me_ctxt, i4_reflist);
+
+            }
+        }
+
+        /***********************************************************************
+         * If a particular skip MV is giving better sad, copy to the corresponding
+         * MBPART
+         * In B slices this loop should go only to PREDL1: If we found min sad
+         * we will go to the skip ref list only
+         * Have to find a way to make it without too much change or new vars
+         **********************************************************************/
+        for (i4_reflist = 0; i4_reflist <= i4_max_reflist; i4_reflist++)
+        {
+            if (as_skip_mbpart[i4_reflist].i4_mb_cost < ps_me_ctxt->as_mb_part[i4_reflist].i4_mb_cost)
+            {
+                ps_me_ctxt->as_mb_part[i4_reflist].i4_mb_cost = as_skip_mbpart[i4_reflist].i4_mb_cost;
+                ps_me_ctxt->as_mb_part[i4_reflist].i4_mb_distortion = as_skip_mbpart[i4_reflist].i4_mb_distortion;
+                ps_me_ctxt->as_mb_part[i4_reflist].s_mv_curr = as_skip_mbpart[i4_reflist].s_mv_curr;
+            }
+        }
+
+        /***********************************************************************
+         * Compute ME for BI
+         *  In case of BI we do ME for three candidate pairs
+         *   1) The two SKIP MV pairs of L0 and L1
+         *   2) The best L0 and L1 MVs after ME
+         *
+         *   TODO
+         *   one of the search candidates is skip. Hence it may be duplicated
+         ***********************************************************************/
+        if (i4_max_reflist == PRED_L1 && ps_me_ctxt->u4_min_sad_reached == 0)
+        {
+            WORD32 i, j = 0;
+            WORD32 l0_srch_pos_idx, l1_srch_pos_idx;
+            WORD32 i4_l0_skip_mv_idx, i4_l1_skip_mv_idx;
+
+            /* Get the free buffers */
+            l0_srch_pos_idx = ps_me_ctxt->as_mb_part[PRED_L0].i4_srch_pos_idx;
+            l1_srch_pos_idx = ps_me_ctxt->as_mb_part[PRED_L1].i4_srch_pos_idx;
+
+            /* Search for the two free buffers in subpel list */
+            for (i = 0; i < SUBPEL_BUFF_CNT; i++)
+            {
+                if (i != l0_srch_pos_idx && i != l1_srch_pos_idx)
+                {
+                    ps_me_ctxt->apu1_subpel_buffs[j] = ps_proc->apu1_subpel_buffs[i];
+                    j++;
+                }
+            }
+            ps_me_ctxt->u4_subpel_buf_strd = HP_BUFF_WD;
+
+            /* Copy the spatial SKIP MV of each list */
+            i4_l0_skip_mv_idx = ps_me_ctxt->u4_num_candidates[PRED_L0] - 2;
+            i4_l1_skip_mv_idx = ps_me_ctxt->u4_num_candidates[PRED_L1] - 2;
+            ps_me_ctxt->as_mv_init_search[PRED_BI][0].i2_mvx = ps_me_ctxt->as_mv_init_search[PRED_L0][i4_l0_skip_mv_idx].i2_mvx << 2;
+            ps_me_ctxt->as_mv_init_search[PRED_BI][0].i2_mvy = ps_me_ctxt->as_mv_init_search[PRED_L0][i4_l0_skip_mv_idx].i2_mvy << 2;
+            ps_me_ctxt->as_mv_init_search[PRED_BI][1].i2_mvx = ps_me_ctxt->as_mv_init_search[PRED_L1][i4_l1_skip_mv_idx].i2_mvx << 2;
+            ps_me_ctxt->as_mv_init_search[PRED_BI][1].i2_mvy = ps_me_ctxt->as_mv_init_search[PRED_L1][i4_l1_skip_mv_idx].i2_mvy << 2;
+
+            /* Copy the temporal SKIP MV of each list */
+            i4_l0_skip_mv_idx++;
+            i4_l1_skip_mv_idx++;
+            ps_me_ctxt->as_mv_init_search[PRED_BI][2].i2_mvx = ps_me_ctxt->as_mv_init_search[PRED_L0][i4_l0_skip_mv_idx].i2_mvx << 2;
+            ps_me_ctxt->as_mv_init_search[PRED_BI][2].i2_mvy = ps_me_ctxt->as_mv_init_search[PRED_L0][i4_l0_skip_mv_idx].i2_mvy << 2;
+            ps_me_ctxt->as_mv_init_search[PRED_BI][3].i2_mvx = ps_me_ctxt->as_mv_init_search[PRED_L1][i4_l1_skip_mv_idx].i2_mvx << 2;
+            ps_me_ctxt->as_mv_init_search[PRED_BI][3].i2_mvy = ps_me_ctxt->as_mv_init_search[PRED_L1][i4_l1_skip_mv_idx].i2_mvy << 2;
+
+            /* Copy the best MV after ME */
+            ps_me_ctxt->as_mv_init_search[PRED_BI][4] = ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr;
+            ps_me_ctxt->as_mv_init_search[PRED_BI][5] = ps_me_ctxt->as_mb_part[PRED_L1].s_mv_curr;
+
+            ps_me_ctxt->u4_num_candidates[PRED_BI] = 6;
+
+            ps_me_ctxt->as_mb_part[PRED_BI].i4_mb_cost = INT_MAX;
+            ps_me_ctxt->as_mb_part[PRED_BI].i4_mb_distortion = INT_MAX;
+
+            ih264e_evaluate_bipred(ps_me_ctxt, ps_proc,
+                                   &ps_me_ctxt->as_mb_part[PRED_BI]);
+
+            i4_max_reflist = PRED_BI;
+        }
+
+        /**********************************************************************
+         * Now get the minimum of MB part sads by searching over all ref lists
+         **********************************************************************/
+        ps_proc->ps_pu->b2_pred_mode = 0x3;
+
+        for (i4_reflist = 0; i4_reflist <= i4_max_reflist; i4_reflist++)
+        {
+            if (ps_me_ctxt->as_mb_part[i4_reflist].i4_mb_cost < ps_proc->ps_cur_mb->i4_mb_cost)
+            {
+                ps_proc->ps_cur_mb->i4_mb_cost = ps_me_ctxt->as_mb_part[i4_reflist].i4_mb_cost;
+                ps_proc->ps_cur_mb->i4_mb_distortion = ps_me_ctxt->as_mb_part[i4_reflist].i4_mb_distortion;
+                ps_proc->ps_cur_mb->u4_mb_type = (ps_proc->i4_slice_type == PSLICE) ? P16x16 : B16x16;
+                ps_proc->ps_pu->b2_pred_mode = i4_reflist ;
+            }
+        }
+
+        /**********************************************************************
+         * In case we have a BI MB, we have to copy the buffers and set proper MV's
+         *  1)In case it's BI, we need to get the best MVs given by BI and update
+         *    to their corresponding MB part
+         *  2)We also need to copy the buffer in which bipred buff is populated
+         *  NOTE(review): b2_pred_mode is still 0x3 here when no list beat the
+         *  current cost; confirm as_mb_part[] may be indexed with it below
+         **********************************************************************/
+        if (ps_proc->ps_pu->b2_pred_mode == PRED_BI)
+        {
+            WORD32 i4_srch_pos = ps_me_ctxt->as_mb_part[PRED_BI].i4_srch_pos_idx;
+            UWORD8 *pu1_bi_buf = ps_me_ctxt->as_mb_part[PRED_BI].pu1_best_hpel_buf;
+
+            ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr = ps_me_ctxt->as_mv_init_search[PRED_BI][i4_srch_pos << 1];
+            ps_me_ctxt->as_mb_part[PRED_L1].s_mv_curr = ps_me_ctxt->as_mv_init_search[PRED_BI][(i4_srch_pos << 1) + 1];
+
+            /* Now we have to copy the buffers */
+            ps_codec->pf_inter_pred_luma_copy(pu1_bi_buf,
+                                              ps_proc->pu1_best_subpel_buf,
+                                              ps_me_ctxt->u4_subpel_buf_strd,
+                                              ps_proc->u4_bst_spel_buf_strd,
+                                              MB_SIZE, MB_SIZE, NULL, 0);
+
+        }
+        else if (ps_me_ctxt->as_mb_part[ps_proc->ps_pu->b2_pred_mode].pu1_best_hpel_buf)
+        {
+            /* Now we have to copy the buffers */
+            ps_codec->pf_inter_pred_luma_copy(
+                            ps_me_ctxt->as_mb_part[ps_proc->ps_pu->b2_pred_mode].pu1_best_hpel_buf,
+                            ps_proc->pu1_best_subpel_buf,
+                            ps_me_ctxt->u4_subpel_buf_strd,
+                            ps_proc->u4_bst_spel_buf_strd, MB_SIZE, MB_SIZE,
+                            NULL, 0);
+        }
+    }
+
+    /**************************************************************************
+     * Now copy the MVs (already scaled to qpel above) to the current PU
+     ***************************************************************************/
+    ps_proc->ps_pu->s_me_info[PRED_L0].s_mv.i2_mvx = (ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr.i2_mvx);
+    ps_proc->ps_pu->s_me_info[PRED_L0].s_mv.i2_mvy = (ps_me_ctxt->as_mb_part[PRED_L0].s_mv_curr.i2_mvy);
+    ps_proc->ps_pu->s_me_info[PRED_L1].s_mv.i2_mvx = (ps_me_ctxt->as_mb_part[PRED_L1].s_mv_curr.i2_mvx);
+    ps_proc->ps_pu->s_me_info[PRED_L1].s_mv.i2_mvy = (ps_me_ctxt->as_mb_part[PRED_L1].s_mv_curr.i2_mvy);
+    /* ref idx of a list is set to 0 only when the other list alone is chosen, else -1 */
+    /* NOTE(review): confirm this polarity against the readers of i1_ref_idx */
+    ps_proc->ps_pu->s_me_info[0].i1_ref_idx = (ps_proc->ps_pu->b2_pred_mode != PRED_L1)? -1:0;
+    ps_proc->ps_pu->s_me_info[1].i1_ref_idx = (ps_proc->ps_pu->b2_pred_mode != PRED_L0)? -1:0;
+
+    /* number of partitions */
+    ps_proc->u4_num_sub_partitions = 1;
+    *(ps_proc->pu4_mb_pu_cnt) = 1;
+
+    /* position in-terms of PU */
+    ps_proc->ps_pu->b4_pos_x = 0;
+    ps_proc->ps_pu->b4_pos_y = 0;
+
+    /* PU size */
+    ps_proc->ps_pu->b4_wd = 3;
+    ps_proc->ps_pu->b4_ht = 3;
+
+    /* Update min sad conditions */
+    if (ps_me_ctxt->u4_min_sad_reached == 1)
+    {
+        ps_proc->ps_cur_mb->u4_min_sad_reached = 1;
+        ps_proc->ps_cur_mb->u4_min_sad = ps_me_ctxt->i4_min_sad;
+    }
+}
+
diff --git a/encoder/ih264e_me.h b/encoder/ih264e_me.h
index c4834a1..bd88a01 100644
--- a/encoder/ih264e_me.h
+++ b/encoder/ih264e_me.h
@@ -42,10 +42,10 @@
 /*****************************************************************************/
 
 /**
-******************************************************************************
+ ******************************************************************************
  *  @brief      compute median of 3 elements (a, b, c) and store the output
  *  in to result. This is used for mv prediction
-******************************************************************************
+ ******************************************************************************
  */
 
 #define MEDIAN(a, b, c, result) if (a > b){\
@@ -69,210 +69,285 @@
                                     }\
                                 }
 
-
-
 /*****************************************************************************/
 /* Extern Function Declarations                                              */
 /*****************************************************************************/
 
 /**
-*******************************************************************************
-*
-* @brief
-*  This function populates the length of the codewords for motion vectors in the
-*  range (-search range, search range) in pixels
-*
-* @param[in] ps_me
-*  Pointer to me ctxt
-*
-* @param[out] pu1_mv_bits
-*  length of the codeword for all mv's
-*
-* @remarks The length of the code words are derived from signed exponential
-* goloumb codes.
-*
-*******************************************************************************
-*/
-void ih264e_init_mv_bits
-    (
-        me_ctxt_t *ps_me
-    );
+ *******************************************************************************
+ *
+ * @brief
+ *  This function populates the length of the codewords for motion vectors in the
+ *  range (-search range, search range) in pixels
+ *
+ * @param[in] ps_me
+ *  Pointer to me ctxt
+ *
+ * @param[out] pu1_mv_bits
+ *  length of the codeword for all mv's
+ *
+ * @remarks The lengths of the code words are derived from signed exponential
+ * Golomb codes.
+ *
+ *******************************************************************************
+ */
+void ih264e_init_mv_bits(me_ctxt_t *ps_me);
 
 /**
-*******************************************************************************
-*
-* @brief The function gives the skip motion vector
-*
-* @par Description:
-*  The function gives the skip motion vector
-*
-* @param[in] ps_left_mb_pu
-*  pointer to left mb motion vector info
-*
-* @param[in] ps_top_row_pu
-*  pointer to top & top right mb motion vector info
-*
-* @param[out] ps_pred_mv
-*  pointer to candidate predictors for the current block
-*
-* @returns The x & y components of the MV predictor.
-*
-* @remarks The code implements the logic as described in sec 8.4.1.1 in H264
-*   specification.
-*
-*******************************************************************************
+ *******************************************************************************
+ *
+ * @brief The function computes the parameters for a P skip MB
+ *
+ * @par Description:
+ *  The function computes the parameters for a P skip MB
+ *
+ * @param[in] ps_proc
+ *  Process context
+ *
+ * @param[in] u4_for_me
+ *  Flag to indicate the purpose of computing skip
+ *
+ * @param[out] ps_pred_mv
+ *  Flag to indicate the current active reference list
+ *
+ * @returns
+ *       1) Updates skip MV in proc
+ *       2) Returns if the current MB can be coded as skip or not
+ *
+ * @remarks The code implements the logic as described in sec 8.4.1.1 in H264
+ *   specification.
+ *
+ *******************************************************************************
 */
-void ih264e_find_skip_motion_vector
-    (
-        process_ctxt_t *ps_proc,
-        UWORD32 u4_for_me
-    );
+ih264e_skip_params_ft  ih264e_find_pskip_params;
 
 /**
-*******************************************************************************
-*
-* @brief motion vector predictor
-*
-* @par Description:
-*  The routine calculates the motion vector predictor for a given block,
-*  given the candidate MV predictors.
-*
-* @param[in] ps_left_mb_pu
-*  pointer to left mb motion vector info
-*
-* @param[in] ps_top_row_pu
-*  pointer to top & top right mb motion vector info
-*
-* @param[out] ps_pred_mv
-*  pointer to candidate predictors for the current block
-*
-* @returns  The x & y components of the MV predictor.
-*
-* @remarks The code implements the logic as described in sec 8.4.1.3 in H264
-*   specification.
-*   Assumptions : 1. Assumes Single reference frame
-*                 2. Assumes Only partition of size 16x16
-*
-*******************************************************************************
+ *******************************************************************************
+ *
+ * @brief The function computes the parameters for a P skip MB
+ *
+ * @par Description:
+ *  The function computes the parameters for a P skip MB
+ *
+ * @param[in] ps_proc
+ *  Process context
+ *
+ * @param[in] u4_for_me
+ *  Flag to indicate the purpose of computing skip
+ *
+ * @param[out] ps_pred_mv
+ *  Flag to indicate the current active reference list
+ *
+ * @returns
+ *       1) Updates skip MV in proc
+ *       2) Returns if the current MB can be coded as skip or not
+ *
+ * @remarks The code implements the logic as described in sec 8.4.1.1 in H264
+ *   specification.
+ *
+ *******************************************************************************
 */
-void ih264e_get_mv_predictor
-        (
-            enc_pu_t *ps_left_mb_pu,
-            enc_pu_t *ps_top_row_pu,
-            mv_t *ps_pred_mv
-        );
+ih264e_skip_params_ft  ih264e_find_pskip_params_me;
 
 /**
-*******************************************************************************
-*
-* @brief This function computes the best motion vector for the current mb
-*
-* @par Description:
-*  This function currently does nothing except set motion vectors from external
-*  source
-*
-* @param[in] ps_proc
-*  Process context corresponding to the job
-*
-* @returns  none
-*
-* @remarks none
-*
-*******************************************************************************
+ *******************************************************************************
+ *
+ * @brief The function computes the parameters for a B skip MB
+ *
+ * @par Description:
+ *  The function computes the parameters for a B skip MB
+ *
+ * @param[in] ps_proc
+ *  Process context
+ *
+ * @param[in] u4_for_me
+ *  Flag to indicate the purpose of computing skip
+ *
+ * @param[out] ps_pred_mv
+ *  Flag to indicate the current active reference list
+ *
+ * @returns
+ *       1) Updates skip MV in proc
+ *       2) Returns if the current MB can be coded as skip or not
+ *
+ * @remarks The code implements the logic as described in sec 8.4.1.1 in H264
+ *   specification.
+ *
+ *******************************************************************************
 */
-void ih264e_compute_me
-    (
-        process_ctxt_t *ps_proc
-    );
+ih264e_skip_params_ft  ih264e_find_bskip_params;
 
 /**
-*******************************************************************************
-*
-* @brief This function initializes me ctxt
-*
-* @par Description:
-*  Before dispatching the current job to me thread, the me context associated
-*  with the job is initialized.
-*
-* @param[in] ps_proc
-*  Process context corresponding to the job
-*
-* @returns  none
-*
-* @remarks none
-*
-*******************************************************************************
+ *******************************************************************************
+ *
+ * @brief The function computes the parameters for a B skip MB
+ *
+ * @par Description:
+ *  The function computes the parameters for a B skip MB
+ *
+ * @param[in] ps_proc
+ *  Process context
+ *
+ * @param[in] u4_for_me
+ *  Flag to indicate the purpose of computing skip
+ *
+ * @param[out] ps_pred_mv
+ *  Flag to indicate the current active reference list
+ *
+ * @returns
+ *       1) Updates skip MV in proc
+ *       2) The type of SKIP [L0/L1/BI]
+ *
+ * @remarks
+ *******************************************************************************
 */
+ih264e_skip_params_ft  ih264e_find_bskip_params_me;
+
+/**
+ *******************************************************************************
+ *
+ * @brief motion vector predictor
+ *
+ * @par Description:
+ *  The routine calculates the motion vector predictor for a given block,
+ *  given the candidate MV predictors.
+ *
+ * @param[in] ps_left_mb_pu
+ *  pointer to left mb motion vector info
+ *
+ * @param[in] ps_top_row_pu
+ *  pointer to top & top right mb motion vector info
+ *
+ * @param[out] ps_pred_mv
+ *  pointer to candidate predictors for the current block
+ *
+ * @returns  The x & y components of the MV predictor.
+ *
+ * @remarks The code implements the logic as described in sec 8.4.1.3 in H264
+ *   specification.
+ *   Assumptions : 1. Assumes Only partition of size 16x16
+ *
+ *******************************************************************************
+ */
+void ih264e_get_mv_predictor(enc_pu_t *ps_left_mb_pu, enc_pu_t *ps_top_row_pu,
+                             enc_pu_mv_t *ps_pred_mv, WORD32 i4_ref_list);
+
+/**
+ *******************************************************************************
+ *
+ * @brief This function evaluates ME for 2 reference lists
+ *
+ * @par Description:
+ *  It evaluates skip, full-pel and half-pel and assigns the correct MV in proc
+ *
+ * @param[in] ps_proc
+ *  Process context corresponding to the job
+ *
+ * @returns  none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+ih264e_compute_me_ft  ih264e_compute_me_multi_reflist;
+
+/**
+ *******************************************************************************
+ *
+ * @brief This function evaluates ME for single reflist [Pred L0]
+ *
+ * @par Description:
+ *  It evaluates skip, full-pel and half-pel and assigns the correct MV in proc
+ *
+ * @param[in] ps_proc
+ *  Process context corresponding to the job
+ *
+ * @returns  none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+ih264e_compute_me_ft  ih264e_compute_me_single_reflist;
+
+/**
+ *******************************************************************************
+ *
+ * @brief This function initializes me ctxt
+ *
+ * @par Description:
+ *  Before dispatching the current job to me thread, the me context associated
+ *  with the job is initialized.
+ *
+ * @param[in] ps_proc
+ *  Process context corresponding to the job
+ *
+ * @returns  none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
 void ih264e_init_me(process_ctxt_t *ps_proc);
 
 /**
-*******************************************************************************
-*
-* @brief This function performs motion estimation for the current NMB
-*
-* @par Description:
-*  Intializes input and output pointers required by the function ih264e_compute_me
-*  and calls the function ih264e_compute_me in a loop to process NMBs.
-*
-* @param[in] ps_proc
-*  Process context corresponding to the job
-*
-* @returns
-*
-* @remarks none
-*
-*******************************************************************************
-*/
-void ih264e_compute_me_nmb
-    (
-        process_ctxt_t *ps_proc,
-        UWORD32 u4_nmb_count
-    );
+ *******************************************************************************
+ *
+ * @brief This function performs motion estimation for the current NMB
+ *
+ * @par Description:
+ *  Initializes input and output pointers required by the function ih264e_compute_me
+ *  and calls the function ih264e_compute_me in a loop to process NMBs.
+ *
+ * @param[in] ps_proc
+ *  Process context corresponding to the job
+ *
+ * @returns
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+void ih264e_compute_me_nmb(process_ctxt_t *ps_proc, UWORD32 u4_nmb_count);
 
 /**
-*******************************************************************************
-*
-* @brief This function performs MV prediction
-*
-* @par Description:
-*
-* @param[in] ps_proc
-*  Process context corresponding to the job
-*
-* @returns  none
-*
-* @remarks none
-*  This function will update the MB availability since intra inter decision
-*  should be done before the call
-*
-*******************************************************************************
-*/
-void ih264e_mv_pred
-    (
-        process_ctxt_t *ps_proc
-    );
+ *******************************************************************************
+ *
+ * @brief This function performs MV prediction
+ *
+ * @par Description:
+ *
+ * @param[in] ps_proc
+ *  Process context corresponding to the job
+ *
+ * @returns  none
+ *
+ * @remarks none
+ *  This function will update the MB availability since intra inter decision
+ *  should be done before the call
+ *
+ *******************************************************************************
+ */
+void ih264e_mv_pred(process_ctxt_t *ps_proc, WORD32 i4_reflist);
 
 /**
-*******************************************************************************
-*
-* @brief This function approximates Pred. MV
-*
-* @par Description:
-*
-* @param[in] ps_proc
-*  Process context corresponding to the job
-*
-* @returns  none
-*
-* @remarks none
-*  Motion estimation happens at nmb level. For cost calculations, mv is appro
-*  ximated using this function
-*
-*******************************************************************************
-*/
-void ih264e_mv_pred_me
-    (
-        process_ctxt_t *ps_proc
-    );
+ *******************************************************************************
+ *
+ * @brief This function approximates Pred. MV
+ *
+ * @par Description:
+ *
+ * @param[in] ps_proc
+ *  Process context corresponding to the job
+ *
+ * @returns  none
+ *
+ * @remarks none
+ *  Motion estimation happens at nmb level. For cost calculations, mv is
+ *  approximated using this function
+ *
+ *******************************************************************************
+ */
+void ih264e_mv_pred_me(process_ctxt_t *ps_proc, WORD32 i4_ref_list);
 
 #endif /* IH264E_ME_H_ */
diff --git a/encoder/ih264e_modify_frm_rate.c b/encoder/ih264e_modify_frm_rate.c
index bc0e873..f1e6e61 100644
--- a/encoder/ih264e_modify_frm_rate.c
+++ b/encoder/ih264e_modify_frm_rate.c
@@ -57,14 +57,17 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ih264e_defs.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_rc_mem_interface.h"
 #include "ih264e_time_stamp.h"
@@ -103,7 +106,7 @@ WORD32 ih264e_pd_frm_rate_get_init_free_memtab(pd_frm_rate_handle *pps_pd_frm_ra
                                                ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0;
-    static pd_frm_rate_t s_temp_pd_frm_rate_t;
+    pd_frm_rate_t s_temp_pd_frm_rate_t;
 
     /* Hack for al alloc, during which we dont have any state memory.
      Dereferencing can cause issues */
diff --git a/encoder/ih264e_process.c b/encoder/ih264e_process.c
index 670428e..850cefc 100644
--- a/encoder/ih264e_process.c
+++ b/encoder/ih264e_process.c
@@ -68,8 +68,8 @@
 #include "ih264_defs.h"
 #include "ih264_debug.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -78,20 +78,21 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "ih264_platform_macros.h"
 #include "ih264_macros.h"
-#include "ih264_error.h"
 #include "ih264_buf_mgr.h"
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
-#include "ih264_structs.h"
 #include "ih264_common_tables.h"
 #include "ih264_list.h"
 #include "ih264e_defs.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
+#include "ih264e_cabac.h"
 #include "ih264e_process.h"
 #include "ithread.h"
 #include "ih264e_intra_modes_eval.h"
@@ -105,15 +106,11 @@
 #include "ih264e_deblk.h"
 #include "ih264e_me.h"
 #include "ih264e_debug.h"
-#include "ih264e_process.h"
 #include "ih264e_master.h"
 #include "ih264e_utils.h"
 #include "irc_mem_req_and_acq.h"
-#include "irc_cntrl_param.h"
-#include "irc_frame_info_collector.h"
 #include "irc_rate_control_api.h"
 #include "ih264e_platform_macros.h"
-#include "ih264_padding.h"
 #include "ime_statistics.h"
 
 
@@ -141,7 +138,7 @@
 IH264E_ERROR_T ih264e_generate_sps_pps(codec_t *ps_codec)
 {
     /* choose between ping-pong process buffer set */
-    WORD32 ctxt_sel = ps_codec->i4_encode_api_call_cnt & 1;
+    WORD32 ctxt_sel = ps_codec->i4_encode_api_call_cnt % MAX_CTXT_SETS;
 
     /* entropy ctxt */
     entropy_ctxt_t *ps_entropy = &ps_codec->as_process[ctxt_sel * MAX_PROCESS_THREADS].s_entropy;
@@ -274,7 +271,6 @@ IH264E_ERROR_T ih264e_init_entropy_ctxt(process_ctxt_t *ps_proc)
 *
 *******************************************************************************
 */
-#define GET_NUM_BITS(ps_bitstream) ((ps_bitstream->u4_strm_buf_offset << 3) + WORD_SIZE - ps_bitstream->i4_bits_left_in_cw)
 
 IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc)
 {
@@ -284,6 +280,9 @@ IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc)
     /* entropy context */
     entropy_ctxt_t *ps_entropy = &ps_proc->s_entropy;
 
+    /* cabac context */
+    cabac_ctxt_t *ps_cabac_ctxt = ps_entropy->ps_cabac;
+
     /* sps */
     sps_t *ps_sps = ps_entropy->ps_sps_base + (ps_entropy->u4_sps_id % MAX_SPS_CNT);
 
@@ -309,12 +308,12 @@ IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc)
     UWORD8  *pu1_entropy_map_curr;
 
     /* proc base idx */
-    WORD32 ctxt_sel = ps_proc->i4_encode_api_call_cnt & 1;
+    WORD32 ctxt_sel = ps_proc->i4_encode_api_call_cnt % MAX_CTXT_SETS;
 
     /* temp var */
     WORD32 i4_wd_mbs, i4_ht_mbs;
     UWORD32 u4_mb_cnt, u4_mb_idx, u4_mb_end_idx;
-
+    WORD32 bitstream_start_offset, bitstream_end_offset;
     /********************************************************************/
     /*                            BEGIN INIT                            */
     /********************************************************************/
@@ -391,6 +390,13 @@ IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc)
         /* once start of frame / slice is done, you can reset it */
         /* it is the responsibility of the caller to set this flag */
         ps_entropy->i4_sof = 0;
+
+        if (CABAC == ps_entropy->u1_entropy_coding_mode_flag)
+        {
+            BITSTREAM_BYTE_ALIGN(ps_bitstrm);
+            BITSTREAM_FLUSH(ps_bitstrm);
+            ih264e_init_cabac_ctxt(ps_entropy);
+        }
     }
 
     /* begin entropy coding for the mb set */
@@ -399,7 +405,7 @@ IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc)
         /* init ptrs/indices */
         if (ps_entropy->i4_mb_x == i4_wd_mbs)
         {
-            ps_entropy->i4_mb_y ++;
+            ps_entropy->i4_mb_y++;
             ps_entropy->i4_mb_x = 0;
 
             /* packed mb coeff data */
@@ -411,7 +417,7 @@ IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc)
                             ps_entropy->i4_mb_y * ps_codec->u4_size_header_data;
 
             /* proc map */
-            pu1_proc_map = ps_proc->pu1_proc_map + ps_entropy->i4_mb_y  * i4_wd_mbs;
+            pu1_proc_map = ps_proc->pu1_proc_map + ps_entropy->i4_mb_y * i4_wd_mbs;
 
             /* entropy map */
             pu1_entropy_map_curr = ps_entropy->pu1_entropy_map + ps_entropy->i4_mb_y * i4_wd_mbs;
@@ -430,20 +436,31 @@ IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc)
             volatile UWORD8 *pu1_buf1;
             WORD32 idx = ps_entropy->i4_mb_x;
 
-            pu1_buf1 =  pu1_proc_map + idx;
-            if(*pu1_buf1)
+            pu1_buf1 = pu1_proc_map + idx;
+            if (*pu1_buf1)
                 break;
             ithread_yield();
         }
 
+
         /* write mb layer */
-        ps_codec->pf_write_mb_syntax_layer[i4_slice_type](ps_entropy);
+        ps_entropy->i4_error_code |= ps_codec->pf_write_mb_syntax_layer[ps_entropy->u1_entropy_coding_mode_flag][i4_slice_type](ps_entropy);
+        /* Starting bitstream offset for header in bits */
+        bitstream_start_offset = GET_NUM_BITS(ps_bitstrm);
 
         /* set entropy map */
         pu1_entropy_map_curr[ps_entropy->i4_mb_x] = 1;
 
-        u4_mb_idx ++;
-        ps_entropy->i4_mb_x ++;
+        u4_mb_idx++;
+        ps_entropy->i4_mb_x++;
+        /* check for eof */
+        if (CABAC == ps_entropy->u1_entropy_coding_mode_flag)
+        {
+            if (ps_entropy->i4_mb_x < i4_wd_mbs)
+            {
+                ih264e_cabac_encode_terminate(ps_cabac_ctxt, 0);
+            }
+        }
 
         if (ps_entropy->i4_mb_x == i4_wd_mbs)
         {
@@ -459,39 +476,65 @@ IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc)
                 /* No need to open a slice at end of frame. The current slice can be closed at the time
                  * of signaling eof flag.
                  */
-                if ( (u4_mb_idx != u4_mb_cnt) && (i4_curr_slice_idx != pu1_slice_idx[u4_mb_idx]))
+                if ((u4_mb_idx != u4_mb_cnt) && (i4_curr_slice_idx
+                                                != pu1_slice_idx[u4_mb_idx]))
                 {
-                    /* mb skip run */
-                    if ((i4_slice_type != ISLICE) && *ps_entropy->pi4_mb_skip_run)
-                    {
-                        if (*ps_entropy->pi4_mb_skip_run)
+                    if (CAVLC == ps_entropy->u1_entropy_coding_mode_flag)
+                    { /* mb skip run */
+                        if ((i4_slice_type != ISLICE)
+                                        && *ps_entropy->pi4_mb_skip_run)
                         {
+                            if (*ps_entropy->pi4_mb_skip_run)
+                            {
                             PUT_BITS_UEV(ps_bitstrm, *ps_entropy->pi4_mb_skip_run, ps_entropy->i4_error_code, "mb skip run");
-                            *ps_entropy->pi4_mb_skip_run = 0;
+                                *ps_entropy->pi4_mb_skip_run = 0;
+                            }
                         }
+                        /* put rbsp trailing bits for the previous slice */
+                                 ps_entropy->i4_error_code |= ih264e_put_rbsp_trailing_bits(ps_bitstrm);
+                    }
+                    else
+                    {
+                        ih264e_cabac_encode_terminate(ps_cabac_ctxt, 1);
                     }
-
-                    /* put rbsp trailing bits for the previous slice */
-                    ps_entropy->i4_error_code |= ih264e_put_rbsp_trailing_bits(ps_bitstrm);
 
                     /* update slice header pointer */
                     i4_curr_slice_idx = pu1_slice_idx[u4_mb_idx];
                     ps_entropy->i4_cur_slice_idx = i4_curr_slice_idx;
-                    ps_slice_hdr = ps_entropy->ps_slice_hdr_base + (i4_curr_slice_idx % MAX_SLICE_HDR_CNT);
+                    ps_slice_hdr = ps_entropy->ps_slice_hdr_base+ (i4_curr_slice_idx % MAX_SLICE_HDR_CNT);
 
                     /* populate slice header */
                     ps_entropy->i4_mb_start_add = u4_mb_idx;
-                    ih264e_populate_slice_header(ps_proc, ps_slice_hdr, ps_pps, ps_sps);
+                    ih264e_populate_slice_header(ps_proc, ps_slice_hdr, ps_pps,
+                                                 ps_sps);
 
                     /* generate slice header */
-                    ps_entropy->i4_error_code |= ih264e_generate_slice_header(ps_bitstrm, ps_slice_hdr,
-                                                                              ps_pps, ps_sps);
+                    ps_entropy->i4_error_code |= ih264e_generate_slice_header(
+                                    ps_bitstrm, ps_slice_hdr, ps_pps, ps_sps);
+                    if (CABAC == ps_entropy->u1_entropy_coding_mode_flag)
+                    {
+                        BITSTREAM_BYTE_ALIGN(ps_bitstrm);
+                        BITSTREAM_FLUSH(ps_bitstrm);
+                        ih264e_init_cabac_ctxt(ps_entropy);
+                    }
+                }
+                else
+                {
+                    if (CABAC == ps_entropy->u1_entropy_coding_mode_flag
+                                    && u4_mb_idx != u4_mb_cnt)
+                    {
+                        ih264e_cabac_encode_terminate(ps_cabac_ctxt, 0);
+                    }
                 }
             }
-
             /* Dont execute any further instructions until store synchronization took place */
             DATA_SYNC();
         }
+
+        /* Ending bitstream offset for header in bits */
+        bitstream_end_offset = GET_NUM_BITS(ps_bitstrm);
+        ps_entropy->u4_header_bits[i4_slice_type == PSLICE] +=
+                        bitstream_end_offset - bitstream_start_offset;
     }
 
     /* check for eof */
@@ -500,30 +543,47 @@ IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc)
         /* set end of frame flag */
         ps_entropy->i4_eof = 1;
     }
+    else
+    {
+        if (CABAC == ps_entropy->u1_entropy_coding_mode_flag
+                        && ps_codec->s_cfg.e_slice_mode
+                                        != IVE_SLICE_MODE_BLOCKS)
+        {
+            ih264e_cabac_encode_terminate(ps_cabac_ctxt, 0);
+        }
+    }
 
     if (ps_entropy->i4_eof)
     {
-        /* mb skip run */
-        if ((i4_slice_type != ISLICE) && *ps_entropy->pi4_mb_skip_run)
+        if (CAVLC == ps_entropy->u1_entropy_coding_mode_flag)
         {
-            if (*ps_entropy->pi4_mb_skip_run)
+            /* mb skip run */
+            if ((i4_slice_type != ISLICE) && *ps_entropy->pi4_mb_skip_run)
             {
-                PUT_BITS_UEV(ps_bitstrm, *ps_entropy->pi4_mb_skip_run, ps_entropy->i4_error_code, "mb skip run");
-                *ps_entropy->pi4_mb_skip_run = 0;
+                if (*ps_entropy->pi4_mb_skip_run)
+                {
+                    PUT_BITS_UEV(ps_bitstrm, *ps_entropy->pi4_mb_skip_run,
+                                 ps_entropy->i4_error_code, "mb skip run");
+                    *ps_entropy->pi4_mb_skip_run = 0;
+                }
             }
+            /* put rbsp trailing bits */
+             ps_entropy->i4_error_code |= ih264e_put_rbsp_trailing_bits(ps_bitstrm);
+        }
+        else
+        {
+            ih264e_cabac_encode_terminate(ps_cabac_ctxt, 1);
         }
-
-        /* put rbsp trailing bits */
-        ps_entropy->i4_error_code |= ih264e_put_rbsp_trailing_bits(ps_bitstrm);
 
         /* update current frame stats to rc library */
-        if (IVE_RC_NONE != ps_codec->s_cfg.e_rc_mode)
         {
             /* number of bytes to stuff */
             WORD32 i4_stuff_bytes;
 
             /* update */
-            i4_stuff_bytes = ih264e_update_rc_post_enc(ps_codec, ctxt_sel, ps_proc->i4_pic_cnt);
+            i4_stuff_bytes = ih264e_update_rc_post_enc(
+                            ps_codec, ctxt_sel,
+                            (ps_proc->ps_codec->i4_poc == 0));
 
             /* cbr rc - house keeping */
             if (ps_codec->s_rate_control.post_encode_skip[ctxt_sel])
@@ -537,10 +597,21 @@ IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc)
             }
         }
 
+        /*
+         *Frame number is to be incremented only if the current frame is a
+         * reference frame. After each successful frame encode, we increment
+         * frame number by 1
+         */
+        if (!ps_codec->s_rate_control.post_encode_skip[ctxt_sel]
+                        && ps_codec->u4_is_curr_frm_ref)
+        {
+            ps_codec->i4_frame_num++;
+        }
         /********************************************************************/
         /*      signal the output                                           */
         /********************************************************************/
-        ps_codec->as_out_buf[ctxt_sel].s_bits_buf.u4_bytes = ps_entropy->ps_bitstrm->u4_strm_buf_offset;
+        ps_codec->as_out_buf[ctxt_sel].s_bits_buf.u4_bytes =
+                        ps_entropy->ps_bitstrm->u4_strm_buf_offset;
 
         DEBUG("entropy status %x", ps_entropy->i4_error_code);
     }
@@ -679,9 +750,9 @@ IH264E_ERROR_T ih264e_pack_header_data(process_ctxt_t *ps_proc)
 
         i2_mv_ptr = (WORD16 *)pu1_ptr;
 
-        *i2_mv_ptr++ = ps_proc->ps_pu->s_l0_mv.i2_mvx - ps_proc->ps_pred_mv->i2_mvx;
+        *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx - ps_proc->ps_pred_mv[0].s_mv.i2_mvx;
 
-        *i2_mv_ptr++ = ps_proc->ps_pu->s_l0_mv.i2_mvy - ps_proc->ps_pred_mv->i2_mvy;
+        *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy - ps_proc->ps_pred_mv[0].s_mv.i2_mvy;
 
         /* end of mb layer */
         ps_proc->pv_mb_header_data = i2_mv_ptr;
@@ -697,6 +768,79 @@ IH264E_ERROR_T ih264e_pack_header_data(process_ctxt_t *ps_proc)
         /* end of mb layer */
         ps_proc->pv_mb_header_data = pu1_ptr;
     }
+    else if(u4_mb_type == B16x16)
+    {
+
+        /* pointer to mb header storage space */
+        UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data;
+
+        WORD16 *i2_mv_ptr;
+
+        UWORD32 u4_pred_mode = ps_proc->ps_pu->b2_pred_mode;
+
+        /* mb type plus mode */
+        *pu1_ptr++ = (u4_pred_mode << 4) + u4_mb_type;
+
+        /* cbp */
+        *pu1_ptr++ = ps_proc->u4_cbp;
+
+        /* mb qp delta */
+        *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
+
+        /* l0 & l1 me data */
+        i2_mv_ptr = (WORD16 *)pu1_ptr;
+
+        if (u4_pred_mode != PRED_L1)
+        {
+            *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx
+                            - ps_proc->ps_pred_mv[0].s_mv.i2_mvx;
+
+            *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy
+                            - ps_proc->ps_pred_mv[0].s_mv.i2_mvy;
+        }
+        if (u4_pred_mode != PRED_L0)
+        {
+            *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvx
+                            - ps_proc->ps_pred_mv[1].s_mv.i2_mvx;
+
+            *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvy
+                            - ps_proc->ps_pred_mv[1].s_mv.i2_mvy;
+        }
+
+        /* end of mb layer */
+        ps_proc->pv_mb_header_data = i2_mv_ptr;
+
+    }
+    else if(u4_mb_type == BDIRECT)
+    {
+        /* pointer to mb header storage space */
+        UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data;
+
+        /* mb type plus mode */
+        *pu1_ptr++ = u4_mb_type;
+
+        /* cbp */
+        *pu1_ptr++ = ps_proc->u4_cbp;
+
+        /* mb qp delta */
+        *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
+
+        ps_proc->pv_mb_header_data = pu1_ptr;
+
+    }
+    else if(u4_mb_type == BSKIP)
+    {
+        UWORD32 u4_pred_mode = ps_proc->ps_pu->b2_pred_mode;
+
+        /* pointer to mb header storage space */
+        UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data;
+
+        /* mb type plus mode */
+        *pu1_ptr++ = (u4_pred_mode << 4) + u4_mb_type;
+
+        /* end of mb layer */
+        ps_proc->pv_mb_header_data = pu1_ptr;
+    }
 
     return IH264E_SUCCESS;
 }
@@ -788,12 +932,11 @@ WORD32 ih264e_update_proc_ctxt(process_ctxt_t *ps_proc)
     /* mb type, mb class, csbp */
     *ps_top_left_syn = *ps_top_syn;
 
-    if (ps_proc->i4_slice_type == PSLICE)
+    if (ps_proc->i4_slice_type != ISLICE)
     {
         /*****************************************/
         /* update top left with top info results */
         /*****************************************/
-
         /* mv */
         *ps_top_left_mb_pu = *ps_top_row_pu;
     }
@@ -832,17 +975,13 @@ WORD32 ih264e_update_proc_ctxt(process_ctxt_t *ps_proc)
             memcpy(pu1_top_mb_intra_modes, ps_proc->au1_intra_luma_mb_8x8_modes, 4);
         }
 
-        if (ps_proc->i4_slice_type == PSLICE)
+        if ((ps_proc->i4_slice_type == PSLICE) ||(ps_proc->i4_slice_type == BSLICE))
         {
             /* mv */
             *ps_left_mb_pu = *ps_top_row_pu = *(ps_proc->ps_pu);
-
-//            /* reset ngbr mv's */
-//            ps_top_row_pu->i1_l0_ref_idx = -1;
-//            ps_top_row_pu->s_l0_mv = zero_mv;
-//
-//            *ps_left_mb_pu = *ps_top_row_pu;
         }
+
+        *ps_proc->pu4_mb_pu_cnt = 1;
     }
     else
     {
@@ -898,7 +1037,7 @@ WORD32 ih264e_update_proc_ctxt(process_ctxt_t *ps_proc)
         s_job.i2_mb_y = ps_proc->i4_mb_y;
 
         /* proc base idx */
-        s_job.i2_proc_base_idx = (ps_codec->i4_encode_api_call_cnt & 1) ? (MAX_PROCESS_CTXT / 2): 0 ;
+        s_job.i2_proc_base_idx = (ps_codec->i4_encode_api_call_cnt % MAX_CTXT_SETS) ? (MAX_PROCESS_CTXT / 2) : 0;
 
         /* queue the job */
         error_status |= ih264_list_queue(ps_proc->pv_entropy_jobq, &s_job, 1);
@@ -929,7 +1068,8 @@ WORD32 ih264e_update_proc_ctxt(process_ctxt_t *ps_proc)
     /* update buffers pointers */
     ps_proc->pu1_src_buf_luma += MB_SIZE;
     ps_proc->pu1_rec_buf_luma += MB_SIZE;
-    ps_proc->pu1_ref_buf_luma += MB_SIZE;
+    ps_proc->apu1_ref_buf_luma[0] += MB_SIZE;
+    ps_proc->apu1_ref_buf_luma[1] += MB_SIZE;
 
     /*
      * Note: Although chroma mb size is 8, as the chroma buffers are interleaved,
@@ -937,7 +1077,9 @@ WORD32 ih264e_update_proc_ctxt(process_ctxt_t *ps_proc)
      */
     ps_proc->pu1_src_buf_chroma += MB_SIZE;
     ps_proc->pu1_rec_buf_chroma += MB_SIZE;
-    ps_proc->pu1_ref_buf_chroma += MB_SIZE;
+    ps_proc->apu1_ref_buf_chroma[0] += MB_SIZE;
+    ps_proc->apu1_ref_buf_chroma[1] += MB_SIZE;
+
 
 
     /* Reset cost, distortion params */
@@ -948,6 +1090,10 @@ WORD32 ih264e_update_proc_ctxt(process_ctxt_t *ps_proc)
 
     ps_proc->pu4_mb_pu_cnt += 1;
 
+    /* Update colocated pu */
+    if (ps_proc->i4_slice_type == BSLICE)
+        ps_proc->ps_colpu += *(ps_proc->aps_mv_buf[1]->pu4_mb_pu_cnt +  (i4_mb_y * ps_proc->i4_wd_mbs) + i4_mb_x);
+
     /* deblk ctxts */
     if (ps_proc->u4_disable_deblock_level != 1)
     {
@@ -1004,6 +1150,7 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
 
     /* strides */
     WORD32 i4_src_strd = ps_proc->i4_src_strd;
+    WORD32 i4_src_chroma_strd = ps_proc->i4_src_chroma_strd;
     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
 
     /* quant params */
@@ -1035,43 +1182,54 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
     i4_mb_y = ps_proc->i4_mb_y;
 
     /* Number of mbs processed in one loop of process function */
-    ps_proc->i4_nmb_ntrpy = (ps_proc->i4_wd_mbs > MAX_NMB) ? MAX_NMB : ps_proc->i4_wd_mbs;
-    ps_proc->u4_nmb_me = (ps_proc->i4_wd_mbs > MAX_NMB)? MAX_NMB : ps_proc->i4_wd_mbs;
+    ps_proc->i4_nmb_ntrpy = ps_proc->i4_wd_mbs;
+    ps_proc->u4_nmb_me = ps_proc->i4_wd_mbs;
 
+    /* init buffer pointers */
     convert_uv_only = 1;
-    if (u4_pad_bottom_sz && (ps_proc->i4_mb_y == ps_proc->i4_ht_mbs - 1))
+    if (u4_pad_bottom_sz || u4_pad_right_sz ||
+        ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_422ILE)
     {
-        u2_num_rows = (UWORD16) MB_SIZE - u4_pad_bottom_sz;
+        if (ps_proc->i4_mb_y == ps_proc->i4_ht_mbs - 1)
+            u2_num_rows = (UWORD16) MB_SIZE - u4_pad_bottom_sz;
         ps_proc->pu1_src_buf_luma_base = ps_codec->pu1_y_csc_buf_base;
+        i4_src_strd = ps_proc->i4_src_strd = ps_codec->s_cfg.u4_max_wd;
         ps_proc->pu1_src_buf_luma = ps_proc->pu1_src_buf_luma_base + (i4_mb_x * MB_SIZE) + ps_codec->s_cfg.u4_max_wd * (i4_mb_y * MB_SIZE);
         convert_uv_only = 0;
-
     }
     else
+    {
+        i4_src_strd = ps_proc->i4_src_strd = ps_proc->s_inp_buf.s_raw_buf.au4_strd[0];
         ps_proc->pu1_src_buf_luma = ps_proc->pu1_src_buf_luma_base + (i4_mb_x * MB_SIZE) + i4_src_strd * (i4_mb_y * MB_SIZE);
+    }
 
-    /* init buffer pointers */
 
     if (ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_422ILE ||
         ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420P ||
-        ps_proc->i4_mb_y == (ps_proc->i4_ht_mbs - 1))
+        ps_proc->i4_mb_y == (ps_proc->i4_ht_mbs - 1) ||
+        u4_pad_bottom_sz || u4_pad_right_sz)
     {
         if ((ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_UV) ||
             (ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU))
             ps_proc->pu1_src_buf_chroma_base = ps_codec->pu1_uv_csc_buf_base;
 
         ps_proc->pu1_src_buf_chroma = ps_proc->pu1_src_buf_chroma_base + (i4_mb_x * MB_SIZE) + ps_codec->s_cfg.u4_max_wd * (i4_mb_y * BLK8x8SIZE);
+        i4_src_chroma_strd = ps_proc->i4_src_chroma_strd = ps_codec->s_cfg.u4_max_wd;
     }
     else
     {
-        ps_proc->pu1_src_buf_chroma = ps_proc->pu1_src_buf_chroma_base + (i4_mb_x * MB_SIZE) + i4_src_strd * (i4_mb_y * BLK8x8SIZE);
+        i4_src_chroma_strd = ps_proc->i4_src_chroma_strd = ps_proc->s_inp_buf.s_raw_buf.au4_strd[1];
+        ps_proc->pu1_src_buf_chroma = ps_proc->pu1_src_buf_chroma_base + (i4_mb_x * MB_SIZE) + i4_src_chroma_strd * (i4_mb_y * BLK8x8SIZE);
     }
 
     ps_proc->pu1_rec_buf_luma = ps_proc->pu1_rec_buf_luma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * MB_SIZE);
     ps_proc->pu1_rec_buf_chroma = ps_proc->pu1_rec_buf_chroma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * BLK8x8SIZE);
-    ps_proc->pu1_ref_buf_luma = ps_proc->pu1_ref_buf_luma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * MB_SIZE);
-    ps_proc->pu1_ref_buf_chroma = ps_proc->pu1_ref_buf_chroma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * BLK8x8SIZE);
 
+    /* Temporal back and forward reference buffer */
+    ps_proc->apu1_ref_buf_luma[0] = ps_proc->apu1_ref_buf_luma_base[0] + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * MB_SIZE);
+    ps_proc->apu1_ref_buf_chroma[0] = ps_proc->apu1_ref_buf_chroma_base[0] + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * BLK8x8SIZE);
+    ps_proc->apu1_ref_buf_luma[1] = ps_proc->apu1_ref_buf_luma_base[1] + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * MB_SIZE);
+    ps_proc->apu1_ref_buf_chroma[1] = ps_proc->apu1_ref_buf_chroma_base[1] + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * BLK8x8SIZE);
 
     /*
      * Do color space conversion
@@ -1084,9 +1242,9 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
             /* In case of 420 semi-planar input, copy last few rows to intermediate
                buffer as chroma trans functions access one extra byte due to interleaved input.
                This data will be padded if required */
-            if (ps_proc->i4_mb_y == (ps_proc->i4_ht_mbs - 1))
+            if (ps_proc->i4_mb_y == (ps_proc->i4_ht_mbs - 1) || u4_pad_bottom_sz || u4_pad_right_sz)
             {
-                WORD32 num_rows = ps_codec->s_cfg.u4_disp_ht & 0xF;
+                WORD32 num_rows = MB_SIZE;
                 UWORD8 *pu1_src;
                 UWORD8 *pu1_dst;
                 WORD32 i;
@@ -1095,11 +1253,16 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
 
                 pu1_dst = ps_proc->pu1_src_buf_luma;
 
-                for (i = 0; i < num_rows; i++)
-                {
-                    memcpy(pu1_dst, pu1_src, ps_codec->s_cfg.u4_wd);
-                    pu1_src += ps_proc->s_inp_buf.s_raw_buf.au4_strd[0];
-                    pu1_dst += ps_proc->i4_src_strd;
+                /* If padding is required, we always copy luma, if padding isn't required we never copy luma. */
+                if (u4_pad_bottom_sz || u4_pad_right_sz) {
+                    if (ps_proc->i4_mb_y == (ps_proc->i4_ht_mbs - 1))
+                        num_rows = MB_SIZE - u4_pad_bottom_sz;
+                    for (i = 0; i < num_rows; i++)
+                    {
+                        memcpy(pu1_dst, pu1_src, ps_codec->s_cfg.u4_wd);
+                        pu1_src += ps_proc->s_inp_buf.s_raw_buf.au4_strd[0];
+                        pu1_dst += ps_proc->i4_src_strd;
+                    }
                 }
                 pu1_src = (UWORD8 *)ps_proc->s_inp_buf.s_raw_buf.apv_bufs[1] + (i4_mb_x * BLK8x8SIZE) +
                           ps_proc->s_inp_buf.s_raw_buf.au4_strd[1] * (i4_mb_y * BLK8x8SIZE);
@@ -1108,12 +1271,15 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
                 /* Last MB row of chroma is copied unconditionally, since trans functions access an extra byte
                  * due to interleaved input
                  */
-                num_rows = (ps_codec->s_cfg.u4_disp_ht >> 1) - (ps_proc->i4_mb_y * BLK8x8SIZE);
+                if (ps_proc->i4_mb_y == (ps_proc->i4_ht_mbs - 1))
+                    num_rows = (ps_codec->s_cfg.u4_disp_ht >> 1) - (ps_proc->i4_mb_y * BLK8x8SIZE);
+                else
+                    num_rows = BLK8x8SIZE;
                 for (i = 0; i < num_rows; i++)
                 {
                     memcpy(pu1_dst, pu1_src, ps_codec->s_cfg.u4_wd);
                     pu1_src += ps_proc->s_inp_buf.s_raw_buf.au4_strd[1];
-                    pu1_dst += ps_proc->i4_src_strd;
+                    pu1_dst += ps_proc->i4_src_chroma_strd;
                 }
 
             }
@@ -1137,7 +1303,7 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
                             ps_proc->s_inp_buf.s_raw_buf.au4_strd[0],
                             ps_proc->s_inp_buf.s_raw_buf.au4_strd[1],
                             ps_proc->s_inp_buf.s_raw_buf.au4_strd[2],
-                            ps_proc->i4_src_strd, ps_proc->i4_src_strd,
+                            ps_proc->i4_src_strd, ps_proc->i4_src_chroma_strd,
                             convert_uv_only);
             break;
 
@@ -1150,8 +1316,8 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
                             ps_proc->pu1_src_buf_chroma,
                             ps_proc->pu1_src_buf_chroma + 1, pu1_y_buf_base,
                             ps_codec->s_cfg.u4_disp_wd, u2_num_rows,
-                            ps_proc->i4_src_strd, ps_proc->i4_src_strd,
-                            ps_proc->i4_src_strd,
+                            ps_proc->i4_src_strd, ps_proc->i4_src_chroma_strd,
+                            ps_proc->i4_src_chroma_strd,
                             ps_proc->s_inp_buf.s_raw_buf.au4_strd[0] >> 1);
             break;
 
@@ -1159,8 +1325,7 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
             break;
     }
 
-    if (u4_pad_right_sz && (ps_proc->i4_mb_x == 0) &&
-                    (ps_proc->i4_src_strd > (WORD32)ps_codec->s_cfg.u4_disp_wd) )
+    if (u4_pad_right_sz && (ps_proc->i4_mb_x == 0))
     {
         UWORD32 u4_pad_wd, u4_pad_ht;
         u4_pad_wd = (UWORD32)(ps_proc->i4_src_strd - ps_codec->s_cfg.u4_disp_wd);
@@ -1175,7 +1340,7 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
 
         ih264_pad_right_chroma(
                         ps_proc->pu1_src_buf_chroma + ps_codec->s_cfg.u4_disp_wd,
-                        ps_proc->i4_src_strd, u4_pad_ht / 2, u4_pad_wd);
+                        ps_proc->i4_src_chroma_strd, u4_pad_ht / 2, u4_pad_wd);
     }
 
     /* pad bottom edge */
@@ -1184,8 +1349,8 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
         ih264_pad_bottom(ps_proc->pu1_src_buf_luma + (MB_SIZE - u4_pad_bottom_sz) * ps_proc->i4_src_strd,
                          ps_proc->i4_src_strd, ps_proc->i4_src_strd, u4_pad_bottom_sz);
 
-        ih264_pad_bottom(ps_proc->pu1_src_buf_chroma + (MB_SIZE - u4_pad_bottom_sz) * ps_proc->i4_src_strd / 2,
-                         ps_proc->i4_src_strd, ps_proc->i4_src_strd, (u4_pad_bottom_sz / 2));
+        ih264_pad_bottom(ps_proc->pu1_src_buf_chroma + (MB_SIZE - u4_pad_bottom_sz) * ps_proc->i4_src_chroma_strd / 2,
+                         ps_proc->i4_src_chroma_strd, ps_proc->i4_src_chroma_strd, (u4_pad_bottom_sz / 2));
     }
 
 
@@ -1206,7 +1371,12 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
     /*********************************************************************/
 
     /* init mv buffer ptr */
-    ps_proc->ps_pu = ps_cur_mv_buf->ps_pic_pu + (i4_mb_y * ps_proc->i4_wd_mbs * (MIN_PU_SIZE * MIN_PU_SIZE));
+    ps_proc->ps_pu = ps_cur_mv_buf->ps_pic_pu + (i4_mb_y * ps_proc->i4_wd_mbs *
+                     ((MB_SIZE * MB_SIZE) / (ENC_MIN_PU_SIZE * ENC_MIN_PU_SIZE)));
+
+    /* Init co-located mv buffer */
+    ps_proc->ps_colpu = ps_proc->aps_mv_buf[1]->ps_pic_pu + (i4_mb_y * ps_proc->i4_wd_mbs *
+                        ((MB_SIZE * MB_SIZE) / (ENC_MIN_PU_SIZE * ENC_MIN_PU_SIZE)));
 
     if (i4_mb_y == 0)
     {
@@ -1214,7 +1384,8 @@ IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc)
     }
     else
     {
-        ps_proc->ps_top_row_pu_ME = ps_cur_mv_buf->ps_pic_pu + ((i4_mb_y - 1) * ps_proc->i4_wd_mbs * (MIN_PU_SIZE * MIN_PU_SIZE));
+        ps_proc->ps_top_row_pu_ME = ps_cur_mv_buf->ps_pic_pu + ((i4_mb_y - 1) * ps_proc->i4_wd_mbs *
+                                    ((MB_SIZE * MB_SIZE) / (ENC_MIN_PU_SIZE * ENC_MIN_PU_SIZE)));
     }
 
     ps_proc->pu4_mb_pu_cnt = ps_cur_mv_buf->pu4_mb_pu_cnt + (i4_mb_y * ps_proc->i4_wd_mbs);
@@ -1743,34 +1914,69 @@ WORD32 ih264e_process(process_ctxt_t *ps_proc)
     WORD32 luma_idx, chroma_idx, is_intra;
 
     /* temp variables */
-    WORD32 ctxt_sel = ps_proc->i4_encode_api_call_cnt & 1;
+    WORD32 ctxt_sel = ps_proc->i4_encode_api_call_cnt % MAX_CTXT_SETS;
 
-    /* list of modes for evaluation */
+    /*
+     * list of modes for evaluation
+     * -------------------------------------------------------------------------
+     * Note on enabling I4x4 and I16x16
+     * At very low QPs the Hadamard transform in I16x16 can push the maximum
+     * coefficient value very high. CAVLC may not be able to represent that value,
+     * so the stream may not be decodable for some clips.
+     * Hence at low QPs we enable I4x4 and disable I16x16 irrespective of preset.
+     */
     if (ps_proc->i4_slice_type == ISLICE)
     {
-        /* enable intra 16x16 */
-        u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_16x16 ? (1 << I16x16) : 0;
+        if (ps_proc->u4_frame_qp > 10)
+        {
+            /* enable intra 16x16 */
+            u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_16x16 ? (1 << I16x16) : 0;
 
-        /* enable intra 8x8 */
-        u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_8x8 ? (1 << I8x8) : 0;
+            /* enable intra 8x8 */
+            u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_8x8 ? (1 << I8x8) : 0;
+        }
 
         /* enable intra 4x4 */
         u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_4x4 ? (1 << I4x4) : 0;
+        u4_valid_modes |= (ps_proc->u4_frame_qp <= 10) << I4x4;
+
     }
     else if (ps_proc->i4_slice_type == PSLICE)
     {
-        /* enable intra 16x16 */
-        u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_16x16 ? (1 << I16x16) : 0;
+        if (ps_proc->u4_frame_qp > 10)
+        {
+            /* enable intra 16x16 */
+            u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_16x16 ? (1 << I16x16) : 0;
+        }
 
         /* enable intra 4x4 */
         if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_SLOWEST)
         {
             u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_4x4 ? (1 << I4x4) : 0;
         }
+        u4_valid_modes |= (ps_proc->u4_frame_qp <= 10) << I4x4;
 
-        /* enable inter 16x16 */
+        /* enable inter P16x16 */
         u4_valid_modes |= (1 << P16x16);
     }
+    else if (ps_proc->i4_slice_type == BSLICE)
+    {
+        if (ps_proc->u4_frame_qp > 10)
+        {
+            /* enable intra 16x16 */
+            u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_16x16 ? (1 << I16x16) : 0;
+        }
+
+        /* enable intra 4x4 */
+        if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_SLOWEST)
+        {
+            u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_4x4 ? (1 << I4x4) : 0;
+        }
+        u4_valid_modes |= (ps_proc->u4_frame_qp <= 10) << I4x4;
+
+        /* enable inter B16x16 */
+        u4_valid_modes |= (1 << B16x16);
+    }
 
 
     /* init entropy */
@@ -1806,7 +2012,7 @@ WORD32 ih264e_process(process_ctxt_t *ps_proc)
                             (ps_codec->pu2_intr_rfrsh_map[i4_mb_id] != ps_codec->i4_air_pic_cnt);
 
             /* evaluate inter 16x16 modes */
-            if (u4_valid_modes & (1 << P16x16))
+            if ((u4_valid_modes & (1 << P16x16)) || (u4_valid_modes & (1 << B16x16)))
             {
                 /* compute nmb me */
                 if (ps_proc->i4_mb_x % ps_proc->u4_nmb_me == 0)
@@ -1823,9 +2029,9 @@ WORD32 ih264e_process(process_ctxt_t *ps_proc)
                     ps_proc->u4_min_sad_reached = ps_proc->ps_nmb_info[u4_mb_index].u4_min_sad_reached;
                     ps_proc->u4_min_sad = ps_proc->ps_nmb_info[u4_mb_index].u4_min_sad;
 
-                    ps_proc->ps_skip_mv = &(ps_proc->ps_nmb_info[u4_mb_index].s_skip_mv);
+                    ps_proc->ps_skip_mv = &(ps_proc->ps_nmb_info[u4_mb_index].as_skip_mv[0]);
                     ps_proc->ps_ngbr_avbl = &(ps_proc->ps_nmb_info[u4_mb_index].s_ngbr_avbl);
-                    ps_proc->ps_pred_mv = &(ps_proc->ps_nmb_info[u4_mb_index].s_pred_mv);
+                    ps_proc->ps_pred_mv = &(ps_proc->ps_nmb_info[u4_mb_index].as_pred_mv[0]);
 
                     ps_proc->i4_mb_distortion = ps_proc->ps_nmb_info[u4_mb_index].i4_mb_distortion;
                     ps_proc->i4_mb_cost = ps_proc->ps_nmb_info[u4_mb_index].i4_mb_cost;
@@ -1889,7 +2095,7 @@ WORD32 ih264e_process(process_ctxt_t *ps_proc)
             {
                 /* intra gating in inter slices */
                 /* No need of gating if we want to force intra, we need to find the threshold only if inter is enabled by AIR*/
-                if (i4_air_enable_inter && ps_proc->i4_slice_type == PSLICE && ps_codec->u4_inter_gate)
+                if (i4_air_enable_inter && ps_proc->i4_slice_type != ISLICE && ps_codec->u4_inter_gate)
                 {
                     /* distortion of neighboring blocks */
                     WORD32 i4_distortion[4];
@@ -1906,6 +2112,7 @@ WORD32 ih264e_process(process_ctxt_t *ps_proc)
 
                 }
 
+
                 /* If we are going to force intra we need to evaluate intra irrespective of gating */
                 if ( (!i4_air_enable_inter) || ((i4_gate_threshold + 16 *((WORD32) ps_proc->u4_lambda)) < ps_proc->i4_mb_distortion))
                 {
@@ -1933,10 +2140,10 @@ WORD32 ih264e_process(process_ctxt_t *ps_proc)
                     {
                         ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(ps_proc);
                     }
-                }
 
-            }
+                }
         }
+     }
 
         /* is intra */
         if (ps_proc->u4_mb_type == I4x4 || ps_proc->u4_mb_type == I16x16 || ps_proc->u4_mb_type == I8x8)
@@ -1955,13 +2162,14 @@ WORD32 ih264e_process(process_ctxt_t *ps_proc)
             is_intra = 0;
         }
         ps_proc->u4_is_intra = is_intra;
+        ps_proc->ps_pu->b1_intra_flag = is_intra;
 
         /* redo MV pred of neighbors in the case intra mb */
         /* TODO : currently called unconditionally, needs to be called only in the case of intra
          * to modify neighbors */
         if (ps_proc->i4_slice_type != ISLICE)
         {
-            ih264e_mv_pred(ps_proc);
+            ih264e_mv_pred(ps_proc, ps_proc->i4_slice_type);
         }
 
         /* Perform luma mb core coding */
@@ -1973,18 +2181,18 @@ WORD32 ih264e_process(process_ctxt_t *ps_proc)
         /* coded block pattern */
         ps_proc->u4_cbp = (u4_cbp_c << 4) | u4_cbp_l;
 
-        /* mb skip */
-        if (is_intra == 0)
+        if (!ps_proc->u4_is_intra)
         {
-            if (ps_proc->u4_cbp == 0)
+            if (ps_proc->i4_slice_type == BSLICE)
             {
-                /* get skip mv */
-                UWORD32 u4_for_me = 0;
-                ih264e_find_skip_motion_vector(ps_proc,u4_for_me);
-
-                /* skip ? */
-                if (ps_proc->ps_skip_mv->i2_mvx == ps_proc->ps_pu->s_l0_mv.i2_mvx &&
-                                ps_proc->ps_skip_mv->i2_mvy == ps_proc->ps_pu->s_l0_mv.i2_mvy)
+                if (ih264e_find_bskip_params(ps_proc, PRED_L0))
+                {
+                    ps_proc->u4_mb_type = (ps_proc->u4_cbp) ? BDIRECT : BSKIP;
+                }
+            }
+            else if(!ps_proc->u4_cbp)
+            {
+                if (ih264e_find_pskip_params(ps_proc, PRED_L0))
                 {
                     ps_proc->u4_mb_type = PSKIP;
                 }
@@ -2090,106 +2298,6 @@ UPDATE_MB_INFO:
 *******************************************************************************
 *
 * @brief
-*  function to receive frame qp and pic type before encoding
-*
-* @par Description:
-*  Before encoding the frame, this function calls the rc library for frame qp
-*  and picture type
-*
-* @param[in] ps_codec
-*  Pointer to codec context
-*
-* @param[in] pic_cnt
-*  pic count
-*
-* @param[out] pi4_pic_type
-*  pic type
-
-* @returns skip_src
-*  if the source frame rate and target frame rate are not identical, the encoder
-*  skips few source frames. skip_src is set when the source need not be encoded.
-*
-* @remarks none
-*
-*******************************************************************************
-*/
-WORD32 ih264e_set_rc_pic_params(codec_t *ps_codec, WORD32 cur_pic_cnt, WORD32 *pi4_pic_type)
-{
-    /* rate control context */
-    rate_control_ctxt_t *ps_rate_control = &ps_codec->s_rate_control;
-
-    /* frame qp */
-    UWORD8 u1_frame_qp;
-
-    /* pic type */
-    PIC_TYPE_T pic_type = PIC_NA;
-
-    /* should src be skipped */
-    WORD32 skip_src = 0;
-
-    /* temp var */
-    WORD32 delta_time_stamp = 1;
-
-    /* see if the app requires any specific frame */
-    if (ps_codec->force_curr_frame_type == IV_IDR_FRAME || ps_codec->force_curr_frame_type == IV_I_FRAME)
-    {
-        irc_force_I_frame(ps_codec->s_rate_control.pps_rate_control_api);
-    }
-
-    /* call rate control lib to get curr pic type and qp to be used */
-    skip_src = ih264e_rc_pre_enc(ps_rate_control->pps_rate_control_api,
-                                 ps_rate_control->pps_pd_frm_rate,
-                                 ps_rate_control->pps_time_stamp,
-                                 ps_rate_control->pps_frame_time,
-                                 delta_time_stamp,
-                                 (ps_codec->s_cfg.i4_wd_mbs * ps_codec->s_cfg.i4_ht_mbs),
-                                 &ps_rate_control->e_pic_type,
-                                 &u1_frame_qp);
-
-    switch (ps_rate_control->e_pic_type)
-    {
-        case I_PIC:
-            pic_type = PIC_I;
-            break;
-
-        case P_PIC:
-            pic_type = PIC_P;
-            break;
-
-        case B_PIC:
-            pic_type = PIC_B;
-            break;
-
-        default:
-            break;
-    }
-
-    /* is idr? */
-    if ((0 == cur_pic_cnt % ps_codec->s_cfg.u4_idr_frm_interval) ||
-                    ps_codec->force_curr_frame_type == IV_IDR_FRAME)
-    {
-        pic_type = PIC_IDR;
-    }
-
-    /* force frame tag is not sticky */
-    if (ps_codec->force_curr_frame_type == IV_IDR_FRAME || ps_codec->force_curr_frame_type == IV_I_FRAME)
-    {
-        ps_codec->force_curr_frame_type = IV_NA_FRAME;
-    }
-
-    /* qp */
-    ps_codec->u4_frame_qp = gau1_mpeg2_to_h264_qmap[u1_frame_qp];
-
-    /* pic type */
-    *pi4_pic_type = pic_type;
-
-    return skip_src;
-}
-
-/**
-*******************************************************************************
-*
-* @brief
 *  Function to update rc context after encoding
 *
 * @par   Description
@@ -2214,7 +2322,7 @@ WORD32 ih264e_set_rc_pic_params(codec_t *ps_codec, WORD32 cur_pic_cnt, WORD32 *p
 *
 *******************************************************************************
 */
-WORD32 ih264e_update_rc_post_enc(codec_t *ps_codec, WORD32 ctxt_sel, WORD32 pic_cnt)
+WORD32 ih264e_update_rc_post_enc(codec_t *ps_codec, WORD32 ctxt_sel, WORD32 i4_is_first_frm)
 {
     /* proc set base idx */
     WORD32 i4_proc_ctxt_sel_base = ctxt_sel ? (MAX_PROCESS_CTXT / 2) : 0;
@@ -2295,18 +2403,11 @@ WORD32 ih264e_update_rc_post_enc(codec_t *ps_codec, WORD32 ctxt_sel, WORD32 pic_
                                           ps_codec->s_rate_control.pps_frame_time,
                                           (ps_proc->i4_wd_mbs * ps_proc->i4_ht_mbs),
                                           &rc_pic_type,
-                                          pic_cnt,
+                                          i4_is_first_frm,
                                           &ps_codec->s_rate_control.post_encode_skip[ctxt_sel],
                                           u1_frame_qp,
                                           &ps_codec->s_rate_control.num_intra_in_prev_frame,
                                           &ps_codec->s_rate_control.i4_avg_activity);
-
-    /* in case the frame needs to be skipped, the frame num should not be incremented */
-    if (ps_codec->s_rate_control.post_encode_skip[ctxt_sel])
-    {
-        ps_codec->i4_frame_num --;
-    }
-
     return i4_stuffing_byte;
 }
 
@@ -2358,7 +2459,7 @@ WORD32 ih264e_process_thread(void *pv_proc)
             int error = ithread_mutex_lock(ps_codec->pv_entropy_mutex);
 
             /* codec context selector */
-            WORD32 ctxt_sel = ps_codec->i4_encode_api_call_cnt & 1;
+            WORD32 ctxt_sel = ps_codec->i4_encode_api_call_cnt % MAX_CTXT_SETS;
 
             volatile UWORD32 *pu4_buf = &ps_codec->au4_entropy_thread_active[ctxt_sel];
 
diff --git a/encoder/ih264e_process.h b/encoder/ih264e_process.h
index 9715434..9cfdac8 100644
--- a/encoder/ih264e_process.h
+++ b/encoder/ih264e_process.h
@@ -284,36 +284,6 @@ WORD32 ih264e_process(process_ctxt_t *ps_proc);
 *******************************************************************************
 *
 * @brief
-*  function to receive frame qp and pic type before encoding
-*
-* @par Description:
-*  Before encoding the frame, this function calls the rc library for frame qp
-*  and picture type
-*
-* @param[in] ps_codec
-*  Pointer to codec context
-*
-* @param[in] pic_cnt
-*  pic count
-*
-* @param[out] pi4_pic_type
-*  pic type
-
-* @returns skip_src
-*  if the source frame rate and target frame rate are not identical, the encoder
-*  skips few source frames. skip_src is set when the source need not be encoded.
-*
-* @remarks none
-*
-*******************************************************************************
-*/
-WORD32 ih264e_set_rc_pic_params(codec_t *ps_codec, WORD32 cur_pic_cnt, WORD32 *pi4_pic_type);
-
-
-/**
-*******************************************************************************
-*
-* @brief
 *  Function to update rc context after encoding
 *
 * @par   Description
diff --git a/encoder/ih264e_rate_control.c b/encoder/ih264e_rate_control.c
index 1e2fe4f..1da2f03 100644
--- a/encoder/ih264e_rate_control.c
+++ b/encoder/ih264e_rate_control.c
@@ -63,6 +63,7 @@
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
 #include "ih264_common_tables.h"
+#include "ih264_cabac_tables.h"
 #include "ih264e_defs.h"
 #include "ih264e_globals.h"
 #include "irc_mem_req_and_acq.h"
@@ -75,7 +76,9 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_utils.h"
 #include "irc_trace_support.h"
@@ -186,6 +189,7 @@ void ih264e_rc_init(void *pv_rc_api,
                     UWORD32 u4_peak_bit_rate,
                     UWORD32 u4_max_delay,
                     UWORD32 u4_intra_frame_interval,
+                    WORD32  i4_inter_frm_int,
                     UWORD8 *pu1_init_qp,
                     WORD32 i4_max_inter_frm_int,
                     UWORD8 *pu1_min_max_qp,
@@ -230,6 +234,9 @@ void ih264e_rc_init(void *pv_rc_api,
     u4_src_ticks = ih264e_frame_time_get_src_ticks(pv_frame_time);
     u4_tgt_ticks = ih264e_frame_time_get_tgt_ticks(pv_frame_time);
 
+    /* Derive max inter frame interval from the inter frame interval (overrides the value passed in) */
+    i4_max_inter_frm_int = (i4_inter_frm_int == 1) ? 2 : (i4_inter_frm_int + 2);
+
     /* Initialize the rate control */
     irc_initialise_rate_control(pv_rc_api,                  /* RC handle */
                                 e_rate_control_type,        /* RC algo type */
@@ -240,6 +247,7 @@ void ih264e_rc_init(void *pv_rc_api,
                                 u4_src_frm_rate,            /* Src frame_rate */
                                 u4_max_delay,               /* Max buffer delay */
                                 u4_intra_frame_interval,    /* Intra frm_interval */
+                                i4_inter_frm_int,           /* Inter frame interval */
                                 pu1_init_qp,                /* Init QP array[3]:[I][P][B] */
                                 u4_max_cpb_size,            /* Max VBV/CPB Buffer Size */
                                 i4_max_inter_frm_int,       /* Max inter frm_interval */
@@ -268,13 +276,13 @@ void ih264e_rc_init(void *pv_rc_api,
 *
 *******************************************************************************
 */
-picture_type_e ih264e_rc_get_picture_details(void *pv_rc_api)
+picture_type_e ih264e_rc_get_picture_details(void *pv_rc_api,
+                                             WORD32 *pi4_pic_id,
+                                             WORD32 *pi4_pic_disp_order_no)
 {
-    WORD32 i4_pic_id = 0;
-    WORD32 i4_pic_disp_order_no = 0;
     picture_type_e e_rc_pic_type = P_PIC;
 
-    irc_get_picture_details(pv_rc_api, &i4_pic_id, &i4_pic_disp_order_no,
+    irc_get_picture_details(pv_rc_api, pi4_pic_id, pi4_pic_disp_order_no,
                             &e_rc_pic_type);
 
     return (e_rc_pic_type);
@@ -286,8 +294,9 @@ picture_type_e ih264e_rc_get_picture_details(void *pv_rc_api)
 * @brief  Function to get rate control output before encoding
 *
 * @par Description
-*  This function is called before encoding the current frame and gets the qp
-*  for the current frame from rate control module
+*  This function is called before queuing the current frame. It decides if we should
+*  skip the current input buffer due to frame rate mismatch. It also updates RC about
+*  the achievable frame rate
 *
 * @param[in] ps_rate_control_api
 *  Handle to rate control api
@@ -314,138 +323,58 @@ picture_type_e ih264e_rc_get_picture_details(void *pv_rc_api)
 *  QP for current frame
 *
 * @returns
-*  Skip or encode the current frame
+*  Skip or queue the current frame
 *
 * @remarks
 *
 *******************************************************************************
 */
-WORD32 ih264e_rc_pre_enc(void * ps_rate_control_api,
-                         void * ps_pd_frm_rate,
-                         void * ps_time_stamp,
-                         void * ps_frame_time,
-                         WORD32 i4_delta_time_stamp,
-                         WORD32 i4_total_mb_in_frame,
-                         picture_type_e *pe_vop_coding_type,
-                         UWORD8 *pu1_frame_qp)
+WORD32 ih264e_update_rc_framerates(void *ps_rate_control_api,
+                                   void *ps_pd_frm_rate,
+                                   void *ps_time_stamp,
+                                   void *ps_frame_time)
 {
-    WORD8 i4_skip_src = 0, i4_num_app_skips = 0;
+    WORD8 i4_skip_src = 0;
     UWORD32 u4_src_not_skipped_for_dts = 0;
 
-    /* Variables for the update_frm_level_info */
-    WORD32  ai4_tot_mb_in_type[MAX_MB_TYPE];
-    WORD32  ai4_tot_mb_type_qp[MAX_MB_TYPE]    = {0, 0};
-    WORD32  ai4_mb_type_sad[MAX_MB_TYPE]       = {0, 0};
-    WORD32  ai4_mb_type_tex_bits[MAX_MB_TYPE]  = {0, 0};
-    WORD32   i4_total_frame_bits               = 0;
-    WORD32   i4_total_hdr_bits                 = 0;
-    WORD32   i4_avg_mb_activity                = 0;
-    WORD32   i4_intra_frm_cost                 = 0;
-    UWORD8   u1_is_scd                         = 0;
-
-    /* Set all the MBs to Intra */
-    ai4_tot_mb_in_type[0] = i4_total_mb_in_frame;
-    ai4_tot_mb_in_type[1] = 0;
-
-    /* If delta time stamp is greater than 1, do rcupdate that many times */
-    for (i4_num_app_skips = 0; (i4_num_app_skips < i4_delta_time_stamp - 1); i4_num_app_skips++)
-    {
-        /*update the missing frames frm_rate with 0 */
-        ih264e_update_pd_frm_rate(ps_pd_frm_rate,0);
-
-        /* Update the time stamp */
-        ih264e_update_time_stamp(ps_time_stamp);
-
-        /* Do a pre encode skip update */
-
-        irc_update_frame_level_info(ps_rate_control_api,
-                                    (*pe_vop_coding_type),
-                                    ai4_mb_type_sad,        /* Frame level SAD for each type of MB[Intra/Inter] */
-                                    i4_total_frame_bits,    /* Total frame bits actually consumed */
-                                    i4_total_hdr_bits,      /*header bits for model updation*/
-                                    ai4_mb_type_tex_bits,   /* Total texture bits consumed for each type of MB[Intra/Inter] used for model */
-                                    ai4_tot_mb_type_qp,     /* Total qp of all MBs based on mb type */
-                                    ai4_tot_mb_in_type,     /* total number of mbs in each mb type */
-                                    i4_avg_mb_activity,     /* Average mb activity in frame */
-                                    u1_is_scd,              /* Is a scene change detected at the current frame */
-                                    1,                      /* If it's a pre-encode skip */
-                                    i4_intra_frm_cost,      /* Sum of Intra cost for each frame */
-                                    0);                     /* Is pic handling [irc_update_pic_handling_state] done before update */
-    }
-
     /* Update the time stamp for the current frame */
     ih264e_update_time_stamp(ps_time_stamp);
 
     /* Check if a src not needs to be skipped */
     i4_skip_src = ih264e_should_src_be_skipped(ps_frame_time,
-                                               i4_delta_time_stamp,
+                                               1,
                                                &u4_src_not_skipped_for_dts);
 
-    /***********************************************************************
-       Based on difference in source and target frame rate frames are skipped
-     ***********************************************************************/
     if (i4_skip_src)
     {
+        /***********************************************************************
+         *Based on difference in source and target frame rate frames are skipped
+         ***********************************************************************/
         /*update the missing frames frm_rate with 0 */
-        ih264e_update_pd_frm_rate(ps_pd_frm_rate,0);
-
-        /* Do a pre encode skip update */
-        irc_update_frame_level_info(ps_rate_control_api,
-                                    (*pe_vop_coding_type),
-                                    ai4_mb_type_sad,        /* Frame level SAD for each type of MB[Intra/Inter] */
-                                    i4_total_frame_bits,    /* Total frame bits actually consumed */
-                                    i4_total_hdr_bits,      /*header bits for model updation*/
-                                    ai4_mb_type_tex_bits,   /* Total texture bits consumed for each type of MB[Intra/Inter] used for model */
-                                    ai4_tot_mb_type_qp,     /* Total qp of all MBs based on mb type */
-                                    ai4_tot_mb_in_type,     /* total number of mbs in each mb type */
-                                    i4_avg_mb_activity,     /* Average mb activity in frame */
-                                    u1_is_scd,              /* Is a scene change detected at the current frame */
-                                    1,                      /* If it's a pre-encode skip */
-                                    i4_intra_frm_cost,      /* Sum of Intra cost for each frame */
-                                    0);                     /* Is pic handling [irc_update_pic_handling_state] done before update */
-
-        /* Set the current frame type to NA */
-        *pe_vop_coding_type = BUF_PIC;
+        ih264e_update_pd_frm_rate(ps_pd_frm_rate, 0);
     }
     else
     {
-#define MAX_FRAME_BITS 0x7FFFFFFF
-//        WORD32         i4_pic_id;
-//        WORD32         i4_pic_disp_order_no;
         WORD32 i4_avg_frm_rate, i4_source_frame_rate;
 
-        i4_source_frame_rate = ih264e_frame_time_get_src_frame_rate(ps_frame_time);
+        i4_source_frame_rate = ih264e_frame_time_get_src_frame_rate(
+                        ps_frame_time);
 
         /* Update the frame rate of the frame present with the tgt_frm_rate */
         /* If the frm was not skipped due to delta_time_stamp, update the
-           frame_rate with double the tgt_frame_rate value, so that it makes
-           up for one of the frames skipped by the application */
-        ih264e_update_pd_frm_rate(ps_pd_frm_rate,
-                                  i4_source_frame_rate);
+         frame_rate with double the tgt_frame_rate value, so that it makes
+         up for one of the frames skipped by the application */
+        ih264e_update_pd_frm_rate(ps_pd_frm_rate, i4_source_frame_rate);
 
         /* Based on the update get the average frame rate */
         i4_avg_frm_rate = ih264e_get_pd_avg_frm_rate(ps_pd_frm_rate);
 
         /* Call the RC library function to change the frame_rate to the
-           actually achieved frm_rate */
+         actually achieved frm_rate */
         irc_change_frm_rate_for_bit_alloc(ps_rate_control_api, i4_avg_frm_rate);
-
-        /* --------Rate control related things.  Get pic type and frame Qp---------*/
-        /* Add picture to the stack. For IPP encoder we push the variable
-           into the stack and get back the variables by requesting RC.
-           This interface is designed for IPB encoder */
-        irc_add_picture_to_stack(ps_rate_control_api, 1);
-
-        /* Query the picture_type */
-        *pe_vop_coding_type = ih264e_rc_get_picture_details(ps_rate_control_api);
-
-        /* Get current frame Qp */
-        pu1_frame_qp[0] = (UWORD8)irc_get_frame_level_qp(ps_rate_control_api,
-                                                         (picture_type_e)(pe_vop_coding_type[0]),
-                                                         MAX_FRAME_BITS);
     }
 
-    return(i4_skip_src);
+    return (i4_skip_src);
 }
 
 /**
@@ -678,8 +607,8 @@ WORD32 ih264e_rc_post_enc(void * ps_rate_control_api,
             &u1_enc_buf_overflow,&u1_enc_buf_underflow);
 
         /* We skip the frame if decoder buffer is underflowing. But we never skip first I frame */
-        // if((u1_enc_buf_overflow == 1) && (i4_is_first_frame != 1))
-        if ((u1_enc_buf_overflow == 1) && (i4_is_first_frame != 0))
+        if ((u1_enc_buf_overflow == 1) && (i4_is_first_frame != 1))
+        // if ((u1_enc_buf_overflow == 1) && (i4_is_first_frame != 0))
         {
             irc_post_encode_frame_skip(ps_rate_control_api, (picture_type_e)pe_vop_coding_type[0]);
             // i4_total_frame_bits = imp4_write_skip_frame_header(ps_enc);
diff --git a/encoder/ih264e_rate_control.h b/encoder/ih264e_rate_control.h
index de9466a..cca9ad3 100644
--- a/encoder/ih264e_rate_control.h
+++ b/encoder/ih264e_rate_control.h
@@ -90,6 +90,9 @@
 * @param[in] u4_intra_frame_interval
 *  Intra frame interval
 *
+* @param[in] i4_inter_frm_int
+*  Inter frame interval
+*
 * @param[in] pu1_init_qp
 *  Initial qp
 *
@@ -120,6 +123,7 @@ void ih264e_rc_init(void *pv_rc_api,
                     UWORD32 u4_peak_bit_rate,
                     UWORD32 u4_max_delay,
                     UWORD32 u4_intra_frame_interval,
+                    WORD32  i4_inter_frm_int,
                     UWORD8 *pu1_init_qp,
                     WORD32 i4_max_inter_frm_int,
                     UWORD8 *pu1_min_max_qp,
@@ -143,13 +147,15 @@ void ih264e_rc_init(void *pv_rc_api,
 *
 *******************************************************************************
 */
-picture_type_e ih264e_rc_get_picture_details(void *pv_rc_api);
+picture_type_e ih264e_rc_get_picture_details(void *pv_rc_api,
+                                             WORD32 *pi4_pic_id,
+                                             WORD32 *pi4_pic_disp_order_no);
 
 
 /**
 *******************************************************************************
 *
-* @brief  Function to get rate control output before encoding
+* @brief  Function to set frame rate inside RC.
 *
 * @par Description
 *  This function is called before encoding the current frame and gets the qp
@@ -167,18 +173,6 @@ picture_type_e ih264e_rc_get_picture_details(void *pv_rc_api);
 * @param[in] ps_frame_time
 *  Handle to frame time context
 *
-* @param[in] i4_delta_time_stamp
-*  Time stamp difference between frames
-*
-* @param[in] i4_total_mb_in_frame
-*  Total Macro Blocks in frame
-*
-* @param[in/out] pe_vop_coding_type
-*  Picture coding type(I/P/B)
-*
-* @param[in/out] pu1_frame_qp
-*  QP for current frame
-*
 * @returns
 *  Skip or encode the current frame
 *
@@ -186,14 +180,11 @@ picture_type_e ih264e_rc_get_picture_details(void *pv_rc_api);
 *
 *******************************************************************************
 */
-WORD32 ih264e_rc_pre_enc(void * ps_rate_control_api,
-                         void * ps_pd_frm_rate,
-                         void * ps_time_stamp,
-                         void * ps_frame_time,
-                         WORD32 i4_delta_time_stamp,
-                         WORD32 i4_total_mb_in_frame,
-                         picture_type_e *pe_vop_coding_type,
-                         UWORD8 *pu1_frame_qp);
+WORD32 ih264e_update_rc_framerates(void *ps_rate_control_api,
+                         void *ps_pd_frm_rate,
+                         void *ps_time_stamp,
+                         void *ps_frame_time
+                         );
 
 /**
 *******************************************************************************
diff --git a/encoder/ih264e_rc_mem_interface.c b/encoder/ih264e_rc_mem_interface.c
index e4d5781..a74513a 100644
--- a/encoder/ih264e_rc_mem_interface.c
+++ b/encoder/ih264e_rc_mem_interface.c
@@ -62,10 +62,10 @@
 #include "iv2.h"
 #include "ive2.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
 #include "ih264e.h"
 #include "ithread.h"
-#include "ih264e.h"
 #include "ih264_defs.h"
 #include "ih264_debug.h"
 #include "ih264_macros.h"
@@ -80,12 +80,14 @@
 #include "ih264_deblk_edge_filters.h"
 #include "ih264_common_tables.h"
 #include "ih264_list.h"
+#include "ih264_cabac_tables.h"
 #include "ih264e_error.h"
 #include "ih264e_defs.h"
 #include "ih264e_bitstream.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_master.h"
 #include "ih264_buf_mgr.h"
@@ -93,12 +95,8 @@
 #include "ih264e_utils.h"
 #include "ih264e_platform_macros.h"
 #include "ih264_cavlc_tables.h"
-#include "ih264e_config.h"
 #include "ih264e_statistics.h"
 #include "ih264e_trace.h"
-#include "ih264e_statistics.h"
-#include "ih264e_error.h"
-#include "ih264e_utils.h"
 #include "ih264e_fmt_conv.h"
 #include "ih264e_cavlc.h"
 #include "ih264e_rc_mem_interface.h"
@@ -332,7 +330,7 @@ WORD32 ih264e_get_rate_control_mem_tab(void *pv_rate_control,
                                        iv_mem_rec_t  *ps_mem,
                                        ITT_FUNC_TYPE_E e_func_type)
 {
-    static itt_memtab_t as_itt_memtab[NUM_RC_MEMTABS];
+    itt_memtab_t as_itt_memtab[NUM_RC_MEMTABS];
     WORD32 i4_num_memtab = 0, j = 0;
     void *refptr2[4];
     void **refptr1[4];
diff --git a/encoder/ih264e_structs.h b/encoder/ih264e_structs.h
index 1043a53..fc61277 100644
--- a/encoder/ih264e_structs.h
+++ b/encoder/ih264e_structs.h
@@ -39,6 +39,15 @@
 #define IH264E_STRUCTS_H_
 
 /*****************************************************************************/
+/* Structure definitions                                                    */
+/*****************************************************************************/
+
+/* Early declaration of structs */
+typedef struct _codec_t codec_t;
+typedef struct _proc_t process_ctxt_t;
+
+
+/*****************************************************************************/
 /* Extern Function type definitions                                          */
 /*****************************************************************************/
 
@@ -154,6 +163,22 @@ typedef void (*pf_fmt_conv_422ile_to_420sp)(UWORD8 *pu1_y_buf, UWORD8 *pu1_u_buf
                                             WORD32 u4_422i_stride);
 
 
+
+/**
+******************************************************************************
+ *  @brief     ME evaluation
+******************************************************************************
+ */
+typedef void ih264e_compute_me_ft(process_ctxt_t *);
+
+/**
+******************************************************************************
+ *  @brief     SKIP decision
+******************************************************************************
+ */
+typedef WORD32 ih264e_skip_params_ft(process_ctxt_t *, WORD32);
+
+
 /*****************************************************************************/
 /* Enums                                                                     */
 /*****************************************************************************/
@@ -196,11 +221,27 @@ typedef enum
  */
 typedef struct
 {
+    /**
+     *  Motion Vector
+     */
+    mv_t s_mv;
 
     /**
-     *  L0 Motion Vector
+     *  Ref index
      */
-    mv_t s_l0_mv;
+    WORD8   i1_ref_idx;
+
+} enc_pu_mv_t;
+
+
+/*
+ * Total Pu info for an MB
+ */
+typedef struct
+{
+
+    /* Array with ME info for all lists */
+    enc_pu_mv_t  s_me_info[2];
 
     /**
      *  PU X position in terms of min PU (4x4) units
@@ -223,13 +264,18 @@ typedef struct
     UWORD32     b4_ht           : 2;
 
     /**
-     *  L0 Ref index
+     *  Intra or Inter flag for each partition - 0 or 1
      */
-    WORD8   i1_l0_ref_idx;
+    UWORD32     b1_intra_flag   : 1;
+
+    /**
+     *  PRED_L0, PRED_L1, PRED_BI
+     */
+    UWORD32     b2_pred_mode    : 2;
+
 
 } enc_pu_t;
 
-typedef struct _codec_t codec_t;
 
 typedef struct
 {
@@ -336,7 +382,7 @@ typedef struct
     UWORD32                                     u4_max_bitrate;
 
     /** Maximum number of consecutive  B frames                             */
-    UWORD32                                     u4_max_num_bframes;
+    UWORD32                                     u4_num_bframes;
 
     /** Content type Interlaced/Progressive                                 */
     IV_CONTENT_TYPE_T                           e_content_type;
@@ -473,9 +519,6 @@ typedef struct
     /** IDR frame interval                                              */
     UWORD32                                     u4_idr_frm_interval;
 
-    /** consecutive B frames                                            */
-    UWORD32                                     u4_num_b_frames;
-
     /** Disable deblock level (0: Enable completely, 3: Disable completely */
     UWORD32                                     u4_disable_deblock_level;
 
@@ -859,6 +902,10 @@ typedef struct
  */
 typedef struct
 {
+    /**
+     * Pointer to the cabac context
+     */
+    cabac_ctxt_t *ps_cabac;
 
     /**
      * start of frame / start of slice flag
@@ -1142,9 +1189,9 @@ typedef struct
     WORD32  i4_mb_cost;
     WORD32  i4_mb_distortion;
 
+    enc_pu_mv_t as_skip_mv[4];
 
-    mv_t    s_skip_mv;
-    mv_t    s_pred_mv;
+    enc_pu_mv_t as_pred_mv[2];
 
     block_neighbors_t s_ngbr_avbl;
 
@@ -1165,7 +1212,7 @@ typedef struct
  *  @brief      Pixel processing thread context
  ******************************************************************************
  */
-typedef struct
+struct _proc_t
 {
     /**
      * entropy context
@@ -1210,12 +1257,12 @@ typedef struct
     /**
      * Ref pointer to current MB luma
      */
-    UWORD8 *pu1_ref_buf_luma;
+    UWORD8 *apu1_ref_buf_luma[MAX_REF_PIC_CNT];
 
     /**
      * Ref pointer to current MB chroma
      */
-    UWORD8 *pu1_ref_buf_chroma;
+    UWORD8 *apu1_ref_buf_chroma[MAX_REF_PIC_CNT];
 
     /**
      * pointer to luma plane of input buffer (base :: mb (0,0))
@@ -1230,7 +1277,7 @@ typedef struct
     /**
      * pointer to luma plane of ref buffer (base :: mb (0,0))
      */
-    UWORD8 *pu1_ref_buf_luma_base;
+    UWORD8 *apu1_ref_buf_luma_base[MAX_REF_PIC_CNT];
 
     /**
      * pointer to  chroma plane of input buffer (base :: mb (0,0))
@@ -1256,7 +1303,7 @@ typedef struct
     /**
      * pointer to  chroma plane of reconstructed buffer (base :: mb (0,0))
      */
-    UWORD8 *pu1_ref_buf_chroma_base;
+    UWORD8 *apu1_ref_buf_chroma_base[MAX_REF_PIC_CNT];
 
     /**
      * Pointer to ME NMB info
@@ -1266,12 +1313,16 @@ typedef struct
     mb_info_nmb_t *ps_cur_mb;
 
     /**
-     * source stride
-     * (strides for luma and chroma are the same)
+     * source luma stride
      */
     WORD32 i4_src_strd;
 
     /**
+     * source chroma stride
+     */
+    WORD32 i4_src_chroma_strd;
+
+    /**
      * recon stride & ref stride
      * (strides for luma and chroma are the same)
      */
@@ -1504,9 +1555,19 @@ typedef struct
     enc_pu_t *ps_pu;
 
     /**
+     * Pointer to the pu of current co-located MB in list 1
+     */
+    enc_pu_t *ps_colpu;
+
+    /**
      * predicted motion vector
      */
-    mv_t *ps_pred_mv;
+    enc_pu_mv_t *ps_skip_mv;
+
+    /**
+     * predicted motion vector
+     */
+    enc_pu_mv_t *ps_pred_mv;
 
     /**
      * top row mb syntax information base
@@ -1554,7 +1615,6 @@ typedef struct
      */
     enc_pu_t s_top_left_mb_pu_ME;
 
-
     /**
      * mb neighbor availability pointer
      */
@@ -1590,11 +1650,6 @@ typedef struct
     UWORD8 *pu1_top_mb_intra_modes;
 
     /**
-     * skip motion vector info
-     */
-    mv_t *ps_skip_mv;
-
-    /**
      * left mb motion vector
      */
     enc_pu_t s_left_mb_pu;
@@ -1802,9 +1857,14 @@ typedef struct
 
     /**
      * Reference picture for the current picture
-     * TODO: Only 1 reference assumed currently
+     * TODO: Only 2 references assumed currently
      */
-    pic_buf_t *ps_ref_pic;
+    pic_buf_t *aps_ref_pic[MAX_REF_PIC_CNT];
+
+    /**
+     * Reference MV buff for the current picture
+     */
+    mv_buf_t *aps_mv_buf[MAX_REF_PIC_CNT];
 
     /**
      * frame info used by RC
@@ -1834,27 +1894,10 @@ typedef struct
      */
     UWORD32 u4_compute_recon;
 
-   /*
-    * Buffer for holding half_x (1/2,1 - interpolated)
-    * values when halfpel generation
-    *  for the entire plane is not enabled
-    */
-    UWORD8 *pu1_half_x;
-
     /*
-     * Buffer for holding half_x (1,1/2 - interpolated)
-     * values when halfpel generation
-     *  for the entire plane is not enabled
+     * Temporary buffers to be used for subpel computation
      */
-    UWORD8 *pu1_half_y;
-
-    /*
-     * Buffer for holding half_x (1/2,1/2 - interpolated)
-     * values when halfpel generation
-     *  for the entire plane is not enabled
-     *
-     */
-    UWORD8 *pu1_half_xy;
+    UWORD8 *apu1_subpel_buffs[SUBPEL_BUFF_CNT];
 
     /*
      * Buffer holding best sub pel values
@@ -1866,7 +1909,7 @@ typedef struct
      */
     UWORD32 u4_bst_spel_buf_strd;
 
-} process_ctxt_t;
+};
 
 /**
  ******************************************************************************
@@ -1921,12 +1964,13 @@ typedef struct
 struct _codec_t
 {
     /**
-     * Number of coded pictures
+     * Id of current pic (input order)
      */
-    WORD32 i4_coded_pic_cnt;
+    WORD32 i4_poc;
 
     /**
      * Number of encode frame API calls made
+     * This variable must only be used for context selection [Read only]
      */
     WORD32 i4_encode_api_call_cnt;
 
@@ -1961,12 +2005,6 @@ struct _codec_t
     IV_COLOR_FORMAT_T e_codec_color_format;
 
     /**
-     * source stride
-     * (strides for luma and chroma are the same)
-     */
-    WORD32 i4_src_strd;
-
-    /**
      * recon stride
      * (strides for luma and chroma are the same)
      */
@@ -2305,6 +2343,7 @@ struct _codec_t
      */
     ref_set_t as_ref_set[MAX_DPB_SIZE + MAX_CTXT_SETS];
 
+
     /*
      * Air pic cnt
      * Contains the number of pictures that have been encoded with air
@@ -2319,12 +2358,16 @@ struct _codec_t
     UWORD16 *pu2_intr_rfrsh_map;
 
     /*
-     * Alternate reference frames
      * Indicates if the current frame is used as a reference frame
      */
     UWORD32 u4_is_curr_frm_ref;
 
     /*
+     * Indicates if there can be non reference frames in the stream
+     */
+    WORD32 i4_non_ref_frames_in_stream;
+
+    /*
      * Memory for color space conversion for luma plane
      */
     UWORD8 *pu1_y_csc_buf_base;
@@ -2510,6 +2553,18 @@ struct _codec_t
     ime_compute_sad_ft *apf_compute_sad_16x16[2];
     ime_compute_sad_ft *pf_compute_sad_16x8;
 
+
+    /**
+     * Function pointer for computing ME
+     * 1 for PSLICE and 1 for BSLICE
+     */
+    ih264e_compute_me_ft *apf_compute_me[2];
+
+    /**
+     * Function pointers for computing SKIP parameters
+     */
+    ih264e_skip_params_ft *apf_find_skip_params_me[2];
+
     /**
      * fn ptrs for memory handling operations
      */
@@ -2545,8 +2600,7 @@ struct _codec_t
     /**
      * write mb layer for a given slice I, P, B
      */
-    IH264E_ERROR_T (*pf_write_mb_syntax_layer[3]) ( entropy_ctxt_t *ps_ent_ctxt );
-
+    IH264E_ERROR_T (*pf_write_mb_syntax_layer[2][3]) ( entropy_ctxt_t *ps_ent_ctxt );
 
     /**
      * Output buffer
@@ -2562,5 +2616,22 @@ struct _codec_t
      * rate control context
      */
     rate_control_ctxt_t s_rate_control;
+
+    /**
+     * input buffer queue
+     */
+    inp_buf_t as_inp_list[MAX_NUM_BFRAMES];
+
+    /**
+     * IDR flags for each input
+     */
+    WORD32 i4_idr_inp_list[MAX_NUM_BFRAMES];
+
+    /*
+    *Flag to indicate if we have received the last input frame
+    */
+    WORD32 i4_last_inp_buff_received;
+
 };
+
 #endif /* IH264E_STRUCTS_H_ */
diff --git a/encoder/ih264e_time_stamp.c b/encoder/ih264e_time_stamp.c
index a6a7f3c..cd829b5 100644
--- a/encoder/ih264e_time_stamp.c
+++ b/encoder/ih264e_time_stamp.c
@@ -67,6 +67,7 @@
 #include "ih264_defs.h"
 #include "ih264e_defs.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
@@ -78,6 +79,8 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_rc_mem_interface.h"
 #include "ih264e_time_stamp.h"
@@ -221,7 +224,7 @@ WORD32 ih264e_frame_time_get_init_free_memtab(frame_time_handle *pps_frame_time,
                                               ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0;
-    static frame_time_t s_temp_frame_time_t;
+    frame_time_t s_temp_frame_time_t;
 
     /* Hack for al alloc, during which we dont have any state memory.
      Dereferencing can cause issues */
@@ -404,7 +407,7 @@ WORD32 ih264e_time_stamp_get_init_free_memtab(time_stamp_handle *pps_time_stamp,
                                               ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0;
-    static time_stamp_t s_temp_time_stamp_t;
+    time_stamp_t s_temp_time_stamp_t;
 
     /* Hack for al alloc, during which we dont have any state memory.
      Dereferencing can cause issues */
diff --git a/encoder/ih264e_utils.c b/encoder/ih264e_utils.c
index 3657f33..b339143 100644
--- a/encoder/ih264e_utils.c
+++ b/encoder/ih264e_utils.c
@@ -68,8 +68,8 @@
 #include "ih264_defs.h"
 #include "ih264_size_defs.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -78,6 +78,7 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
+#include "ih264_cabac_tables.h"
 #include "ih264_macros.h"
 #include "ih264_common_tables.h"
 #include "ih264_debug.h"
@@ -91,7 +92,9 @@
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
+#include "ih264e_cabac.h"
 #include "ih264e_utils.h"
 #include "ih264e_config.h"
 #include "ih264e_statistics.h"
@@ -99,9 +102,7 @@
 #include "ih264_list.h"
 #include "ih264e_encode_header.h"
 #include "ih264e_me.h"
-#include "ime_defs.h"
 #include "ime.h"
-#include "ih264e_rate_control.h"
 #include "ih264e_core_coding.h"
 #include "ih264e_rc_mem_interface.h"
 #include "ih264e_time_stamp.h"
@@ -116,6 +117,246 @@
 /*****************************************************************************/
 
 /**
+ *******************************************************************************
+ *
+ * @brief
+ *  Queues the current buffer, gets back another buffer for encoding with the correct
+ *  picture type
+ *
+ * @par Description:
+ *      This function performs 4 distinct but related functions.
+ *      1) Maintains an input queue [Note that the term queue does not imply
+ *         first-in first-out logic here] that queues inputs and dequeues them so
+ *         that input frames can be encoded in any predetermined encoding order
+ *      2) Uses RC library to decide which frame must be encoded in current pass
+ *         and which picture type it must be encoded to.
+ *      3) Uses RC library to decide the QP at which current frame has to be
+ *         encoded
+ *      4) Determines if the current picture must be encoded or not based on
+ *         PRE-ENC skip
+ *
+ *     Input queue is used for storing input buffers till they are used for
+ *     encoding. This queue is maintained at ps_codec->as_inp_list. Whenever a
+ *     valid input comes, it is added to the end of queue. This same input is
+ *     added to RC queue using the identifier as ps_codec->i4_pic_cnt. Hence any
+ *     pic from RC can be located in the input queue easily.
+ *
+ *     The dequeue operation does not start till we have ps_codec->s_cfg.u4_num_bframes
+ *     frames in the queue. This is done in order to ensure that once output starts
+ *     we will have a constant stream of output with no gaps.
+ *
+ *     The output frame order is governed by RC library. Whenever we dequeue a
+ *     buffer from RC library, it ensures that we will get them in encoding order.
+ *     With the output of RC library, we can use the picture id to dequeue the
+ *     corresponding buffer from input queue and encode it.
+ *
+ *     Condition at the end of stream.
+ *     -------------------------------
+ *      At the last valid buffer from the app, we will get ps_ive_ip->u4_is_last
+ *      to be set. This will be given to the lib when the appropriate input buffer
+ *      is given for encoding.
+ *
+ *      Since the output is not in sync with the input, we will have frames to
+ *      encode even after we receive the last valid input buffer. Hence we have to
+ *      make sure that we do not queue any new buffers once we get the flag [It may
+ *      mess up GOP ?]. This is achieved by setting ps_codec->i4_last_inp_buff_received
+ *      to act as a permanent marker for last frame received [This may not be needed,
+ *      because in our current app, all buffers after the last are marked as last.
+ *      But can we rely on that?]. Hence after this flag is set no new buffers are
+ *      queued.
+ *
+ * @param[in] ps_codec
+ *   Pointer to codec descriptor
+ *
+ * @param[in] ps_ive_ip
+ *   Current input buffer to the encoder
+ *
+ * @param[out] ps_enc_buff
+ *   Buffer to be encoded in the current pass
+ *
+ * @returns
+ *   Flag indicating if we have a pre-enc skip or not
+ *
+ * @remarks
+ * TODO (bpic)
+ *  The check for null and is-last is redundant.
+ *  Need to see if we can remove it
+ *
+ *******************************************************************************
+ */
+WORD32 ih264e_input_queue_update(codec_t *ps_codec,
+                                 ive_video_encode_ip_t *ps_ive_ip,
+                                 inp_buf_t *ps_enc_buff)
+{
+
+    inp_buf_t *ps_inp_buf;
+    picture_type_e e_pictype;
+    WORD32 i4_skip;
+    UWORD32 ctxt_sel, u4_pic_id, u4_pic_disp_id;
+    UWORD8 u1_frame_qp;
+    UWORD32 max_frame_bits = 0x7FFFFFFF;
+
+    /* Latch the end-of-stream marker once the app flags its last input */
+    if (ps_ive_ip->u4_is_last == 1)
+    {
+        ps_codec->i4_last_inp_buff_received = 1;
+    }
+
+    if (ps_ive_ip->s_inp_buf.apv_bufs[0] == NULL
+                    && !ps_codec->i4_last_inp_buff_received)
+    {
+        ps_enc_buff->s_raw_buf.apv_bufs[0] = NULL;
+        return 0;
+    }
+
+    /***************************************************************************
+     * Check for pre enc skip
+     *   When src and target frame rates do not match, we skip some frames to
+     *   maintain the relationship between them
+     **************************************************************************/
+    {
+        WORD32 skip_src;
+
+        skip_src = ih264e_update_rc_framerates(
+                        ps_codec->s_rate_control.pps_rate_control_api,
+                        ps_codec->s_rate_control.pps_pd_frm_rate,
+                        ps_codec->s_rate_control.pps_time_stamp,
+                        ps_codec->s_rate_control.pps_frame_time);
+
+        if (skip_src) return 1;
+    }
+
+    /***************************************************************************
+     * Add the input to the queue (slot = i4_pic_cnt % MAX_NUM_BFRAMES)
+     **************************************************************************/
+    ps_inp_buf = &(ps_codec->as_inp_list[ps_codec->i4_pic_cnt
+                                         % MAX_NUM_BFRAMES]);
+
+    /* copy input info. to internal structure */
+    ps_inp_buf->s_raw_buf = ps_ive_ip->s_inp_buf;
+    ps_inp_buf->u4_timestamp_low = ps_ive_ip->u4_timestamp_low;
+    ps_inp_buf->u4_timestamp_high = ps_ive_ip->u4_timestamp_high;
+    ps_inp_buf->u4_is_last = ps_ive_ip->u4_is_last;
+    ps_inp_buf->pv_mb_info = ps_ive_ip->pv_mb_info;
+    ps_inp_buf->u4_mb_info_type = ps_ive_ip->u4_mb_info_type;
+    ps_inp_buf->pv_pic_info = ps_ive_ip->pv_pic_info;
+    ps_inp_buf->u4_pic_info_type = ps_ive_ip->u4_pic_info_type;
+
+    /***************************************************************************
+     * Now we should add the picture to RC stack here
+     **************************************************************************/
+    /*
+     * If an I frame has been requested, ask RC to force it.
+     * For IDR requests, we have to ask RC to force I and set IDR ourselves
+     * since RC does not know about IDR. For forcing an IDR at dequeue stage we
+     * should record that an IDR has been requested somewhere. Hence we will
+     * store it in the i4_idr_inp_list at a position same as that of input frame
+     */
+    {
+        WORD32 i4_force_idr, i4_force_i;
+
+        i4_force_idr = (ps_codec->force_curr_frame_type == IV_IDR_FRAME);
+        i4_force_idr |= !(ps_codec->i4_pic_cnt % ps_codec->s_cfg.u4_idr_frm_interval);
+
+        i4_force_i = (ps_codec->force_curr_frame_type == IV_I_FRAME);
+
+        ps_codec->i4_idr_inp_list[ps_codec->i4_pic_cnt % MAX_NUM_BFRAMES] = i4_force_idr;
+
+        if ((ps_codec->i4_frame_num > 0) && (i4_force_idr || i4_force_i))
+        {
+            irc_force_I_frame(ps_codec->s_rate_control.pps_rate_control_api);
+        }
+        ps_codec->force_curr_frame_type = IV_NA_FRAME;
+    }
+
+    irc_add_picture_to_stack(ps_codec->s_rate_control.pps_rate_control_api,
+                             ps_codec->i4_pic_cnt);
+
+
+    /* Delay: hand back no buffer while the API call count is below u4_num_bframes */
+    if (ps_codec->i4_encode_api_call_cnt
+                    < (WORD32)(ps_codec->s_cfg.u4_num_bframes))
+    {
+        ps_enc_buff->s_raw_buf.apv_bufs[0] = NULL;
+        return 0;
+    }
+
+    /***************************************************************************
+     * Get a new pic to encode
+     **************************************************************************/
+    /* Query the picture type, picture id and display order number from RC */
+    e_pictype = ih264e_rc_get_picture_details(
+                    ps_codec->s_rate_control.pps_rate_control_api, (WORD32 *)(&u4_pic_id),
+                    (WORD32 *)(&u4_pic_disp_id));
+
+    switch (e_pictype)
+    {
+        case I_PIC:
+            ps_codec->pic_type = PIC_I;
+            break;
+        case P_PIC:
+            ps_codec->pic_type = PIC_P;
+            break;
+        case B_PIC:
+            ps_codec->pic_type = PIC_B;
+            break;
+        default:
+            ps_codec->pic_type = PIC_NA;
+            ps_enc_buff->s_raw_buf.apv_bufs[0] = NULL;
+            return 0;
+    }
+
+    /* Set IDR if it was recorded for this picture at queue time; then clear the slot */
+    ps_codec->pic_type = ps_codec->i4_idr_inp_list[u4_pic_id % MAX_NUM_BFRAMES] ?
+                                    PIC_IDR : ps_codec->pic_type;
+    ps_codec->i4_idr_inp_list[u4_pic_id % MAX_NUM_BFRAMES] = 0;
+
+
+
+    /* Get current frame Qp from RC and map it through gau1_mpeg2_to_h264_qmap */
+    u1_frame_qp = (UWORD8)irc_get_frame_level_qp(
+                    ps_codec->s_rate_control.pps_rate_control_api, e_pictype,
+                    max_frame_bits);
+    ps_codec->u4_frame_qp = gau1_mpeg2_to_h264_qmap[u1_frame_qp];
+
+    /*
+     * copy the pic id to poc because the display order is assumed to be same
+     * as input order
+     */
+    ps_codec->i4_poc = u4_pic_id;
+
+    /***************************************************************************
+     * Now retrieve the correct picture from the queue
+     **************************************************************************/
+
+    /* Record "no pre-enc skip" for the context set selected by the API call count */
+    i4_skip = 0;
+    ctxt_sel = ps_codec->i4_encode_api_call_cnt % MAX_CTXT_SETS;
+    ps_codec->s_rate_control.pre_encode_skip[ctxt_sel] = i4_skip;
+
+    /* Get the queued buffer that corresponds to the picture id returned by RC */
+    ps_inp_buf = &(ps_codec->as_inp_list[u4_pic_id % MAX_NUM_BFRAMES]);
+
+    /* copy dequeued input to output */
+    ps_enc_buff->s_raw_buf = ps_inp_buf->s_raw_buf;
+    ps_enc_buff->u4_timestamp_low = ps_inp_buf->u4_timestamp_low;
+    ps_enc_buff->u4_timestamp_high = ps_inp_buf->u4_timestamp_high;
+    ps_enc_buff->u4_is_last = ps_inp_buf->u4_is_last;
+    ps_enc_buff->pv_mb_info = ps_inp_buf->pv_mb_info;
+    ps_enc_buff->u4_mb_info_type = ps_inp_buf->u4_mb_info_type;
+    ps_enc_buff->pv_pic_info = ps_inp_buf->pv_pic_info;
+    ps_enc_buff->u4_pic_info_type = ps_inp_buf->u4_pic_info_type;
+
+    if (ps_enc_buff->u4_is_last)
+    {
+        ps_codec->pic_type = PIC_NA;
+    }
+
+    /* Not a pre-enc skip: report 0 to the caller */
+    return (0);
+}
+
+/**
 *******************************************************************************
 *
 * @brief
@@ -134,13 +375,15 @@
 *
 *******************************************************************************
 */
-WORD32 ih264e_get_min_level(WORD32 pic_size)
+WORD32 ih264e_get_min_level(WORD32 wd, WORD32 ht)
 {
     WORD32 lvl_idx = MAX_LEVEL, i;
-
+    WORD32 pic_size = wd * ht;
+    WORD32 max = MAX(wd, ht);
     for (i = 0; i < MAX_LEVEL; i++)
     {
-        if (pic_size <= gai4_ih264_max_luma_pic_size[i])
+        if ((pic_size <= gai4_ih264_max_luma_pic_size[i]) &&
+            (max <= gai4_ih264_max_wd_ht[i]))
         {
             lvl_idx = i;
             break;
@@ -331,7 +574,7 @@ WORD32 ih264e_get_total_pic_buf_size(WORD32 pic_size,
     WORD32 num_samples;
     WORD32 max_num_bufs;
     WORD32 pad = MAX(horz_pad, vert_pad);
-    UNUSED(pic_size);
+
     /*
      * If num_ref_frames and num_reorder_frmaes is specified
      * Use minimum value
@@ -343,6 +586,7 @@ WORD32 ih264e_get_total_pic_buf_size(WORD32 pic_size,
 
     /* Maximum number of luma samples in a picture at given level */
     num_luma_samples = gai4_ih264_max_luma_pic_size[lvl_idx];
+    num_luma_samples = MAX(num_luma_samples, pic_size);
 
     /* Account for chroma */
     num_samples = num_luma_samples * 3 / 2;
@@ -403,7 +647,7 @@ WORD32 ih264e_get_pic_mv_bank_size(WORD32 num_luma_samples)
     WORD32 mv_bank_size = 0;
 
     /* number of sub mb partitions possible */
-    WORD32 num_pu = num_luma_samples / (MIN_PU_SIZE * MIN_PU_SIZE);
+    WORD32 num_pu = num_luma_samples / (ENC_MIN_PU_SIZE * ENC_MIN_PU_SIZE);
 
     /* number of mbs */
     WORD32 num_mb = num_luma_samples / (MB_SIZE * MB_SIZE);
@@ -413,10 +657,10 @@ WORD32 ih264e_get_pic_mv_bank_size(WORD32 num_luma_samples)
     mv_bank_size += num_mb * sizeof(WORD32);
 
     /* Size for pu_map */
-    mv_bank_size += num_pu;
+    mv_bank_size += ALIGN4(num_pu);
 
     /* Size for storing enc_pu_t for each PU */
-    mv_bank_size += num_pu * sizeof(enc_pu_t);
+    mv_bank_size += ALIGN4(num_pu * sizeof(enc_pu_t));
 
     return mv_bank_size;
 }
@@ -547,7 +791,7 @@ IH264E_ERROR_T ih264e_mv_buf_mgr_add_bufs(codec_t *ps_codec)
 
     /* num of luma samples */
     WORD32 num_luma_samples = ALIGN16(ps_codec->s_cfg.u4_wd)
-                    * ALIGN16(ps_codec->s_cfg.u4_ht);
+                            * ALIGN16(ps_codec->s_cfg.u4_ht);
 
     /* number of mb's & frame partitions */
     WORD32 num_pu, num_mb;
@@ -573,7 +817,7 @@ IH264E_ERROR_T ih264e_mv_buf_mgr_add_bufs(codec_t *ps_codec)
     /* compute MV bank size per picture */
     pic_mv_bank_size = ih264e_get_pic_mv_bank_size(num_luma_samples);
 
-    num_pu = num_luma_samples / (MIN_PU_SIZE * MIN_PU_SIZE);
+    num_pu = num_luma_samples / (ENC_MIN_PU_SIZE * ENC_MIN_PU_SIZE);
     num_mb = num_luma_samples / (MB_SIZE * MB_SIZE);
     i = 0;
     ps_mv_buf = ps_codec->pv_mv_bank_buf_base;
@@ -592,11 +836,13 @@ IH264E_ERROR_T ih264e_mv_buf_mgr_add_bufs(codec_t *ps_codec)
         }
 
         ps_mv_buf->pu4_mb_pu_cnt = (UWORD32 *) pu1_buf;
+        pu1_buf += num_mb * sizeof(WORD32);
 
-        ps_mv_buf->pu1_pic_pu_map = (pu1_buf + num_mb * sizeof(WORD32));
+        ps_mv_buf->pu1_pic_pu_map = pu1_buf;
+        pu1_buf += ALIGN4(num_pu);
 
-        ps_mv_buf->ps_pic_pu = (enc_pu_t *) (pu1_buf + num_mb * sizeof(WORD32)
-                        + num_pu);
+        ps_mv_buf->ps_pic_pu = (enc_pu_t *) (pu1_buf);
+        pu1_buf += ALIGN4(num_pu * sizeof(enc_pu_t));
 
         ret = ih264_buf_mgr_add((buf_mgr_t *) ps_codec->pv_mv_buf_mgr,
                                 ps_mv_buf, i);
@@ -608,7 +854,6 @@ IH264E_ERROR_T ih264e_mv_buf_mgr_add_bufs(codec_t *ps_codec)
             return error_status;
         }
 
-        pu1_buf += pic_mv_bank_size;
         ps_mv_buf++;
         i++;
     }
@@ -1002,14 +1247,12 @@ IH264E_ERROR_T ih264e_codec_init(codec_t *ps_codec)
                        ps_codec->s_cfg.u4_target_bitrate,
                        ps_codec->s_cfg.u4_max_bitrate,
                        ps_codec->s_cfg.u4_vbv_buffer_delay,
-                       ps_codec->s_cfg.u4_i_frm_interval, au1_init_qp,
-                       H264_ALLOC_INTER_FRM_INTV, au1_min_max_qp,
+                       ps_codec->s_cfg.u4_i_frm_interval,
+                       ps_codec->s_cfg.u4_num_bframes + 1, au1_init_qp,
+                       ps_codec->s_cfg.u4_num_bframes + 2 , au1_min_max_qp,
                        ps_codec->s_cfg.u4_max_level);
     }
 
-    /* src stride */
-    ps_codec->i4_src_strd = ps_codec->s_cfg.u4_strd;
-
     /* recon stride */
     ps_codec->i4_rec_strd = ALIGN16(ps_codec->s_cfg.u4_max_wd) + PAD_WD;
 
@@ -1020,6 +1263,11 @@ IH264E_ERROR_T ih264e_codec_init(codec_t *ps_codec)
 
     DEBUG_HISTOGRAM_INIT();
 
+
+    /* Init dependency vars */
+    ps_codec->i4_last_inp_buff_received = 0;
+
+
     return IH264E_SUCCESS;
 }
 
@@ -1067,7 +1315,8 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
     UWORD8 *pu1_cur_pic_luma, *pu1_cur_pic_chroma;
 
     /* ref buffer set */
-    pic_buf_t *ps_ref_pic;
+    pic_buf_t *aps_ref_pic[MAX_REF_PIC_CNT] = {NULL, NULL};
+    mv_buf_t *aps_mv_buf[MAX_REF_PIC_CNT] = {NULL, NULL};
     WORD32 ref_set_id;
 
     /* pic time stamp */
@@ -1075,14 +1324,11 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
     UWORD32 u4_timestamp_low = ps_inp_buf->u4_timestamp_low;
 
     /* indices to access curr/prev frame info */
-    WORD32 ctxt_sel = ps_codec->i4_encode_api_call_cnt & 1;
+    WORD32 ctxt_sel = ps_codec->i4_encode_api_call_cnt % MAX_CTXT_SETS;
 
     /* curr pic type */
     PIC_TYPE_T *pic_type = &ps_codec->pic_type;
 
-    /* should src be skipped */
-    WORD32 *skip_src = &ps_codec->s_rate_control.pre_encode_skip[ctxt_sel];
-
     /* Diamond search Iteration Max Cnt */
     UWORD32 u4_num_layers =
                     (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FASTEST) ?
@@ -1094,62 +1340,46 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
     /********************************************************************/
     /*                     INITIALIZE CODEC CONTEXT                     */
     /********************************************************************/
-
-    /* pre enc rc call */
-    *skip_src = ih264e_set_rc_pic_params(ps_codec,
-                                         ps_codec->i4_encode_api_call_cnt,
-                                         (WORD32 *) pic_type);
-    if (*skip_src == 1)
+    /* slice_type */
+    if ((PIC_I == *pic_type) || (PIC_IDR == *pic_type))
     {
-        ps_codec->as_process[ctxt_sel * MAX_PROCESS_THREADS].s_inp_buf =
-                        *ps_inp_buf;
-
-        /* inform output bytes generated as zero */
-        ps_codec->as_out_buf[ctxt_sel].s_bits_buf.u4_bytes = 0;
-
-        return error_status;
+        ps_codec->i4_slice_type = ISLICE;
     }
-
-    /********************************************************************/
-    /*                     Alternate reference frame                    */
-    /********************************************************************/
-    if (ps_codec->s_cfg.u4_enable_alt_ref)
+    else if (PIC_P == *pic_type)
     {
-        if (PIC_IDR == *pic_type || PIC_I == *pic_type)
-        {
-            ps_codec->u4_is_curr_frm_ref = 1;
-        }
-        else
-        {
-            ps_codec->u4_is_curr_frm_ref = 1;
-                if(ps_codec->i4_encode_api_call_cnt % (ps_codec->s_cfg.u4_enable_alt_ref + 1))
-                    ps_codec->u4_is_curr_frm_ref = 0;
-            }
-
-        if ((ps_codec->u4_is_curr_frm_ref == 1) || (ps_codec->i4_frame_num < 0))
-        {
-            ps_codec->i4_frame_num++;
-        }
+        ps_codec->i4_slice_type = PSLICE;
     }
-    else
+    else if(PIC_B == *pic_type)
     {
-        ps_codec->u4_is_curr_frm_ref = 1;
-
-        ps_codec->i4_frame_num++;
+        ps_codec->i4_slice_type = BSLICE;
     }
 
-    /* slice_type */
-    ps_codec->i4_slice_type = PSLICE;
 
-    if ((PIC_I == *pic_type) || (PIC_IDR == *pic_type))
+    /***************************************************************************
+     * Set up variables for sending frame number, poc and reference
+     *   a) Set up alt ref too
+     **************************************************************************/
+
+    /* Check and set if the current frame is reference or not */
+    ps_codec->u4_is_curr_frm_ref = 0;
+
+    /* This frame is a reference if it is not a B pic, pending approval from alt ref */
+    ps_codec->u4_is_curr_frm_ref = (*pic_type != PIC_B);
+
+    /* If it is a P pic, the decision also depends on alt ref */
+    if (ps_codec->s_cfg.u4_enable_alt_ref && (*pic_type == PIC_P)
+                    && (ps_codec->i4_pic_cnt
+                                    % (ps_codec->s_cfg.u4_enable_alt_ref + 1)))
     {
-        ps_codec->i4_slice_type = ISLICE;
-    }
-    else if (PIC_P == *pic_type)
-    {
-        ps_codec->i4_slice_type = PSLICE;
+        ps_codec->u4_is_curr_frm_ref = 0;
     }
 
+    /*
+     * Override everything in case of IDR
+     * Note that in case of IDR, at this point ps_codec->u4_is_curr_frm_ref must
+     * be 1
+     */
+
     /* is this an IDR pic */
     ps_codec->u4_is_idr = 0;
 
@@ -1165,6 +1395,10 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
         ps_codec->i4_idr_pic_id++;
     }
 
+    /***************************************************************************
+     * Set up Deblock
+     **************************************************************************/
+
     /* set deblock disable flags based on disable deblock level */
     ps_codec->i4_disable_deblk_pic = 1;
 
@@ -1235,93 +1469,132 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
         ih264e_populate_pps(ps_codec, ps_pps);
     }
 
-    /* Reference and MV bank Buffer Manager */
+    /***************************************************************************
+     *  Reference and MV bank Buffer Manager
+     *  Here we will
+     *      1) Find the correct ref pics for the current frame
+     *      2) Free the ref pic that is not going to be used anywhere
+     *      3) Find a free buff from the list and assign it as the recon of
+     *         current frame
+     *
+     *  1) Finding correct ref pic
+     *      All pics needed for future are arranged in a picture list called
+     *      ps_codec->as_ref_set. Each picture in this will have a pic buffer and
+     *      MV buffer that is marked appropriately as BUF_MGR_REF, BUF_MGR_IO or
+     *      BUF_MGR_CODEC. Also the pic_cnt and poc will also be present.
+     *      Hence to find the ref pic we will loop through the list and find
+     *      2 pictures with maximum i4_pic_cnt .
+     *
+     *      note that i4_pic_cnt == -1 is used to filter uninit ref pics.
+     *      Now since we only have max two ref pics, we will always find max 2
+     *      ref pics.
+
+     *
+     *  2) 3) Self explanatory
+     ***************************************************************************/
     {
-        /* min pic cnt among the list of pics stored in ref list */
-        WORD32 min_pic_cnt;
+        /* Search for buffs with maximum pic cnt */
 
-        /* max pic cnt among the list of pics stored in ref list */
-        WORD32 max_pic_cnt;
+        WORD32 max_pic_cnt[] = { -1, -1 };
 
-        /* temp var */
-        WORD32 i;
+        mv_buf_t *ps_mv_buf_to_free[] = { NULL, NULL };
 
-        ps_ref_pic = NULL;
+        /* temp var */
+        WORD32 i, buf_status;
 
-        /* get reference picture when necessary */
-        /* Only nearest picture encoded (max pic cnt) is used as reference */
-        if ((*pic_type != PIC_IDR) && (*pic_type != PIC_I))
+        for (i = 0; i < ps_codec->i4_ref_buf_cnt; i++)
         {
-            max_pic_cnt = ps_codec->as_ref_set[0].i4_pic_cnt;
+            if (ps_codec->as_ref_set[i].i4_pic_cnt == -1)
+                continue;
+
+            buf_status = ih264_buf_mgr_get_status(
+                            ps_codec->pv_ref_buf_mgr,
+                            ps_codec->as_ref_set[i].ps_pic_buf->i4_buf_id);
+
+            /* Ideally we should check the buffer status of the MV buffer too. But
+             * since the corresponding MV buffers will be in the same state, it does
+             * not matter for now; adding that check would still be more robust. */
+            if ((max_pic_cnt[0] < ps_codec->as_ref_set[i].i4_pic_cnt)
+                            && (buf_status & BUF_MGR_REF))
+            {
+                if (max_pic_cnt[1] < ps_codec->as_ref_set[i].i4_pic_cnt)
+                {
+                    max_pic_cnt[0] = max_pic_cnt[1];
+                    aps_ref_pic[0] = aps_ref_pic[1];
+                    aps_mv_buf[0] = aps_mv_buf[1];
 
-            ps_ref_pic = ps_codec->as_ref_set[0].ps_pic_buf;
+                    ps_mv_buf_to_free[0] = ps_mv_buf_to_free[1];
 
-            /* loop through to get the max pic cnt among the list of pics stored in ref list */
-            for (i = 1; i < ps_codec->i4_ref_buf_cnt; i++)
-            {
-                if (max_pic_cnt < ps_codec->as_ref_set[i].i4_pic_cnt)
+                    max_pic_cnt[1] = ps_codec->as_ref_set[i].i4_pic_cnt;
+                    aps_ref_pic[1] = ps_codec->as_ref_set[i].ps_pic_buf;
+                    aps_mv_buf[1] = ps_codec->as_ref_set[i].ps_mv_buf;
+                    ps_mv_buf_to_free[1] = ps_codec->as_ref_set[i].ps_mv_buf;
+
+                }
+                else
                 {
-                    max_pic_cnt = ps_codec->as_ref_set[i].i4_pic_cnt;
-                    ps_ref_pic = ps_codec->as_ref_set[i].ps_pic_buf;
+                    max_pic_cnt[0] = ps_codec->as_ref_set[i].i4_pic_cnt;
+                    aps_ref_pic[0] = ps_codec->as_ref_set[i].ps_pic_buf;
+                    aps_mv_buf[0] = ps_codec->as_ref_set[i].ps_mv_buf;
+                    ps_mv_buf_to_free[0] = ps_codec->as_ref_set[i].ps_mv_buf;
                 }
             }
         }
 
-        /* get a location at which the curr pic info can be stored for future reference */
-        ref_set_id = -1;
-
-        for (i = 0; i < ps_codec->i4_ref_buf_cnt; i++)
+        /*
+         * Now if the current picture is I or P, we discard the back ref pic and
+         * assign forward ref as backward ref
+         */
+        if (*pic_type != PIC_B)
         {
-            if (-1 == ps_codec->as_ref_set[i].i4_pic_cnt)
+            if (ps_mv_buf_to_free[0])
             {
-                ref_set_id = i;
-                break;
-            }
-        }
+                /* release this frame from reference list */
+                ih264_buf_mgr_release(ps_codec->pv_mv_buf_mgr,
+                                      ps_mv_buf_to_free[0]->i4_buf_id,
+                                      BUF_MGR_REF);
 
-        /* If all the entries in the ref_set array are filled, then remove the entry with least pic_cnt */
-        if (ref_set_id == -1)
-        {
-            /* pic info */
-            pic_buf_t *ps_cur_pic;
-
-            /* mv info */
-            mv_buf_t *ps_cur_mv_buf;
-
-            ref_set_id = 0;
-            min_pic_cnt = ps_codec->as_ref_set[0].i4_pic_cnt;
-
-            /* loop through to get the min pic cnt among the list of pics stored in ref list */
-            for (i = 1; i < ps_codec->i4_ref_buf_cnt; i++)
-            {
-                if (min_pic_cnt > ps_codec->as_ref_set[i].i4_pic_cnt)
-                {
-                    min_pic_cnt = ps_codec->as_ref_set[i].i4_pic_cnt;
-                    ref_set_id = i;
-                }
+                ih264_buf_mgr_release(ps_codec->pv_ref_buf_mgr,
+                                      aps_ref_pic[0]->i4_buf_id, BUF_MGR_REF);
             }
 
-            ps_cur_pic = ps_codec->as_ref_set[ref_set_id].ps_pic_buf;
-
-            ps_cur_mv_buf = ps_codec->as_ref_set[ref_set_id].ps_mv_buf;
+            max_pic_cnt[0] = max_pic_cnt[1];
+            aps_ref_pic[0] = aps_ref_pic[1];
+            aps_mv_buf[0] = aps_mv_buf[1];
 
-            /* release this frame from reference list */
-            ih264_buf_mgr_release(ps_codec->pv_mv_buf_mgr,
-                                  ps_cur_mv_buf->i4_buf_id, BUF_MGR_REF);
-
-            ih264_buf_mgr_release(ps_codec->pv_ref_buf_mgr,
-                                  ps_cur_pic->i4_buf_id, BUF_MGR_REF);
+            /* Dummy */
+            max_pic_cnt[1] = -1;
         }
 
-        if (ps_codec->s_cfg.u4_enable_recon)
+        /*
+         * Mark all reference pics with unused buffers as free.
+         * We need this step since each user (ref, recon, io etc.) only unsets its
+         * own flag. Hence we need to combine them together and mark the ref set
+         * accordingly
+         */
+        ref_set_id = -1;
+        for (i = 0; i < ps_codec->i4_ref_buf_cnt; i++)
         {
-            ret = ih264_buf_mgr_check_free((buf_mgr_t *)ps_codec->pv_ref_buf_mgr);
+            if (ps_codec->as_ref_set[i].i4_pic_cnt == -1)
+            {
+                ref_set_id = i;
+                continue;
+            }
 
-            if (ret != IH264_SUCCESS)
+            buf_status = ih264_buf_mgr_get_status(
+                            ps_codec->pv_ref_buf_mgr,
+                            ps_codec->as_ref_set[i].ps_pic_buf->i4_buf_id);
+
+            if ((buf_status & (BUF_MGR_REF | BUF_MGR_CODEC | BUF_MGR_IO)) == 0)
             {
-                return IH264E_NO_FREE_RECONBUF;
+                ps_codec->as_ref_set[i].i4_pic_cnt = -1;
+                ps_codec->as_ref_set[i].i4_poc = 32768;
+
+                ref_set_id = i;
             }
         }
+        /* An assert failure here means we do not have any free buffs */
+        ASSERT(ref_set_id >= 0);
     }
 
     {
@@ -1353,7 +1626,6 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
          * and getting a buffer id to free
          */
         ps_mv_buf->i4_abs_poc = ps_codec->i4_abs_pic_order_cnt;
-
         ps_mv_buf->i4_buf_id = cur_mv_bank_buf_id;
     }
 
@@ -1375,7 +1647,7 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
         }
 
         /* mark the buffer as needed for reference if the curr pic is available for ref */
-        if (1 == ps_codec->u4_is_curr_frm_ref)
+        if (ps_codec->u4_is_curr_frm_ref)
         {
             ih264_buf_mgr_set_status(ps_codec->pv_ref_buf_mgr, cur_pic_buf_id,
                                      BUF_MGR_REF);
@@ -1392,7 +1664,7 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
         ps_cur_pic->u4_timestamp_high = ps_inp_buf->u4_timestamp_high;
         ps_cur_pic->u4_timestamp_low = ps_inp_buf->u4_timestamp_low;
 
-        ps_cur_pic->i4_abs_poc = ps_codec->i4_abs_pic_order_cnt;
+        ps_cur_pic->i4_abs_poc = ps_codec->i4_poc;
         ps_cur_pic->i4_poc_lsb = ps_codec->i4_pic_order_cnt_lsb;
 
         ps_cur_pic->i4_buf_id = cur_pic_buf_id;
@@ -1401,18 +1673,17 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
         pu1_cur_pic_chroma = ps_cur_pic->pu1_chroma;
     }
 
-    /* in case the current picture is used for reference then add it to the reference set */
-    if (ps_codec->u4_is_curr_frm_ref
-                    && ((*pic_type == PIC_IDR) || (*pic_type == PIC_I)
-                                    || (*pic_type == PIC_P)))
+    /*
+     * Add the current picture to ref list independent of the fact that it is used
+     * as reference or not. This is because, now recon is not in sync with output
+     * hence we may need the current recon after some delay. By adding it to ref list
+     * we can retrieve the recon any time we want. The information that it is used
+     * for ref can still be found by checking the buffer status of pic buf.
+     */
     {
         ps_codec->as_ref_set[ref_set_id].i4_pic_cnt = ps_codec->i4_pic_cnt;
-
-        /* TODO: Currently pic_cnt and poc are same - Once frame drops are introduced change appropriately */
-        ps_codec->as_ref_set[ref_set_id].i4_poc = ps_codec->i4_pic_cnt;
-
+        ps_codec->as_ref_set[ref_set_id].i4_poc = ps_codec->i4_poc;
         ps_codec->as_ref_set[ref_set_id].ps_mv_buf = ps_mv_buf;
-
         ps_codec->as_ref_set[ref_set_id].ps_pic_buf = ps_cur_pic;
     }
 
@@ -1463,9 +1734,6 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
             /* chroma rec buffer */
             ps_proc->pu1_rec_buf_chroma_base = pu1_cur_pic_chroma;
 
-            /* src stride */
-            ps_proc->i4_src_strd = ps_codec->i4_src_strd;
-
             /* rec stride */
             ps_proc->i4_rec_strd = ps_codec->i4_rec_strd;
 
@@ -1592,16 +1860,37 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
             /* Pointer to current pictures mv buffers */
             ps_proc->ps_cur_mv_buf = ps_mv_buf;
 
-            /* pointer to ref picture */
-            ps_proc->ps_ref_pic = ps_ref_pic;
+            /*
+             * pointer to ref picture
+             * 0    : Temporal back reference
+             * 1    : Temporal forward reference
+             */
+            ps_proc->aps_ref_pic[PRED_L0] = aps_ref_pic[PRED_L0];
+            ps_proc->aps_ref_pic[PRED_L1] = aps_ref_pic[PRED_L1];
+            if (ps_codec->pic_type == PIC_B)
+            {
+                ps_proc->aps_mv_buf[PRED_L0] = aps_mv_buf[PRED_L0];
+                ps_proc->aps_mv_buf[PRED_L1] = aps_mv_buf[PRED_L1];
+            }
+            else
+            {
+                /*
+                 * Else is dummy since for a non B pic we do not need this.
+                 * But an assignment here will help in not having a segfault
+                 * when we calculate colpic in P slices
+                 */
+                ps_proc->aps_mv_buf[PRED_L0] = ps_mv_buf;
+                ps_proc->aps_mv_buf[PRED_L1] = ps_mv_buf;
+            }
 
             if ((*pic_type != PIC_IDR) && (*pic_type != PIC_I))
             {
-                /* ref pointer luma */
-                ps_proc->pu1_ref_buf_luma_base = ps_ref_pic->pu1_luma;
+                /* temporal back and forward ref pointers, luma and chroma */
+                ps_proc->apu1_ref_buf_luma_base[PRED_L0] = aps_ref_pic[PRED_L0]->pu1_luma;
+                ps_proc->apu1_ref_buf_chroma_base[PRED_L0] = aps_ref_pic[PRED_L0]->pu1_chroma;
 
-                /* ref pointer chroma */
-                ps_proc->pu1_ref_buf_chroma_base = ps_ref_pic->pu1_chroma;
+                ps_proc->apu1_ref_buf_luma_base[PRED_L1] = aps_ref_pic[PRED_L1]->pu1_luma;
+                ps_proc->apu1_ref_buf_chroma_base[PRED_L1] = aps_ref_pic[PRED_L1]->pu1_chroma;
             }
 
             /* Structure for current input buffer */
@@ -1649,6 +1938,9 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
                 /* slice hdr base */
                 ps_entropy->ps_slice_hdr_base = ps_proc->ps_slice_hdr_base;
 
+                /* Abs poc */
+                ps_entropy->i4_abs_pic_order_cnt = ps_proc->ps_codec->i4_poc;
+
                 /* initialize entropy map */
                 if (i == j)
                 {
@@ -1656,6 +1948,9 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
                     memset(ps_entropy->pu1_entropy_map - ps_proc->i4_wd_mbs, 1, ps_proc->i4_wd_mbs);
                     /* row 0 to ht in mbs */
                     memset(ps_entropy->pu1_entropy_map, 0, ps_proc->i4_wd_mbs * ps_proc->i4_ht_mbs);
+
+                    /* initialize cabac tables */
+                    ih264e_init_cabac_table(ps_entropy);
                 }
 
                 /* wd in mbs */
@@ -1720,9 +2015,6 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
                 ps_me_ctxt->ai2_srch_boundaries[1] =
                                 ps_codec->s_cfg.u4_srch_rng_y;
 
-                /* src stride */
-                ps_me_ctxt->i4_src_strd = ps_codec->i4_src_strd;
-
                 /* rec stride */
                 ps_me_ctxt->i4_rec_strd = ps_codec->i4_rec_strd;
 
@@ -1751,7 +2043,7 @@ IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf)
                 /* qp */
                 ps_me_ctxt->u1_mb_qp = ps_codec->u4_frame_qp;
 
-                if ((i == 0) && (0 == ps_codec->i4_pic_cnt))
+                if ((i == j) && (0 == ps_codec->i4_poc))
                 {
                     /* init mv bits tables */
                     ih264e_init_mv_bits(ps_me_ctxt);
diff --git a/encoder/ih264e_utils.h b/encoder/ih264e_utils.h
index 651dad9..27e37e8 100644
--- a/encoder/ih264e_utils.h
+++ b/encoder/ih264e_utils.h
@@ -30,6 +30,7 @@
 *  Harish
 *
 * @par List of Functions:
+*  -ih264e_input_queue_update()
 *  -ih264e_get_min_level()
 *  -ih264e_get_lvl_idx()
 *  -ih264e_get_dpb_size()
@@ -52,6 +53,35 @@
 #define IH264E_UTILS_H_
 
 /**
+ *******************************************************************************
+ *
+ * @brief
+ *  Queues the current buffer, gets back another buffer for encoding with the
+ *  correct picture type
+ *
+ * @par Description:
+ *
+ * @param[in] ps_codec
+ *   Pointer to codec descriptor
+ *
+ * @param[in] ps_ive_ip
+ *   Current input buffer to the encoder
+ *
+ * @param[out] ps_enc_buff
+ *   Buffer to be encoded in the current pass
+ *
+ * @returns
+ *   Flag indicating if we have a pre-enc skip or not
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+WORD32 ih264e_input_queue_update(codec_t *ps_codec,
+                                 ive_video_encode_ip_t *ps_ive_ip,
+                                 inp_buf_t *ps_enc_buff);
+
+/**
 *******************************************************************************
 *
 * @brief
@@ -61,8 +91,11 @@
 *  Gets the minimum level index and then gets corresponding level.
 *  Also used to ignore invalid levels like 2.3, 3.3 etc
 *
-* @param[in] level
-*  Level of the stream
+* @param[in] wd
+*  Width
+*
+* @param[in] ht
+*  Height
 *
 * @returns  Level index for a given level
 *
@@ -70,7 +103,7 @@
 *
 *******************************************************************************
 */
-WORD32 ih264e_get_min_level(WORD32 pic_size);
+WORD32 ih264e_get_min_level(WORD32 wd, WORD32 ht);
 
 /**
 *******************************************************************************
diff --git a/encoder/ih264e_version.c b/encoder/ih264e_version.c
index 3dcba8d..173f17c 100644
--- a/encoder/ih264e_version.c
+++ b/encoder/ih264e_version.c
@@ -86,18 +86,9 @@
 *******************************************************************************
 */
 #define VERSION(version_string, codec_name, codec_release_type, codec_release_ver, codec_vendor)    \
-    strncpy(version_string,"@(#)Id:", MAX_STRLEN);                                                               \
-    strncat(version_string,codec_name, MAX_STRLEN);                                                              \
-    strncat(version_string,"_", MAX_STRLEN);                                                                     \
-    strncat(version_string,codec_release_type, MAX_STRLEN);                                                      \
-    strncat(version_string," Ver:", MAX_STRLEN);                                                                 \
-    strncat(version_string,codec_release_ver, MAX_STRLEN);                                                       \
-    strncat(version_string," Released by ", MAX_STRLEN);                                                         \
-    strncat(version_string,codec_vendor, MAX_STRLEN);                                                            \
-    strncat(version_string," Build: ", MAX_STRLEN);                                                              \
-    strncat(version_string,__DATE__, MAX_STRLEN);                                                                \
-    strncat(version_string," @ ", MAX_STRLEN);                                                                   \
-    strncat(version_string,__TIME__, MAX_STRLEN);
+    snprintf(version_string, MAX_STRLEN,                                                            \
+             "@(#)Id:%s_%s Ver:%s Released by %s Build: %s @ %s",                                   \
+             codec_name, codec_release_type, codec_release_ver, codec_vendor, __DATE__, __TIME__)
 
 /*****************************************************************************/
 /* Function Definitions                                                      */
@@ -131,9 +122,9 @@ IV_STATUS_T ih264e_get_version(CHAR *pc_version, UWORD32 u4_version_bufsize)
     VERSION(ac_version_tmp, CODEC_NAME, CODEC_RELEASE_TYPE, CODEC_RELEASE_VER,
             CODEC_VENDOR);
 
-    if (u4_version_bufsize >= (strnlen(ac_version_tmp, MAX_STRLEN) + 1))
+    if (u4_version_bufsize >= (strlen(ac_version_tmp) + 1))
     {
-        memcpy(pc_version, ac_version_tmp, (strnlen(ac_version_tmp, MAX_STRLEN) + 1));
+        memcpy(pc_version, ac_version_tmp, (strlen(ac_version_tmp) + 1));
         return IV_SUCCESS;
     }
     else
diff --git a/encoder/ime.c b/encoder/ime.c
index c89aaab..cfd6e81 100644
--- a/encoder/ime.c
+++ b/encoder/ime.c
@@ -50,10 +50,10 @@
 /* User include files */
 #include "ime_typedefs.h"
 #include "ime_distortion_metrics.h"
-#include "ime_structs.h"
 #include "ime_defs.h"
-#include "ime_macros.h"
+#include "ime_structs.h"
 #include "ime.h"
+#include "ime_macros.h"
 #include "ime_statistics.h"
 
 /**
@@ -87,10 +87,10 @@
 *
 *******************************************************************************
 */
-void ime_diamond_search_16x16(me_ctxt_t *ps_me_ctxt)
+void ime_diamond_search_16x16(me_ctxt_t *ps_me_ctxt, WORD32 i4_reflist)
 {
     /* MB partition info */
-    mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part;
+    mb_part_ctxt *ps_mb_part = &ps_me_ctxt->as_mb_part[i4_reflist];
 
     /* lagrange parameter */
     UWORD32 u4_lambda_motion = ps_me_ctxt->u4_lambda_motion;
@@ -106,7 +106,7 @@ void ime_diamond_search_16x16(me_ctxt_t *ps_me_ctxt)
 
     /* pointer to src macro block */
     UWORD8 *pu1_curr_mb = ps_me_ctxt->pu1_src_buf_luma;
-    UWORD8 *pu1_ref_mb = ps_me_ctxt->pu1_ref_buf_luma;
+    UWORD8 *pu1_ref_mb = ps_me_ctxt->apu1_ref_buf_luma[i4_reflist];
 
     /* strides */
     WORD32 i4_src_strd = ps_me_ctxt->i4_src_strd;
@@ -271,22 +271,24 @@ void ime_diamond_search_16x16(me_ctxt_t *ps_me_ctxt)
 *
 *******************************************************************************
 */
+
 void ime_evaluate_init_srchposn_16x16
         (
-            me_ctxt_t *ps_me_ctxt
+            me_ctxt_t *ps_me_ctxt,
+            WORD32 i4_reflist
         )
 {
     UWORD32 u4_lambda_motion = ps_me_ctxt->u4_lambda_motion;
 
     /* candidate mv cnt */
-    UWORD32 u4_num_candidates = ps_me_ctxt->u4_num_candidates;
+    UWORD32 u4_num_candidates = ps_me_ctxt->u4_num_candidates[i4_reflist];
 
     /* list of candidate mvs */
-    ime_mv_t *ps_mv_list = ps_me_ctxt->as_mv_init_search;
+    ime_mv_t *ps_mv_list = ps_me_ctxt->as_mv_init_search[i4_reflist];
 
     /* pointer to src macro block */
     UWORD8 *pu1_curr_mb = ps_me_ctxt->pu1_src_buf_luma;
-    UWORD8 *pu1_ref_mb = ps_me_ctxt->pu1_ref_buf_luma;
+    UWORD8 *pu1_ref_mb = ps_me_ctxt->apu1_ref_buf_luma[i4_reflist];
 
     /* strides */
     WORD32 i4_src_strd = ps_me_ctxt->i4_src_strd;
@@ -302,46 +304,15 @@ void ime_evaluate_init_srchposn_16x16
     WORD32 i4_mb_cost, i4_mb_cost_least = INT_MAX, i4_distortion_least = INT_MAX;
 
     /* mb partitions info */
-    mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part;
+    mb_part_ctxt *ps_mb_part = &(ps_me_ctxt->as_mb_part[i4_reflist]);
 
     /* mv bits */
     UWORD8 *pu1_mv_bits = ps_me_ctxt->pu1_mv_bits;
 
     /* temp var */
-    UWORD32  i, j, u4_srch_pos_idx = 0;
+    UWORD32  i, j;
+    WORD32 i4_srch_pos_idx = 0;
     UWORD8 *pu1_ref = NULL;
-    WORD16 mv_x, mv_y;
-
-    if (0)
-    {
-        /************************************************************/
-        /* Compute SKIP Cost                                        */
-        /************************************************************/
-        mv_x = ps_mv_list[SKIP_CAND].i2_mvx;
-        mv_y = ps_mv_list[SKIP_CAND].i2_mvy;
-
-        /* adjust ref pointer */
-        pu1_ref = pu1_ref_mb + mv_x + (mv_y * i4_ref_strd);
-
-        /* compute distortion */
-        ps_me_ctxt->pf_ime_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_ref, i4_src_strd, i4_ref_strd, i4_mb_cost_least, &i4_mb_distortion);
-
-        /* for skip mode cost & distortion are identical
-         * But we shall add a bias to favor skip mode.
-         * Doc. JVT B118 Suggests SKIP_BIAS as 16.
-         * TODO : Empirical analysis of SKIP_BIAS is necessary */
-
-        i4_distortion_least = i4_mb_distortion;
-
-        u4_srch_pos_idx = 0;
-
-#define SKIP_BIAS 8
-
-        i4_mb_cost_least = i4_mb_distortion - (u4_lambda_motion * SKIP_BIAS);
-
-#undef SKIP_BIAS
-    }
-
 
     /* Carry out a search using each of the motion vector pairs identified above as predictors. */
     /* TODO : Just like Skip, Do we need to add any bias to zero mv as well */
@@ -366,6 +337,7 @@ void ime_evaluate_init_srchposn_16x16
 
             /* compute distortion */
             ps_me_ctxt->pf_ime_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_ref, i4_src_strd, i4_ref_strd, i4_mb_cost_least, &i4_mb_distortion);
+
             DEBUG_SAD_HISTOGRAM_ADD(i4_mb_distortion, 3);
             /* compute cost */
             i4_mb_cost = i4_mb_distortion + u4_lambda_motion * ( pu1_mv_bits[ (ps_mv_list[i].i2_mvx << 2) - ps_mb_part->s_mv_pred.i2_mvx]
@@ -377,22 +349,21 @@ void ime_evaluate_init_srchposn_16x16
 
                 i4_distortion_least = i4_mb_distortion;
 
-                u4_srch_pos_idx = i;
+                i4_srch_pos_idx = i;
             }
         }
     }
 
     if (i4_mb_cost_least < ps_mb_part->i4_mb_cost)
     {
-        ps_mb_part->u4_srch_pos_idx = u4_srch_pos_idx;
+        ps_mb_part->i4_srch_pos_idx = i4_srch_pos_idx;
         ps_mb_part->i4_mb_cost = i4_mb_cost_least;
         ps_mb_part->i4_mb_distortion = i4_distortion_least;
-        ps_mb_part->s_mv_curr.i2_mvx = ps_mv_list[u4_srch_pos_idx].i2_mvx;
-        ps_mb_part->s_mv_curr.i2_mvy = ps_mv_list[u4_srch_pos_idx].i2_mvy;
+        ps_mb_part->s_mv_curr.i2_mvx = ps_mv_list[i4_srch_pos_idx].i2_mvx;
+        ps_mb_part->s_mv_curr.i2_mvy = ps_mv_list[i4_srch_pos_idx].i2_mvy;
     }
 }
 
-
 /**
 *******************************************************************************
 *
@@ -419,11 +390,12 @@ void ime_evaluate_init_srchposn_16x16
 */
 void ime_full_pel_motion_estimation_16x16
     (
-        me_ctxt_t *ps_me_ctxt
+        me_ctxt_t *ps_me_ctxt,
+        WORD32 i4_ref_list
     )
 {
     /* mb part info */
-    mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part;
+    mb_part_ctxt *ps_mb_part = &ps_me_ctxt->as_mb_part[i4_ref_list];
 
     /******************************************************************/
     /* Modify Search range about initial candidate instead of zero mv */
@@ -448,19 +420,14 @@ void ime_full_pel_motion_estimation_16x16
     switch (ps_me_ctxt->u4_me_speed_preset)
     {
         case DMND_SRCH:
-            ime_diamond_search_16x16(ps_me_ctxt);
+            ime_diamond_search_16x16(ps_me_ctxt, i4_ref_list);
             break;
         default:
             assert(0);
             break;
     }
-
-    ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvx = ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvx << 2;
-    ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvy = ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvy << 2;
-
 }
 
-
 /**
 *******************************************************************************
 *
@@ -487,13 +454,13 @@ void ime_full_pel_motion_estimation_16x16
 */
 void ime_sub_pel_motion_estimation_16x16
     (
-        me_ctxt_t *ps_me_ctxt
+        me_ctxt_t *ps_me_ctxt,
+        WORD32 i4_reflist
     )
 {
     /* pointers to src & ref macro block */
     UWORD8 *pu1_curr_mb = ps_me_ctxt->pu1_src_buf_luma;
 
-
     /* pointers to ref. half pel planes */
     UWORD8 *pu1_ref_mb_half_x;
     UWORD8 *pu1_ref_mb_half_y;
@@ -507,10 +474,10 @@ void ime_sub_pel_motion_estimation_16x16
     /* strides */
     WORD32 i4_src_strd = ps_me_ctxt->i4_src_strd;
 
-    WORD32 i4_ref_strd = ps_me_ctxt->u4_hp_buf_strd;
+    WORD32 i4_ref_strd = ps_me_ctxt->u4_subpel_buf_strd;
 
     /* mb partitions info */
-    mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part;
+    mb_part_ctxt *ps_mb_part = &ps_me_ctxt->as_mb_part[i4_reflist];
 
     /* SAD(distortion metric) of an mb */
     WORD32 i4_mb_distortion;
@@ -523,7 +490,6 @@ void ime_sub_pel_motion_estimation_16x16
     /*Best half pel buffer*/
     UWORD8 *pu1_best_hpel_buf = NULL;
 
-
     /* mv bits */
     UWORD8 *pu1_mv_bits = ps_me_ctxt->pu1_mv_bits;
 
@@ -550,6 +516,8 @@ void ime_sub_pel_motion_estimation_16x16
     WORD32 i, j;
     WORD32 ai4_sad[8];
 
+    WORD32 i4_srch_pos_idx = ps_mb_part->i4_srch_pos_idx;
+
     i2_mv_u_x = ps_mb_part->s_mv_curr.i2_mvx;
     i2_mv_u_y = ps_mb_part->s_mv_curr.i2_mvy;
 
@@ -575,10 +543,9 @@ void ime_sub_pel_motion_estimation_16x16
     /* Hence corresponding adjustments made here                  */
     /**************************************************************/
 
-    pu1_ref_mb_half_x_temp = pu1_ref_mb_half_x = ps_me_ctxt->pu1_half_x + 1;
-    pu1_ref_mb_half_y_temp = pu1_ref_mb_half_y = ps_me_ctxt->pu1_half_y + 1 + i4_ref_strd;
-    pu1_ref_mb_half_xy_temp = pu1_ref_mb_half_xy = ps_me_ctxt->pu1_half_xy + 1 + i4_ref_strd;
-
+    pu1_ref_mb_half_x_temp = pu1_ref_mb_half_x = ps_me_ctxt->apu1_subpel_buffs[0] + 1;
+    pu1_ref_mb_half_y_temp = pu1_ref_mb_half_y = ps_me_ctxt->apu1_subpel_buffs[1] + 1 + i4_ref_strd;
+    pu1_ref_mb_half_xy_temp = pu1_ref_mb_half_xy = ps_me_ctxt->apu1_subpel_buffs[2] + 1 + i4_ref_strd;
 
     ps_me_ctxt->pf_ime_sub_pel_compute_sad_16x16(pu1_curr_mb, pu1_ref_mb_half_x,
                                                  pu1_ref_mb_half_y,
@@ -611,8 +578,10 @@ void ime_sub_pel_motion_estimation_16x16
             i2_mv_u_y = mv_y_tmp;
 
 #ifndef HP_PL /*choosing whether left or right half_x*/
-            ps_me_ctxt->pu1_half_x = pu1_ref_mb_half_x_temp - i;
+            ps_me_ctxt->apu1_subpel_buffs[0] = pu1_ref_mb_half_x_temp - i;
             pu1_best_hpel_buf = pu1_ref_mb_half_x_temp - i;
+
+            i4_srch_pos_idx = 0;
 #endif
         }
 
@@ -643,8 +612,10 @@ void ime_sub_pel_motion_estimation_16x16
             i2_mv_u_y = mv_y_tmp;
 
 #ifndef HP_PL/*choosing whether top or bottom half_y*/
-            ps_me_ctxt->pu1_half_y = pu1_ref_mb_half_y_temp  - i*(i4_ref_strd);
+            ps_me_ctxt->apu1_subpel_buffs[1] = pu1_ref_mb_half_y_temp  - i*(i4_ref_strd);
             pu1_best_hpel_buf = pu1_ref_mb_half_y_temp  - i*(i4_ref_strd);
+
+            i4_srch_pos_idx = 1;
 #endif
         }
 
@@ -678,23 +649,27 @@ void ime_sub_pel_motion_estimation_16x16
                 i2_mv_u_y = mv_y_tmp;
 
 #ifndef HP_PL /*choosing between four half_xy */
-                ps_me_ctxt->pu1_half_xy = pu1_ref_mb_half_xy_temp  - j*(i4_ref_strd) - i;
+                ps_me_ctxt->apu1_subpel_buffs[2] = pu1_ref_mb_half_xy_temp  - j*(i4_ref_strd) - i;
                 pu1_best_hpel_buf =  pu1_ref_mb_half_xy_temp  - j*(i4_ref_strd) - i;
+
+                i4_srch_pos_idx = 2;
 #endif
             }
 
         }
     }
 
-    ps_mb_part->i4_mb_cost = i4_mb_cost_least;
-    ps_mb_part->i4_mb_distortion = i4_distortion_least;
-    ps_mb_part->s_mv_curr.i2_mvx = i2_mv_u_x;
-    ps_mb_part->s_mv_curr.i2_mvy = i2_mv_u_y;
-    ps_mb_part->pu1_best_hpel_buf = pu1_best_hpel_buf;
-
+    if (i4_mb_cost_least < ps_mb_part->i4_mb_cost)
+    {
+        ps_mb_part->i4_mb_cost = i4_mb_cost_least;
+        ps_mb_part->i4_mb_distortion = i4_distortion_least;
+        ps_mb_part->s_mv_curr.i2_mvx = i2_mv_u_x;
+        ps_mb_part->s_mv_curr.i2_mvy = i2_mv_u_y;
+        ps_mb_part->pu1_best_hpel_buf = pu1_best_hpel_buf;
+        ps_mb_part->i4_srch_pos_idx = i4_srch_pos_idx;
+    }
 }
 
-
 /**
 *******************************************************************************
 *
@@ -705,132 +680,105 @@ void ime_sub_pel_motion_estimation_16x16
 * @param[in] ps_me_ctxt
 *  pointer to me ctxt
 *
-* @param[in] ps_skip_mv
-*  pointer to skip mv
 *
 * @returns  none
 *
 * @remarks
 * NOTE: while computing the skip cost, do not enable early exit from compute
 * sad function because, a negative bias gets added later
+* Note that the last ME candidate in me ctxt is taken as skip motion vector
 *
 *******************************************************************************
 */
 void ime_compute_skip_cost
     (
          me_ctxt_t *ps_me_ctxt,
-         void *pv_skip_mv,
+         ime_mv_t *ps_skip_mv,
          mb_part_ctxt *ps_smb_part_info,
-         UWORD32 u4_use_stat_sad
+         UWORD32 u4_use_stat_sad,
+         WORD32 i4_reflist,
+         WORD32 i4_is_slice_type_b
     )
 {
 
-    /* pointers to src & ref macro block */
-    UWORD8 *pu1_curr_mb = ps_me_ctxt->pu1_src_buf_luma;
-    UWORD8 *pu1_ref_mb = ps_me_ctxt->pu1_ref_buf_luma;
-
-    /* strides */
-    WORD32 i4_src_strd = ps_me_ctxt->i4_src_strd;
-    WORD32 i4_ref_strd = ps_me_ctxt->i4_rec_strd;
-
-    /* enabled fast sad computation */
-    UWORD32 u4_enable_fast_sad = ps_me_ctxt->u4_enable_fast_sad;
-
     /* SAD(distortion metric) of an mb */
     WORD32 i4_mb_distortion;
 
     /* cost = distortion + u4_lambda_motion * rate */
     WORD32 i4_mb_cost;
 
-    /* Motion vectors in full-pel units */
-    WORD16 mv_x, mv_y;
-
-    /* lambda - lagrange constant */
-    UWORD32 u4_lambda_motion = ps_me_ctxt->u4_lambda_motion;
-
-    /* skip mv */
-    ime_mv_t *ps_skip_mv = pv_skip_mv, s_clip_skip_mv;
-
     /* temp var */
     UWORD8 *pu1_ref = NULL;
-    UWORD32 u4_is_nonzero;
 
-    s_clip_skip_mv.i2_mvx = CLIP3(ps_me_ctxt->i4_srch_range_w, ps_me_ctxt->i4_srch_range_e, ps_skip_mv->i2_mvx);
-    s_clip_skip_mv.i2_mvy = CLIP3(ps_me_ctxt->i4_srch_range_n, ps_me_ctxt->i4_srch_range_s, ps_skip_mv->i2_mvy);
+    ime_mv_t s_skip_mv;
 
-    if ((s_clip_skip_mv.i2_mvx != ps_skip_mv->i2_mvx) ||
-                    (s_clip_skip_mv.i2_mvy != ps_skip_mv->i2_mvy))
-    {
-        /* skip motion vector not with in bounds */
-        /* it is possible that mv is already evaluated */
-        return ;
-    }
+    s_skip_mv.i2_mvx = (ps_skip_mv->i2_mvx +2)>>2;
+    s_skip_mv.i2_mvy = (ps_skip_mv->i2_mvy +2)>>2;
 
-    mv_x = (ps_skip_mv->i2_mvx + 2) >> 2;
-    mv_y = (ps_skip_mv->i2_mvy + 2) >> 2;
-
-    if ((mv_x << 2) != ps_skip_mv->i2_mvx || (mv_y << 2) != ps_skip_mv->i2_mvy)
+    /* Check if the skip mv is out of bounds or subpel */
     {
+        /* skip mv */
+        ime_mv_t s_clip_skip_mv;
 
+        s_clip_skip_mv.i2_mvx = CLIP3(ps_me_ctxt->i4_srch_range_w, ps_me_ctxt->i4_srch_range_e, s_skip_mv.i2_mvx);
+        s_clip_skip_mv.i2_mvy = CLIP3(ps_me_ctxt->i4_srch_range_n, ps_me_ctxt->i4_srch_range_s, s_skip_mv.i2_mvy);
 
-        return ;
+        if ((s_clip_skip_mv.i2_mvx != s_skip_mv.i2_mvx) ||
+           (s_clip_skip_mv.i2_mvy != s_skip_mv.i2_mvy) ||
+           (ps_skip_mv->i2_mvx & 0x3) ||
+           (ps_skip_mv->i2_mvy & 0x3))
+        {
+            return ;
+        }
+    }
 
 
-    }
-    else
-    {
-        /* adjust ref pointer */
-        pu1_ref = pu1_ref_mb + mv_x + (mv_y * i4_ref_strd);
-    }
+    /* adjust ref pointer */
+    pu1_ref = ps_me_ctxt->apu1_ref_buf_luma[i4_reflist] + s_skip_mv.i2_mvx
+                    + (s_skip_mv.i2_mvy * ps_me_ctxt->i4_rec_strd);
 
     if(u4_use_stat_sad == 1)
     {
-        ps_me_ctxt->pf_ime_compute_sad_stat_luma_16x16(pu1_curr_mb, pu1_ref, i4_src_strd, i4_ref_strd,
-                ps_me_ctxt->pu2_sad_thrsh, &i4_mb_distortion,&u4_is_nonzero);
-
-        /*
-         *NOTE The check here is two fold
-         * One is checking if the sad has been reached, ie min sad, which a configurable parameter
-         * If that is reached,we need not do any mode evaluation
-         * Similary if we find a distortion of zero there is no point of doing any further mode evaluation
-         * as sad is a non negative quantity
-         * hence in this case too, no further evaluation is necessary
-         */
-        /*
-         *NOTE in case we need to disable the zero check using satdq,
-         *  we need only to set the u4_is_zero to a non zero value
-         */
-        if(u4_is_nonzero==0 || i4_mb_distortion <= ps_me_ctxt->i4_min_sad)
+        UWORD32 u4_is_nonzero;
+
+        ps_me_ctxt->pf_ime_compute_sad_stat_luma_16x16(
+                        ps_me_ctxt->pu1_src_buf_luma, pu1_ref, ps_me_ctxt->i4_src_strd,
+                        ps_me_ctxt->i4_rec_strd, ps_me_ctxt->pu2_sad_thrsh,
+                        &i4_mb_distortion, &u4_is_nonzero);
+
+        if (u4_is_nonzero == 0 || i4_mb_distortion <= ps_me_ctxt->i4_min_sad)
         {
-            ps_me_ctxt->u4_min_sad_reached = 1;    /* found min sad*/
-            ps_me_ctxt->i4_min_sad =  (u4_is_nonzero == 0)?0:i4_mb_distortion;
+            ps_me_ctxt->u4_min_sad_reached = 1; /* found min sad */
+            ps_me_ctxt->i4_min_sad = (u4_is_nonzero == 0) ? 0 : i4_mb_distortion;
         }
     }
     else
     {
-        ps_me_ctxt->pf_ime_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_ref, i4_src_strd, i4_ref_strd, INT_MAX, &i4_mb_distortion);
+        ps_me_ctxt->pf_ime_compute_sad_16x16[ps_me_ctxt->u4_enable_fast_sad](
+                        ps_me_ctxt->pu1_src_buf_luma, pu1_ref, ps_me_ctxt->i4_src_strd,
+                        ps_me_ctxt->i4_rec_strd, INT_MAX, &i4_mb_distortion);
 
         if(i4_mb_distortion <= ps_me_ctxt->i4_min_sad)
         {
             ps_me_ctxt->i4_min_sad = i4_mb_distortion;
-            ps_me_ctxt->u4_min_sad_reached = 1;    /* found min sad*/
+            ps_me_ctxt->u4_min_sad_reached = 1; /* found min sad */
         }
     }
 
+
     /* for skip mode cost & distortion are identical
      * But we shall add a bias to favor skip mode.
      * Doc. JVT B118 Suggests SKIP_BIAS as 16.
      * TODO : Empirical analysis of SKIP_BIAS is necessary */
-#define SKIP_BIAS 8
-    i4_mb_cost = i4_mb_distortion - (u4_lambda_motion * SKIP_BIAS);
-#undef SKIP_BIAS
+
+    i4_mb_cost = i4_mb_distortion - (ps_me_ctxt->u4_lambda_motion * (ps_me_ctxt->i4_skip_bias[0] + ps_me_ctxt->i4_skip_bias[1]  * i4_is_slice_type_b));
 
     if (i4_mb_cost <= ps_smb_part_info->i4_mb_cost)
     {
         ps_smb_part_info->i4_mb_cost = i4_mb_cost;
         ps_smb_part_info->i4_mb_distortion = i4_mb_distortion;
-        ps_smb_part_info->s_mv_curr.i2_mvx = ps_skip_mv->i2_mvx;
-        ps_smb_part_info->s_mv_curr.i2_mvy = ps_skip_mv->i2_mvy;
+        ps_smb_part_info->s_mv_curr.i2_mvx = s_skip_mv.i2_mvx;
+        ps_smb_part_info->s_mv_curr.i2_mvy = s_skip_mv.i2_mvy;
     }
 }
 
diff --git a/encoder/ime.h b/encoder/ime.h
index 5c039e8..17912d4 100644
--- a/encoder/ime.h
+++ b/encoder/ime.h
@@ -47,6 +47,19 @@
  */
 #define NUM_LAYERS 16
 
+/**
+******************************************************************************
+ *  @brief     Skip Bias value for P slice
+******************************************************************************
+ */
+#define SKIP_BIAS_P 2
+
+/**
+******************************************************************************
+ *  @brief     Skip Bias value for B slice
+******************************************************************************
+ */
+#define SKIP_BIAS_B 16
 
 /*****************************************************************************/
 /* Extern Function Declarations                                              */
@@ -84,8 +97,8 @@
 * computational feasibility. This is only for quality eval purposes.
 *
 *******************************************************************************
-*/
-extern void ime_diamond_search_16x16(me_ctxt_t *ps_me_ctxt);
+ */
+extern void ime_diamond_search_16x16(me_ctxt_t *ps_me_ctxt, WORD32 i4_reflist);
 
 
 /**
@@ -113,10 +126,8 @@ extern void ime_diamond_search_16x16(me_ctxt_t *ps_me_ctxt);
 *
 *******************************************************************************
 */
-extern void ime_evaluate_init_srchposn_16x16
-        (
-            me_ctxt_t *ps_me_ctxt
-        );
+extern void ime_evaluate_init_srchposn_16x16(me_ctxt_t *ps_me_ctxt,
+                                             WORD32 i4_reflist);
 
 /**
 *******************************************************************************
@@ -142,10 +153,8 @@ extern void ime_evaluate_init_srchposn_16x16
 *
 *******************************************************************************
 */
-extern void ime_full_pel_motion_estimation_16x16
-    (
-        me_ctxt_t *ps_me_ctxt
-    );
+extern void ime_full_pel_motion_estimation_16x16(me_ctxt_t *ps_me_ctxt,
+                                                 WORD32 i4_ref_list);
 
 /**
 *******************************************************************************
@@ -171,10 +180,8 @@ extern void ime_full_pel_motion_estimation_16x16
 *
 *******************************************************************************
 */
-extern void ime_sub_pel_motion_estimation_16x16
-    (
-        me_ctxt_t *ps_me_ctxt
-    );
+extern void ime_sub_pel_motion_estimation_16x16(me_ctxt_t *ps_me_ctxt,
+                                                WORD32 i4_reflist);
 
 /**
 *******************************************************************************
@@ -189,6 +196,9 @@ extern void ime_sub_pel_motion_estimation_16x16
 * @param[in] ps_skip_mv
 *  pointer to skip mv
 *
+* @param[in] is_slice_type_b
+*  Whether slice type is BSLICE or not
+*
 * @returns  none
 *
 * @remarks
@@ -197,13 +207,12 @@ extern void ime_sub_pel_motion_estimation_16x16
 *
 *******************************************************************************
 */
-extern void ime_compute_skip_cost
-    (
-        me_ctxt_t *ps_me_ctxt,
-        void *pv_skip_mv,
-        mb_part_ctxt *ps_smb_part_info,
-        UWORD32 u4_use_stat_sad
-    );
+extern void ime_compute_skip_cost(me_ctxt_t *ps_me_ctxt,
+                                  ime_mv_t *ps_skip_mv,
+                                  mb_part_ctxt *ps_smb_part_info,
+                                  UWORD32 u4_use_stat_sad,
+                                  WORD32 i4_reflist,
+                                  WORD32 is_slice_type_b);
 
 
 #endif /* IME_H_ */
diff --git a/encoder/ime_defs.h b/encoder/ime_defs.h
index 14d9c55..f82018d 100644
--- a/encoder/ime_defs.h
+++ b/encoder/ime_defs.h
@@ -55,5 +55,8 @@
 #define NSTEP_SRCH 50
 #define HEX_SRCH 75
 
+#define MAX_NUM_REFLIST 2
+#define SUBPEL_BUFF_CNT 4
+
 #endif /*_IME_DEFS_H_*/
 
diff --git a/encoder/ime_distortion_metrics.c b/encoder/ime_distortion_metrics.c
index 23a1fbc..f8c44df 100644
--- a/encoder/ime_distortion_metrics.c
+++ b/encoder/ime_distortion_metrics.c
@@ -1260,3 +1260,4 @@ void ime_compute_satqd_16x16_lumaintra(UWORD8 *pu1_src,
     else *sig_sad_dc = 1;
 }
 
+
diff --git a/encoder/ime_distortion_metrics.h b/encoder/ime_distortion_metrics.h
index a30e1fc..5056ba0 100644
--- a/encoder/ime_distortion_metrics.h
+++ b/encoder/ime_distortion_metrics.h
@@ -130,6 +130,7 @@ ime_compute_satqd_16x16_lumainter_ft ime_compute_satqd_16x16_lumainter;
 ime_compute_satqd_8x16_chroma_ft ime_compute_satqd_8x16_chroma;
 ime_compute_satqd_16x16_lumaintra_ft ime_compute_satqd_16x16_lumaintra;
 
+
 /*SSE4.2 Declarations*/
 ime_compute_sad_ft ime_compute_sad_16x16_sse42;
 ime_compute_sad_ft ime_compute_sad_16x16_fast_sse42;
@@ -164,7 +165,6 @@ ime_sub_pel_compute_sad_16x16_ft ime_sub_pel_compute_sad_16x16_av8;
 ime_compute_sad_stat ime_compute_16x16_sad_stat_av8;
 ime_compute_satqd_16x16_lumainter_ft ime_compute_satqd_16x16_lumainter_av8;
 
-
 #endif /* IME_DISTORTION_METRICS_H_ */
 
 
diff --git a/encoder/ime_structs.h b/encoder/ime_structs.h
index 7819b91..9baacb3 100644
--- a/encoder/ime_structs.h
+++ b/encoder/ime_structs.h
@@ -90,7 +90,7 @@ typedef struct
     /**
      * Search position for least cost among the list of candidates
      */
-    UWORD32 u4_srch_pos_idx;
+    WORD32 i4_srch_pos_idx;
 
     /**
      * Search position for least cost among the list of candidates
@@ -116,9 +116,9 @@ typedef struct
 typedef struct
 {
     /**
-     * Ref pointer to current MB luma
+     * Ref pointer to current MB luma for each ref list
      */
-    UWORD8 *pu1_ref_buf_luma;
+    UWORD8 *apu1_ref_buf_luma[MAX_NUM_REFLIST];
 
     /**
      * Src pointer to current MB luma
@@ -190,13 +190,13 @@ typedef struct
     /**
      * Number of valid candidates for the Initial search position
      */
-    UWORD32 u4_num_candidates;
+    UWORD32 u4_num_candidates[MAX_NUM_REFLIST + 1];
 
     /**
-     * Motion vector predictors derived from neighbouring
+     * Motion vector predictors derived from neighboring
      * blocks for each of the six block partitions
      */
-    ime_mv_t as_mv_init_search[5];
+    ime_mv_t as_mv_init_search[MAX_NUM_REFLIST + 1][6];
 
     /**
      * mv bits
@@ -247,10 +247,17 @@ typedef struct
 
     UWORD32 u4_left_is_skip;
 
+    /* skip_type can be PREDL0, PREDL1 or  BIPRED */
+    WORD32 i4_skip_type;
+
+    /* Biasing given for skip prediction */
+    WORD32 i4_skip_bias[2];
+
     /**
      * Structure to store the MB partition info
+     * We need 1(L0)+1(L1)+1(bi)
      */
-    mb_part_ctxt s_mb_part;
+    mb_part_ctxt as_mb_part[MAX_NUM_REFLIST + 1];
     /*
      * Threshold to compare the sad with
      */
@@ -277,27 +284,17 @@ typedef struct
     UWORD8 u1_mb_qp;
 
     /*
-     * Buffers for holding half_x , half_y and half_xy
-     * values when halfpel generation
-     *  for the entire plane is not enabled
+     * Buffers for holding subpel and bipred temp buffers
      */
-    UWORD8 *pu1_half_x;
-    UWORD8 *pu1_half_y;
-    UWORD8 *pu1_half_xy;
+    UWORD8 *apu1_subpel_buffs[SUBPEL_BUFF_CNT];
 
+    WORD32 u4_subpel_buf_strd;
 
     /*
      * Buffers to store the best halfpel plane*
      */
     UWORD8 *pu1_hpel_buf;
 
-    /*
-     * Stride for hpel buffer
-     */
-    UWORD32 u4_hpel_buf_strd;
-
-    WORD32 u4_hp_buf_strd;
-
 } me_ctxt_t;
 
 
diff --git a/encoder/irc_bit_allocation.c b/encoder/irc_bit_allocation.c
index 1dfd9de..6f52970 100644
--- a/encoder/irc_bit_allocation.c
+++ b/encoder/irc_bit_allocation.c
@@ -251,7 +251,7 @@ WORD32 irc_ba_num_fill_use_free_memtab(bit_allocation_t **pps_bit_allocation,
                                        ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0;
-    static bit_allocation_t s_bit_allocation_temp;
+    bit_allocation_t s_bit_allocation_temp;
 
     /*
      * Hack for all alloc, during which we don't have any state memory.
diff --git a/encoder/irc_cbr_buffer_control.c b/encoder/irc_cbr_buffer_control.c
index c179a28..9febbc8 100644
--- a/encoder/irc_cbr_buffer_control.c
+++ b/encoder/irc_cbr_buffer_control.c
@@ -79,7 +79,7 @@ WORD32 irc_cbr_buffer_num_fill_use_free_memtab(cbr_buffer_t **pps_cbr_buffer,
                                                ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0, i;
-    static cbr_buffer_t s_cbr_buffer_temp;
+    cbr_buffer_t s_cbr_buffer_temp;
 
     /*
      * Hack for all alloc, during which we don't have any state memory.
diff --git a/encoder/irc_common.h b/encoder/irc_common.h
index c341de4..448fad3 100644
--- a/encoder/irc_common.h
+++ b/encoder/irc_common.h
@@ -97,7 +97,7 @@ typedef float number_t;
 /* The ratios between I to P and P to B Qp is specified here */
 #define K_Q 4
 #define I_TO_P_RATIO (19) /* In K_Q Q factor */
-#define P_TO_B_RATIO (21) /* In K_Q Q factor */
+#define P_TO_B_RATIO (32) /* In K_Q Q factor */
 #define P_TO_I_RATIO (13) /* In K_Q Q factor */
 
 #endif /* _RC_COMMON_H_ */
diff --git a/encoder/irc_est_sad.c b/encoder/irc_est_sad.c
index 0d8abc2..97a0b68 100644
--- a/encoder/irc_est_sad.c
+++ b/encoder/irc_est_sad.c
@@ -58,7 +58,7 @@ WORD32 irc_est_sad_num_fill_use_free_memtab(est_sad_t **pps_est_sad,
                                             ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0;
-    static est_sad_t s_est_sad;
+    est_sad_t s_est_sad;
 
     /* Hack for al alloc, during which we don't have any state memory.
      * Dereferencing can cause issues
diff --git a/encoder/irc_fixed_point_error_bits.c b/encoder/irc_fixed_point_error_bits.c
index 42dcfc5..1c35685 100644
--- a/encoder/irc_fixed_point_error_bits.c
+++ b/encoder/irc_fixed_point_error_bits.c
@@ -62,7 +62,7 @@ WORD32 irc_error_bits_num_fill_use_free_memtab(error_bits_t **pps_error_bits,
                                                ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0;
-    static error_bits_t s_error_bits_temp;
+    error_bits_t s_error_bits_temp;
 
     /*
      * Hack for all alloc, during which we don't have any state memory.
diff --git a/encoder/irc_mb_model_based.c b/encoder/irc_mb_model_based.c
index 880ee19..b8e3d1b 100644
--- a/encoder/irc_mb_model_based.c
+++ b/encoder/irc_mb_model_based.c
@@ -47,7 +47,7 @@ WORD32 irc_mbrc_num_fill_use_free_memtab(mb_rate_control_t **pps_mb_rate_control
                                          ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0;
-    static mb_rate_control_t s_mb_rate_control_temp;
+    mb_rate_control_t s_mb_rate_control_temp;
 
     /*
      * Hack for al alloc, during which we don't have any state memory.
diff --git a/encoder/irc_picture_type.c b/encoder/irc_picture_type.c
index 186188c..2a91572 100644
--- a/encoder/irc_picture_type.c
+++ b/encoder/irc_picture_type.c
@@ -225,7 +225,7 @@ WORD32 irc_pic_handling_num_fill_use_free_memtab(pic_handling_t **pps_pic_handli
                                                  ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0;
-    static pic_handling_t s_pic_handling_temp;
+    pic_handling_t s_pic_handling_temp;
 
     /*
      * Hack for al alloc, during which we dont have any state memory.
@@ -253,6 +253,7 @@ WORD32 irc_pic_handling_num_fill_use_free_memtab(pic_handling_t **pps_pic_handli
  *****************************************************************************/
 void irc_init_pic_handling(pic_handling_t *ps_pic_handling,
                            WORD32 i4_intra_frm_int,
+                           WORD32 i4_inter_frm_int,
                            WORD32 i4_max_inter_frm_int,
                            WORD32 i4_is_gop_closed)
 {
@@ -262,7 +263,7 @@ void irc_init_pic_handling(pic_handling_t *ps_pic_handling,
     /* Checks */
     /* Codec Parameters */
     ps_pic_handling->i4_intra_frm_int = i4_intra_frm_int;
-    ps_pic_handling->i4_inter_frm_int = i4_max_inter_frm_int;
+    ps_pic_handling->i4_inter_frm_int = i4_inter_frm_int;
     ps_pic_handling->i4_max_inter_frm_int = i4_max_inter_frm_int;
     ps_pic_handling->i4_is_gop_closed = i4_is_gop_closed;
 
@@ -278,6 +279,10 @@ void irc_init_pic_handling(pic_handling_t *ps_pic_handling,
 
     /* Indices to the pic_stack */
     ps_pic_handling->i4_ref_pic_idx = 0;
+    /*
+     * B frame index should be ref_frame_num,
+     * which is 2 in our case
+     */
     ps_pic_handling->i4_b_pic_idx = 2;
     ps_pic_handling->i4_prev_b_pic_idx = 2;
 
@@ -302,7 +307,7 @@ void irc_init_pic_handling(pic_handling_t *ps_pic_handling,
     /* Variables on which the bit allocation is dependent  */
     /* Get the pic distribution in the gop */
     find_pic_distbn_in_gop(ps_pic_handling->i4_frms_in_gop, i4_intra_frm_int,
-                           i4_max_inter_frm_int, i4_is_gop_closed,
+                           i4_inter_frm_int, i4_is_gop_closed,
                            &ps_pic_handling->i4_b_in_incomp_subgop,
                            &ps_pic_handling->i4_extra_p);
 
@@ -528,8 +533,7 @@ void irc_add_pic_to_stack(pic_handling_t *ps_pic_handling, WORD32 i4_enc_pic_id)
      *      3)The new inter-frm-interval won't cross the intra_frm_interval
      */
     if((ps_pic_handling->i4_change_in_inter_frm_int == 1)
-       && ((i4_buf_pic_no % i4_inter_frm_int == 1)
-       || (i4_pic_disp_order_no == 1) || (i4_inter_frm_int == 1)))
+       && ((i4_buf_pic_no % i4_inter_frm_int == 1)|| (i4_pic_disp_order_no == 1) || (i4_inter_frm_int == 1)))
     {
         /*
          * Condition which checks if the new inter_frm_int will cross the
@@ -540,10 +544,31 @@ void irc_add_pic_to_stack(pic_handling_t *ps_pic_handling, WORD32 i4_enc_pic_id)
 
         if(i4_condn_for_change_in_inter_frm_int)
         {
+            /*
+             * If there is a change in inter frame interval, we should set the b
+             * frame IDX to (num ref frame - num ref frame in buf) + i4_ref_pic_idx.
+             * Since in our case we have a structure of I B P or I B...B P only,
+             * we have three cases:
+             * 1) Current incoming frame is I. Then we have to leave space for
+             *    current I and next P, hence write b idx as ref idx + 2
+             * 2) Current incoming frame is B. In that case, we have I in buffer.
+             *    Only one P needs space, hence write b idx as ref idx + 1
+             * 3) Current incoming frame is P. In that case we are at the end of
+             *    gop [sub gop?] and we have to leave space for next gop's I and P.
+             *    Thus b idx = ref idx + 2
+             *
+             *  In case of a closed GOP, the last frame has to be forced to be a P.
+             *  Hence we may have problems in that case.
+             *
+             *  Also this has the implicit assumption of only 2 ref frames
+             */
+            WORD32 i4_is_curr_frm_b =  (i4_buf_pic_no % i4_new_inter_frm_int)&&
+                            !(i4_is_gop_closed && (i4_b_count_in_gop == i4_b_frms_in_prd));
+
             /*If the inter_frm_int = 1, then the b_pic_idx needs to be modified */
             if(i4_inter_frm_int == 1)
             {
-                ps_pic_handling->i4_b_pic_idx = (1
+                ps_pic_handling->i4_b_pic_idx = ((i4_is_curr_frm_b ? 1 : 2)
                                 + ps_pic_handling->i4_ref_pic_idx)
                                 % (i4_max_inter_frm_int + 1);
             }
@@ -811,7 +836,42 @@ void irc_add_pic_to_stack(pic_handling_t *ps_pic_handling, WORD32 i4_enc_pic_id)
     i4_pic_disp_order_no++;
     i4_buf_pic_no++;
 
+#if 0
     /* For any gop */
+     /* BUG FIX
+      *  This piece of code resets the gop upon I frame(?)
+      *  This introduces a problem of GOP getting reset not at I frames as it should be
+      *  The reason AFAIK is that
+      *  1) This code uses i4_pic_disp_order_no to reset GOP. I assume it computes
+      *      if we are at GOP boundary and does it, but not sure
+      *  2) The count of frames remaining in GOP is updated in post enc, as it should be.
+      *
+      *  Also ps_pic_handling->i4_pic_disp_order_no is incremented when a pic is added
+      *  to stack because the addition is in disp order while popping is in encode order
+      *
+      *  Suppose there is a delay of 1 frame between queue and encode.
+      *  Then the timing will be as below. Assume a GOP of IPPIPP
+      *
+      *      Input buff    Input to qu     Output buf/encode buff      remaining pic in gop
+      *    1  I             I                 NA                          reset to 1 2
+      *    2  P             P                 I                           0 2
+      *    3  P             P                 P                           0 1
+      *    4  I             I                 P                           reset to 1 2
+      *    5  P             P                 I                           1 1
+      *    6  P             P                 P                           1 0
+      *    7  NA            NA                P
+      *
+      *  Hence our gop gets reset at I(1) and I(4) in the RC. Thus the remaining pic in gop
+      *  count will be as shown. We can clearly see that the GOP gets reset at I(4). Hence
+      *  the corresponding QP for output buf p(4) will be that of an I frame.
+      *
+      *  By hiding this I hope to fix this problem, but I am not sure exactly.
+      *  This needs to be investigated further
+      *
+      *  By hiding this, most likely we are in effect disabling the dynamic
+      *  update of gop params.
+      */
+
     if(ps_pic_handling->i4_pic_disp_order_no
                     == (i4_max_inter_frm_int - 1- ((!i4_is_gop_closed)
                         * ps_pic_handling->i4_b_in_incomp_subgop_mix_gop)))
@@ -831,6 +891,7 @@ void irc_add_pic_to_stack(pic_handling_t *ps_pic_handling, WORD32 i4_enc_pic_id)
                                             - ps_pic_handling->i4_b_in_incomp_subgop_mix_gop;
         }
     }
+#endif
 
     /* End of GOP updates */
     if(i4_pic_disp_order_no == (i4_p_frms_in_prd + i4_b_frms_in_prd + 1))
@@ -856,10 +917,8 @@ void irc_add_pic_to_stack(pic_handling_t *ps_pic_handling, WORD32 i4_enc_pic_id)
 
     /* Updating the vars which work on the encoded pics */
     /* For the first gop */
-    if(((ps_pic_handling->i4_is_first_gop)
-                    && (ps_pic_handling->i4_pic_disp_order_no
-                                    == (i4_max_inter_frm_int - 1)))
-                    || (i4_intra_frm_int == 1))
+    if ((ps_pic_handling->i4_is_first_gop)
+                    && (ps_pic_handling->i4_pic_disp_order_no == 0))
     {
         ps_pic_handling->i4_coded_pic_no = 0;
         ps_pic_handling->i4_stack_count = 0;
diff --git a/encoder/irc_picture_type.h b/encoder/irc_picture_type.h
index 1af5424..021ee33 100644
--- a/encoder/irc_picture_type.h
+++ b/encoder/irc_picture_type.h
@@ -34,6 +34,7 @@ WORD32 irc_pic_handling_num_fill_use_free_memtab(pic_handling_handle *pps_pic_ha
 
 void irc_init_pic_handling(pic_handling_handle ps_pic_handling,
                            WORD32 i4_intra_frm_int,
+                           WORD32 i4_inter_frm_int,
                            WORD32 i4_max_inter_frm_int,
                            WORD32 i4_is_gop_closed);
 
diff --git a/encoder/irc_rate_control_api.c b/encoder/irc_rate_control_api.c
index 6c6586e..95befce 100644
--- a/encoder/irc_rate_control_api.c
+++ b/encoder/irc_rate_control_api.c
@@ -43,6 +43,10 @@
 #include "irc_rate_control_api_structs.h"
 #include "irc_trace_support.h"
 
+
+#define MIN(a,b)   (((a) < (b)) ? (a) : (b))
+#define MAX(a,b)   (((a) > (b)) ? (a) : (b))
+
 #define DEV_Q   4       /*Q format(Shift) for Deviation range factor */
 #define HI_DEV_FCTR     22  /* 1.4*16 */
 #define LO_DEV_FCTR     12  /* 0.75*16 */
@@ -73,7 +77,7 @@ WORD32 irc_rate_control_num_fill_use_free_memtab(rate_control_handle *pps_rate_c
                                                  ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0, i;
-    static rate_control_api_t s_temp_rc_api;
+    rate_control_api_t s_temp_rc_api;
 
     /*
      * Hack for al alloc, during which we dont have any state memory.
@@ -148,6 +152,7 @@ void irc_initialise_rate_control(rate_control_api_t *ps_rate_control_api,
                                  UWORD32 u4_frame_rate,
                                  UWORD32 u4_max_delay,
                                  UWORD32 u4_intra_frame_interval,
+                                 WORD32  i4_inter_frm_int,
                                  UWORD8 *pu1_init_qp,
                                  UWORD32 u4_max_vbv_buff_size,
                                  WORD32 i4_max_inter_frm_int,
@@ -172,7 +177,8 @@ void irc_initialise_rate_control(rate_control_api_t *ps_rate_control_api,
 
     /* Initialize the pic_handling module */
     irc_init_pic_handling(ps_rate_control_api->ps_pic_handling,
-                          (WORD32)u4_intra_frame_interval, i4_max_inter_frm_int,
+                          (WORD32)u4_intra_frame_interval,
+                          i4_inter_frm_int, i4_max_inter_frm_int,
                           i4_is_gop_closed);
 
     /*** Initialize the rate control modules  ***/
@@ -597,20 +603,25 @@ UWORD8 irc_get_frame_level_qp(rate_control_api_t *ps_rate_control_api,
                     }
                 }
 
-                hi_dev_qp = GET_HI_DEV_QP(prev_qp);
                 /*
-                 * For lower QPs due to scale factor and fixed point arithmetic,
-                 * the hi_dev_qp can be same as that of the prev qp and in which
-                 * case it gets stuck in the lower most qp and thus not allowing
-                 * QPs not to change. To avoid this,for lower qps the hi_dev_qp
-                 * should be made slightly more than prev_qp
+                 * Due to the inexact nature of translation tables, QP may
+                 * get locked at some values. This is because of the inexactness of
+                 * the tables causing a change of +-1 in back and forth translations.
+                 * In that case, if we restrict the QP swing to +-1, we will get
+                 * the lock up condition. Hence we make it such that we will have
+                 * a swing of at least +- 2 from prev_qp
                  */
-                if(prev_qp == hi_dev_qp)
-                {
-                    hi_dev_qp += 1;
-                }
+
                 lo_dev_qp = GET_LO_DEV_QP(prev_qp);
-                u1_frame_qp = (UWORD8)CLIP_QP((WORD32)u1_frame_qp, hi_dev_qp, lo_dev_qp);
+                lo_dev_qp = MIN(lo_dev_qp, prev_qp - 2);
+                lo_dev_qp = MAX(lo_dev_qp, ps_rate_control_api->au1_min_max_qp[(e_pic_type << 1)]);
+
+                hi_dev_qp = GET_HI_DEV_QP(prev_qp);
+                hi_dev_qp = MAX(hi_dev_qp, prev_qp + 2);
+                hi_dev_qp = MIN(hi_dev_qp, ps_rate_control_api->au1_min_max_qp[(e_pic_type << 1) + 1]);
+
+                u1_frame_qp = (UWORD8)CLIP_QP((WORD32)u1_frame_qp, hi_dev_qp , lo_dev_qp);
+
             }
             else
             {
diff --git a/encoder/irc_rate_control_api.h b/encoder/irc_rate_control_api.h
index 0173037..4b24ece 100644
--- a/encoder/irc_rate_control_api.h
+++ b/encoder/irc_rate_control_api.h
@@ -42,6 +42,7 @@ void irc_initialise_rate_control(rate_control_handle ps_rate_control_api,
                                  UWORD32 u4_frame_rate,
                                  UWORD32 u4_max_delay,
                                  UWORD32 u4_intra_frame_interval,
+                                 WORD32 i4_inter_frm_int,
                                  UWORD8 *pu1_init_qp,
                                  UWORD32 u4_max_vbv_buff_size,
                                  WORD32 i4_max_inter_frm_int,
diff --git a/encoder/irc_rd_model.c b/encoder/irc_rd_model.c
index f5c0737..62c7811 100644
--- a/encoder/irc_rd_model.c
+++ b/encoder/irc_rd_model.c
@@ -55,7 +55,7 @@ WORD32 irc_rd_model_num_fill_use_free_memtab(rc_rd_model_t **pps_rc_rd_model,
                                              ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0;
-    static rc_rd_model_t s_rc_rd_model_temp;
+    rc_rd_model_t s_rc_rd_model_temp;
 
     /*
      * Hack for al alloc, during which we don't have any state memory.
@@ -115,11 +115,6 @@ static UWORD8 find_model_coeffs(UWORD32 *pi4_res_bits,
     UWORD8 u1_num_frms_used = 0;
     UWORD8 u1_frm_indx;
 
-#if !(ENABLE_QUAD_RC_MODEL||ENABLE_LIN_MODEL_WITH_INTERCEPT)
-    UNUSED(pu1_num_skips);
-    UNUSED(pmc_model_coeff);
-    UNUSED(pmc_model_coeff_lin);
-#endif
     float sum_y = 0;
     float sum_x_y = 0;
     float sum_x2_y = 0;
@@ -131,6 +126,12 @@ static UWORD8 find_model_coeffs(UWORD32 *pi4_res_bits,
     float x0, y0;
     float model_coeff_a = 0.0, model_coeff_b = 0.0, model_coeff_c = 0.0;
 
+#if !(ENABLE_QUAD_RC_MODEL||ENABLE_LIN_MODEL_WITH_INTERCEPT)
+    UNUSED(pu1_num_skips);
+    UNUSED(pmc_model_coeff);
+    UNUSED(pmc_model_coeff_lin);
+#endif
+
     for(i = 0; i < u1_num_frms; i++)
     {
         if(-1 == pi1_frame_index[i])
diff --git a/encoder/irc_vbr_storage_vbv.c b/encoder/irc_vbr_storage_vbv.c
index 23e9959..aaf0d6e 100644
--- a/encoder/irc_vbr_storage_vbv.c
+++ b/encoder/irc_vbr_storage_vbv.c
@@ -73,7 +73,7 @@ WORD32 irc_vbr_vbv_num_fill_use_free_memtab(vbr_storage_vbv_t **pps_vbr_storage_
                                             ITT_FUNC_TYPE_E e_func_type)
 {
     WORD32 i4_mem_tab_idx = 0;
-    static vbr_storage_vbv_t s_vbr_storage_vbv_temp;
+    vbr_storage_vbv_t s_vbr_storage_vbv_temp;
 
     /*
      * Hack for al alloc, during which we don't have any state memory.
diff --git a/encoder/ithread.h b/encoder/ithread.h
deleted file mode 100644
index 82170a5..0000000
--- a/encoder/ithread.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/******************************************************************************
- *
- * Copyright (C) 2015 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- *****************************************************************************
- * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
-*/
-/*****************************************************************************/
-/*                                                                           */
-/*  File Name         : ithread.h                                            */
-/*                                                                           */
-/*  Description       : This file contains all the necessary structure and   */
-/*                      enumeration definitions needed for the Application   */
-/*                      Program Interface(API) of the                        */
-/*                      Thread Abstraction Layer                             */
-/*                                                                           */
-/*  List of Functions : ithread_get_handle_size()                            */
-/*                      ithread_get_mutex_lock_size()                        */
-/*                      ithread_create()                                     */
-/*                      ithread_exit()                                       */
-/*                      ithread_join()                                       */
-/*                      ithread_get_mutex_struct_size()                      */
-/*                      ithread_mutex_init()                                 */
-/*                      ithread_mutex_destroy()                              */
-/*                      ithread_mutex_lock()                                 */
-/*                      ithread_mutex_unlock()                               */
-/*                      ithread_yield()                                      */
-/*                      ithread_sleep()                                      */
-/*                      ithread_msleep()                                     */
-/*                      ithread_usleep()                                     */
-/*                      ithread_get_sem_struct_size()                        */
-/*                      ithread_sem_init()                                   */
-/*                      ithread_sem_post()                                   */
-/*                      ithread_sem_wait()                                   */
-/*                      ithread_sem_destroy()                                */
-/*                      ithread_set_affinity()                               */
-/*                                                                           */
-/*  Issues / Problems : None                                                 */
-/*                                                                           */
-/*  Revision History  :                                                      */
-/*                                                                           */
-/*         DD MM YYYY   Author(s)       Changes                              */
-/*         06 09 2012   Harish          Initial Version                      */
-/*                                                                           */
-/*****************************************************************************/
-
-#ifndef _ITHREAD_H_
-#define _ITHREAD_H_
-
-UWORD32 ithread_get_handle_size(void);
-
-UWORD32 ithread_get_mutex_lock_size(void);
-
-WORD32  ithread_create(void *thread_handle, void *attribute, void *strt, void *argument);
-
-void    ithread_exit(void *val_ptr);
-
-WORD32  ithread_join(void *thread_id, void ** val_ptr);
-
-WORD32  ithread_get_mutex_struct_size(void);
-
-WORD32  ithread_mutex_init(void *mutex);
-
-WORD32  ithread_mutex_destroy(void *mutex);
-
-WORD32  ithread_mutex_lock(void *mutex);
-
-WORD32  ithread_mutex_unlock(void *mutex);
-
-void    ithread_yield(void);
-
-void    ithread_sleep(UWORD32 u4_time);
-
-void    ithread_msleep(UWORD32 u4_time_ms);
-
-void    ithread_usleep(UWORD32 u4_time_us);
-
-UWORD32 ithread_get_sem_struct_size(void);
-
-WORD32  ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value);
-
-WORD32  ithread_sem_post(void *sem);
-
-WORD32  ithread_sem_wait(void *sem);
-
-WORD32  ithread_sem_destroy(void *sem);
-
-WORD32  ithread_set_affinity(WORD32 core_id);
-#endif /* _ITHREAD_H_ */
diff --git a/encoder/ive2.h b/encoder/ive2.h
index 8cb0fd1..7a543bb 100644
--- a/encoder/ive2.h
+++ b/encoder/ive2.h
@@ -293,7 +293,7 @@ typedef struct
     UWORD32                                 u4_max_bitrate;
 
     /** Maximum number of consecutive  B frames                             */
-    UWORD32                                 u4_max_num_bframes;
+    UWORD32                                 u4_num_bframes;
 
     /** Content type Interlaced/Progressive                                 */
     IV_CONTENT_TYPE_T                       e_content_type;
@@ -394,6 +394,15 @@ typedef struct
     /* encoded frame type                                               */
     UWORD32                                 u4_encoded_frame_type;
 
+    /** Flag to indicate if this is the last output from the encoder    */
+    UWORD32                                 u4_is_last;
+
+    /** Lower 32bits of input time stamp                                */
+    UWORD32                                 u4_timestamp_low;
+
+    /** Upper 32bits of input time stamp                                */
+    UWORD32                                 u4_timestamp_high;
+
     /** Descriptor for input raw buffer freed from codec                */
     iv_raw_buf_t                            s_inp_buf;
 
@@ -1339,9 +1348,6 @@ typedef struct
     /** IDR frame interval                                              */
     UWORD32                                     u4_idr_frm_interval;
 
-    /** consecutive B frames                                            */
-    UWORD32                                     u4_num_b_frames;
-
     /** Lower 32bits of time stamp corresponding to input buffer,
      * from which this command takes effect                             */
     UWORD32                                 u4_timestamp_low;
@@ -1428,6 +1434,9 @@ typedef struct
      * from which this command takes effect                             */
     UWORD32                                 u4_timestamp_high;
 
+    /** Entropy coding mode flag: 0-CAVLC, 1-CABAC                       */
+    UWORD32                                 u4_entropy_coding_mode;
+
 }ive_ctl_set_profile_params_ip_t;
 
 /** Output structure : Set Profile Params                               */
diff --git a/encoder/mips/ih264e_function_selector.c b/encoder/mips/ih264e_function_selector.c
index 58ec4d0..980a744 100644
--- a/encoder/mips/ih264e_function_selector.c
+++ b/encoder/mips/ih264e_function_selector.c
@@ -58,8 +58,8 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -68,14 +68,15 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
-
+#include "ih264_cabac_tables.h"
 #include "ih264_macros.h"
 #include "ih264_platform_macros.h"
-#include "ih264e_defs.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
+#include "ih264e_cabac.h"
 #include "ih264e_platform_macros.h"
 
 /**
diff --git a/encoder/x86/ih264e_function_selector.c b/encoder/x86/ih264e_function_selector.c
index 429cdab..b0acb19 100644
--- a/encoder/x86/ih264e_function_selector.c
+++ b/encoder/x86/ih264e_function_selector.c
@@ -58,8 +58,8 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -68,14 +68,15 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
-
+#include "ih264_cabac_tables.h"
 #include "ih264_macros.h"
 #include "ih264_platform_macros.h"
-#include "ih264e_defs.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
+#include "ih264e_cabac.h"
 #include "ih264e_platform_macros.h"
 
 /**
diff --git a/encoder/x86/ih264e_function_selector_sse42.c b/encoder/x86/ih264e_function_selector_sse42.c
index d953c76..6888e5d 100644
--- a/encoder/x86/ih264e_function_selector_sse42.c
+++ b/encoder/x86/ih264e_function_selector_sse42.c
@@ -59,8 +59,8 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -69,23 +69,18 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
-
+#include "ih264_cabac_tables.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
+#include "ih264e_cabac.h"
 #include "ih264e_platform_macros.h"
-#include "ih264_intra_pred_filters.h"
-#include "ih264_trans_quant_itrans_iquant.h"
-#include "ih264e_defs.h"
-#include "ih264e_structs.h"
-#include "ih264_deblk_edge_filters.h"
 #include "ih264e_core_coding.h"
 #include "ih264_cavlc_tables.h"
 #include "ih264e_cavlc.h"
-#include "ih264_padding.h"
 #include "ih264e_intra_modes_eval.h"
-#include "ih264_mem_fns.h"
 #include "ih264e_fmt_conv.h"
 #include "ih264e_half_pel.h"
 
diff --git a/encoder/x86/ih264e_function_selector_ssse3.c b/encoder/x86/ih264e_function_selector_ssse3.c
index 4eb4c7b..4419112 100644
--- a/encoder/x86/ih264e_function_selector_ssse3.c
+++ b/encoder/x86/ih264e_function_selector_ssse3.c
@@ -59,8 +59,8 @@
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
 #include "ime_distortion_metrics.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-#include "ih264_defs.h"
 #include "ih264_error.h"
 #include "ih264_structs.h"
 #include "ih264_trans_quant_itrans_iquant.h"
@@ -69,23 +69,18 @@
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
-
+#include "ih264_cabac_tables.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
 #include "ih264e_platform_macros.h"
-#include "ih264_intra_pred_filters.h"
-#include "ih264_trans_quant_itrans_iquant.h"
-#include "ih264e_defs.h"
-#include "ih264e_structs.h"
-#include "ih264_deblk_edge_filters.h"
+#include "ih264e_cabac.h"
 #include "ih264e_core_coding.h"
 #include "ih264_cavlc_tables.h"
 #include "ih264e_cavlc.h"
-#include "ih264_padding.h"
 #include "ih264e_intra_modes_eval.h"
-#include "ih264_mem_fns.h"
 #include "ih264e_fmt_conv.h"
 #include "ih264e_half_pel.h"
 
diff --git a/encoder/x86/ih264e_half_pel_ssse3.c b/encoder/x86/ih264e_half_pel_ssse3.c
index 42580fa..8da73b7 100644
--- a/encoder/x86/ih264e_half_pel_ssse3.c
+++ b/encoder/x86/ih264e_half_pel_ssse3.c
@@ -55,7 +55,6 @@
 #include "ih264_defs.h"
 #include "ih264e_half_pel.h"
 #include "ih264_macros.h"
-#include "ih264e_half_pel.h"
 #include "ih264e_debug.h"
 #include "ih264_inter_pred_filters.h"
 #include "ih264_mem_fns.h"
diff --git a/encoder/x86/ih264e_intra_modes_eval_ssse3.c b/encoder/x86/ih264e_intra_modes_eval_ssse3.c
index 0f4a9ad..c11d7f2 100644
--- a/encoder/x86/ih264e_intra_modes_eval_ssse3.c
+++ b/encoder/x86/ih264e_intra_modes_eval_ssse3.c
@@ -67,18 +67,20 @@
 #include "ih264_inter_pred_filters.h"
 #include "ih264_mem_fns.h"
 #include "ih264_padding.h"
-#include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"
 #include "ime_distortion_metrics.h"
 #include "ih264e_error.h"
 #include "ih264e_bitstream.h"
+#include "ime_defs.h"
 #include "ime_structs.h"
-
+#include "ih264_cabac_tables.h"
 #include "irc_cntrl_param.h"
 #include "irc_frame_info_collector.h"
 #include "ih264e_rate_control.h"
 
+#include "ih264e_cabac_structs.h"
 #include "ih264e_structs.h"
+#include "ih264e_cabac.h"
 #include "ih264e_intra_modes_eval.h"
 #include "ih264e_globals.h"
 #include "ime_platform_macros.h"
diff --git a/encoder/x86/ime_distortion_metrics_sse42.c b/encoder/x86/ime_distortion_metrics_sse42.c
index baf18a4..0266916 100644
--- a/encoder/x86/ime_distortion_metrics_sse42.c
+++ b/encoder/x86/ime_distortion_metrics_sse42.c
@@ -249,12 +249,12 @@ void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src,
                     WORD32 i4_max_sad,
                     WORD32 *pi4_mb_distortion)
 {
-    UNUSED (i4_max_sad);
     __m128i src_r0, src_r1, src_r2, src_r3;
     __m128i est_r0, est_r1, est_r2, est_r3;
     __m128i res_r0, res_r1, res_r2, res_r3;
     __m128i sad_val;
     int val1, val2;
+    UNUSED (i4_max_sad);
 
     // Row 0-3 sad calculation
     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
@@ -500,7 +500,6 @@ void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src,
                                 WORD32 i4_max_sad,
                                 WORD32 *pi4_mb_distortion)
 {
-    UNUSED (i4_max_sad);
     __m128i src_r0, src_r1, src_r2, src_r3;
     __m128i est_r0, est_r1, est_r2, est_r3;
     __m128i res_r0, res_r1, res_r2, res_r3;
@@ -509,6 +508,7 @@ void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src,
     WORD32 i4_sad;
     UWORD8 *pu1_src_temp = pu1_src + src_strd;
     UWORD8 *pu1_est_temp = pu1_est + est_strd;
+    UNUSED (i4_max_sad);
 
     // Row 0,2,4,6 sad calculation
     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
diff --git a/test/Android.mk b/test/Android.mk
index adb14f0..0085832 100644
--- a/test/Android.mk
+++ b/test/Android.mk
@@ -6,4 +6,3 @@ include $(LOCAL_PATH)/encoder.mk
 
 # decoder
 include $(LOCAL_PATH)/decoder.mk
-
diff --git a/test/decoder/main.c b/test/decoder/main.c
index 921c240..8c9e885 100644
--- a/test/decoder/main.c
+++ b/test/decoder/main.c
@@ -1612,9 +1612,6 @@ WORD32 display_thread(void *pv_ctx)
     }
     ps_app_ctx->disp_deinit(ps_app_ctx->pv_disp_ctx);
 
-    /* destroy the display thread */
-    ithread_exit(ps_app_ctx->display_thread_handle);
-
     return 0;
 }
 
@@ -1827,6 +1824,7 @@ int main(WORD32 argc, CHAR *argv[])
     UWORD32 frm_cnt = 0;
     WORD32 total_bytes_comsumed;
     UWORD32 max_op_frm_ts;
+    UWORD32 u4_num_disp_bufs_with_dec;;
 
 #ifdef PROFILE_ENABLE
     UWORD32 u4_tot_cycles = 0;
@@ -2313,6 +2311,7 @@ int main(WORD32 argc, CHAR *argv[])
 
                 ivd_ctl_getbufinfo_ip_t s_ctl_ip;
                 ivd_ctl_getbufinfo_op_t s_ctl_op;
+                WORD32 outlen = 0;
 
                 s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
                 s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_GETBUFINFO;
@@ -2342,7 +2341,6 @@ int main(WORD32 argc, CHAR *argv[])
                 /* Or if shared and output is 420P */
                 if((0 == s_app_ctx.u4_share_disp_buf) || (IV_YUV_420P == s_app_ctx.e_output_chroma_format))
                 {
-                    UWORD32 outlen;
                     ps_out_buf->u4_min_out_buf_size[0] =
                                     s_ctl_op.u4_min_out_buf_size[0];
                     ps_out_buf->u4_min_out_buf_size[1] =
@@ -2377,6 +2375,56 @@ int main(WORD32 argc, CHAR *argv[])
                     ps_out_buf->u4_num_bufs = s_ctl_op.u4_min_num_out_bufs;
                 }
 
+#ifdef APP_EXTRA_BUFS
+                s_app_ctx.disp_delay = EXTRA_DISP_BUFFERS;
+                s_ctl_op.u4_num_disp_bufs += EXTRA_DISP_BUFFERS;
+#endif
+
+                /*****************************************************************************/
+                /*   API Call: Allocate display buffers for display buffer shared case       */
+                /*****************************************************************************/
+
+                for(i = 0; i < s_ctl_op.u4_num_disp_bufs; i++)
+                {
+
+                    s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[0] =
+                                    s_ctl_op.u4_min_out_buf_size[0];
+                    s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[1] =
+                                    s_ctl_op.u4_min_out_buf_size[1];
+                    s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[2] =
+                                    s_ctl_op.u4_min_out_buf_size[2];
+
+                    outlen = s_ctl_op.u4_min_out_buf_size[0];
+                    if(s_ctl_op.u4_min_num_out_bufs > 1)
+                        outlen += s_ctl_op.u4_min_out_buf_size[1];
+
+                    if(s_ctl_op.u4_min_num_out_bufs > 2)
+                        outlen += s_ctl_op.u4_min_out_buf_size[2];
+
+                    s_app_ctx.s_disp_buffers[i].pu1_bufs[0] = (UWORD8 *)malloc(outlen);
+
+                    if(s_app_ctx.s_disp_buffers[i].pu1_bufs[0] == NULL)
+                    {
+                        sprintf(ac_error_str,
+                                "\nAllocation failure for output buffer of i4_size %d",
+                                outlen);
+                        codec_exit(ac_error_str);
+                    }
+
+                    if(s_ctl_op.u4_min_num_out_bufs > 1)
+                        s_app_ctx.s_disp_buffers[i].pu1_bufs[1] =
+                                        s_app_ctx.s_disp_buffers[i].pu1_bufs[0]
+                                                        + (s_ctl_op.u4_min_out_buf_size[0]);
+
+                    if(s_ctl_op.u4_min_num_out_bufs > 2)
+                        s_app_ctx.s_disp_buffers[i].pu1_bufs[2] =
+                                        s_app_ctx.s_disp_buffers[i].pu1_bufs[1]
+                                                        + (s_ctl_op.u4_min_out_buf_size[1]);
+
+                    s_app_ctx.s_disp_buffers[i].u4_num_bufs =
+                                    s_ctl_op.u4_min_num_out_bufs;
+                }
+                s_app_ctx.num_disp_buf = s_ctl_op.u4_num_disp_bufs;
             }
         }
 
@@ -2556,73 +2604,6 @@ int main(WORD32 argc, CHAR *argv[])
     /*************************************************************************/
     //if(1 == s_app_ctx.u4_share_disp_buf)
     {
-        ivd_ctl_getbufinfo_ip_t s_ctl_ip;
-        ivd_ctl_getbufinfo_op_t s_ctl_op;
-        WORD32 outlen = 0;
-
-        s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
-        s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_GETBUFINFO;
-        s_ctl_ip.u4_size = sizeof(ivd_ctl_getbufinfo_ip_t);
-        s_ctl_op.u4_size = sizeof(ivd_ctl_getbufinfo_op_t);
-        ret = ivd_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
-                                   (void *)&s_ctl_op);
-        if(ret != IV_SUCCESS)
-        {
-            sprintf(ac_error_str, "Error in Get Buf Info %x", s_ctl_op.u4_error_code);
-            codec_exit(ac_error_str);
-        }
-
-#ifdef APP_EXTRA_BUFS
-        s_app_ctx.disp_delay = EXTRA_DISP_BUFFERS;
-        s_ctl_op.u4_num_disp_bufs += EXTRA_DISP_BUFFERS;
-#endif
-
-        /*****************************************************************************/
-        /*   API Call: Allocate display buffers for display buffer shared case       */
-        /*****************************************************************************/
-
-        for(i = 0; i < s_ctl_op.u4_num_disp_bufs; i++)
-        {
-
-            s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[0] =
-                            s_ctl_op.u4_min_out_buf_size[0];
-            s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[1] =
-                            s_ctl_op.u4_min_out_buf_size[1];
-            s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[2] =
-                            s_ctl_op.u4_min_out_buf_size[2];
-
-            outlen = s_ctl_op.u4_min_out_buf_size[0];
-            if(s_ctl_op.u4_min_num_out_bufs > 1)
-                outlen += s_ctl_op.u4_min_out_buf_size[1];
-
-            if(s_ctl_op.u4_min_num_out_bufs > 2)
-                outlen += s_ctl_op.u4_min_out_buf_size[2];
-
-            s_app_ctx.s_disp_buffers[i].pu1_bufs[0] = (UWORD8 *)malloc(outlen);
-
-            if(s_app_ctx.s_disp_buffers[i].pu1_bufs[0] == NULL)
-            {
-                sprintf(ac_error_str,
-                        "\nAllocation failure for output buffer of i4_size %d",
-                        outlen);
-                codec_exit(ac_error_str);
-            }
-
-            if(s_ctl_op.u4_min_num_out_bufs > 1)
-                s_app_ctx.s_disp_buffers[i].pu1_bufs[1] =
-                                s_app_ctx.s_disp_buffers[i].pu1_bufs[0]
-                                                + (s_ctl_op.u4_min_out_buf_size[0]);
-
-            if(s_ctl_op.u4_min_num_out_bufs > 2)
-                s_app_ctx.s_disp_buffers[i].pu1_bufs[2] =
-                                s_app_ctx.s_disp_buffers[i].pu1_bufs[1]
-                                                + (s_ctl_op.u4_min_out_buf_size[1]);
-
-            s_app_ctx.s_disp_buffers[i].u4_num_bufs =
-                            s_ctl_op.u4_min_num_out_bufs;
-        }
-        s_app_ctx.num_disp_buf = s_ctl_op.u4_num_disp_bufs;
-
         /*****************************************************************************/
         /*   API Call: Send the allocated display buffers to codec                   */
         /*****************************************************************************/
@@ -2638,7 +2619,7 @@ int main(WORD32 argc, CHAR *argv[])
 
             memcpy(&(s_set_display_frame_ip.s_disp_buffer),
                    &(s_app_ctx.s_disp_buffers),
-                   s_ctl_op.u4_num_disp_bufs * sizeof(ivd_out_bufdesc_t));
+                   s_app_ctx.num_disp_buf * sizeof(ivd_out_bufdesc_t));
 
             ret = ivd_api_function((iv_obj_t *)codec_obj,
                                        (void *)&s_set_display_frame_ip,
@@ -2732,7 +2713,17 @@ int main(WORD32 argc, CHAR *argv[])
 #ifndef PRINT_PICSIZE
     get_version(codec_obj);
 #endif
-    max_op_frm_ts = (s_app_ctx.u4_max_frm_ts > 0)? (s_app_ctx.u4_max_frm_ts + s_app_ctx.disp_delay): 0xffffffff;
+
+
+    max_op_frm_ts = s_app_ctx.u4_max_frm_ts + s_app_ctx.disp_delay;
+
+    if(max_op_frm_ts <  s_app_ctx.disp_delay)
+        max_op_frm_ts = 0xffffffff;/* clip as overflow has occurred */
+
+    max_op_frm_ts = (s_app_ctx.u4_max_frm_ts > 0)? (max_op_frm_ts): 0xffffffff;
+
+    u4_num_disp_bufs_with_dec = 0;
+
     while(u4_op_frm_ts < max_op_frm_ts)
     {
 
@@ -2759,9 +2750,10 @@ int main(WORD32 argc, CHAR *argv[])
 
         }
 #endif
-        if(u4_ip_frm_ts < s_app_ctx.num_disp_buf)
+        if(u4_num_disp_bufs_with_dec < s_app_ctx.num_disp_buf)
         {
-            release_disp_frame(codec_obj, u4_ip_frm_ts);
+            release_disp_frame(codec_obj, u4_num_disp_bufs_with_dec);
+            u4_num_disp_bufs_with_dec ++;
         }
 
 
@@ -2991,6 +2983,9 @@ int main(WORD32 argc, CHAR *argv[])
                     sprintf(ac_error_str, "Error in Reset");
                     codec_exit(ac_error_str);
                 }
+
+                /* On reset, all display buffers are released by the library */
+                u4_num_disp_bufs_with_dec = 0;
                 /*************************************************************************/
                 /* set num of cores                                                      */
                 /*************************************************************************/
diff --git a/test/encoder/app.h b/test/encoder/app.h
index 7c16fcd..ad45f5a 100644
--- a/test/encoder/app.h
+++ b/test/encoder/app.h
@@ -42,6 +42,8 @@
 #include "ive2.h"
 #ifdef WINDOWS_TIMER
 #include <windows.h>
+#else
+#include <sys/time.h>
 #endif
 /*****************************************************************************/
 /* Function Macros                                                           */
@@ -55,13 +57,13 @@
 /* Constant Macros                                                           */
 /*****************************************************************************/
 
-#define DEFAULT_NUM_INPUT_BUFS   1
+#define DEFAULT_NUM_INPUT_BUFS   32
 #define DEFAULT_MAX_INPUT_BUFS   32
 
-#define DEFAULT_NUM_OUTPUT_BUFS  1
+#define DEFAULT_NUM_OUTPUT_BUFS  32
 #define DEFAULT_MAX_OUTPUT_BUFS  32
 
-#define DEFAULT_NUM_RECON_BUFS   1
+#define DEFAULT_NUM_RECON_BUFS   32
 #define DEFAULT_MAX_RECON_BUFS   DEFAULT_NUM_RECON_BUFS
 
 
@@ -69,11 +71,12 @@
 #define MAX_VBV_BUFF_SIZE        (120 * 16384)
 #define MAX_NUM_IO_BUFS           3
 
-#define DEFAULT_MAX_REF_FRM         1
+#define DEFAULT_MAX_REF_FRM         2
 #define DEFAULT_MAX_REORDER_FRM     0
-#define DEFAULT_QP_MIN              0
+#define DEFAULT_QP_MIN              4
 #define DEFAULT_QP_MAX              51
 #define DEFAULT_MAX_BITRATE         20000000
+#define DEFAULT_NUM_BFRAMES         0
 #define DEFAULT_MAX_SRCH_RANGE_X    256
 #define DEFAULT_MAX_SRCH_RANGE_Y    256
 #define DEFAULT_MAX_FRAMERATE       120000
@@ -94,7 +97,7 @@
 #define DEFAULT_TGT_FRAME_RATE      30
 #define DEFAULT_MAX_WD              1920
 #define DEFAULT_MAX_HT              1920
-#define DEFAULT_MAX_LEVEL           50
+#define DEFAULT_MAX_LEVEL           40
 #define DEFAULT_STRIDE              0
 #define DEFAULT_WD                  0
 #define DEFAULT_HT                  0
@@ -127,6 +130,8 @@
 #define DEFAULT_EPROFILE            IV_PROFILE_BASE
 #define DEFAULT_SLICE_MODE          0
 #define DEFAULT_SLICE_PARAM         256
+#define DEFAULT_ENTROPY_CODING_MODE 0
+
 #define STRLENGTH               500
 
 
@@ -281,6 +286,7 @@ typedef struct
     UWORD32 u4_i_interval;
     UWORD32 u4_idr_interval;
     UWORD32 u4_b_frames;
+    UWORD32 u4_num_bframes;
     UWORD32 u4_disable_deblk_level;
     UWORD32 u4_hpel;
     UWORD32 u4_qpel;
@@ -289,6 +295,7 @@ typedef struct
 
     UWORD32 u4_slice_mode;
     UWORD32 u4_slice_param;
+    UWORD32 u4_entropy_coding_mode;
 
     void *pv_input_thread_handle;
     void *pv_output_thread_handle;
diff --git a/test/encoder/input.c b/test/encoder/input.c
index c292612..1d40eb0 100644
--- a/test/encoder/input.c
+++ b/test/encoder/input.c
@@ -27,7 +27,6 @@
 #include <stdio.h>
 #include <assert.h>
 #include <string.h>
-#include <sys/time.h>
 
 /* User include files */
 #include "ih264_typedefs.h"
diff --git a/test/encoder/main.c b/test/encoder/main.c
index 26420e2..4ff71af 100644
--- a/test/encoder/main.c
+++ b/test/encoder/main.c
@@ -28,7 +28,6 @@
 #include <stddef.h>
 #include <assert.h>
 #include <string.h>
-#include <sys/time.h>
 
 #ifndef IOS
 #include <malloc.h>
@@ -36,6 +35,8 @@
 
 #ifdef WINDOWS_TIMER
 #include "windows.h"
+#else
+#include <sys/time.h>
 #endif
 /* User include files */
 #include "ih264_typedefs.h"
@@ -91,6 +92,7 @@ typedef enum
     I_QP_MIN,
     P_QP_MIN,
     B_QP_MIN,
+    ENTROPY,
     AIR,
     AIR_REFRESH_PERIOD,
     ARCH,
@@ -105,6 +107,7 @@ typedef enum
     I_INTERVAL,
     IDR_INTERVAL,
     B_FRMS,
+    NUM_B_FRMS,
     DISABLE_DBLK,
     PROFILE,
     FAST_SAD,
@@ -153,7 +156,7 @@ static const argument_t argument_mapping[] =
                 { "--", "--src_framerate", SRC_FRAMERATE, "Source frame rate \n" },
                 { "--", "--i_interval", I_INTERVAL,  "Intra frame interval \n" },
                 { "--", "--idr_interval", IDR_INTERVAL,  "IDR frame interval \n" },
-                { "--", "--bframes", B_FRMS,  "Consecutive B frames \n" },
+                { "--", "--bframes", NUM_B_FRMS, "Maximum number of consecutive B frames \n" },
                 { "--", "--speed", ENC_SPEED, "Encoder speed preset 0 (slowest) and 100 (fastest)\n" },
                 { "--", "--me_speed", ME_SPEED, "Encoder speed preset 0 (slowest) and 100 (fastest)\n" },
                 { "--", "--fast_sad", FAST_SAD, " Flag for faster sad execution\n" },
@@ -193,6 +196,7 @@ static const argument_t argument_mapping[] =
                 { "--", "--qp_i_min",     I_QP_MIN,              "Min QP for I frames\n"},
                 { "--", "--qp_p_min",     P_QP_MIN,              "Min QP for P frames\n"},
                 { "--", "--qp_b_min",     B_QP_MIN,              "Min QP for B frames\n"},
+                { "--", "--entropy",      ENTROPY,              "Entropy coding mode(0: CAVLC or 1: CABAC)\n"},
                 { "--", "--vbv_delay",    VBV_DELAY,             "VBV buffer delay\n"},
                 { "--", "--vbv_size",     VBV_SIZE,              "VBV buffer size\n"},
                 { "-i4", "--intra_4x4_enable", INTRA_4x4_ENABLE, "Intra 4x4 enable \n" },
@@ -657,6 +661,10 @@ void parse_argument(app_ctxt_t *ps_app_ctxt, CHAR *argument, CHAR *value)
           sscanf(value, "%d", &ps_app_ctxt->u4_b_qp_min);
           break;
 
+      case ENTROPY:
+          sscanf(value, "%d", &ps_app_ctxt->u4_entropy_coding_mode);
+          break;
+
       case AIR:
           sscanf(value, "%d", &ps_app_ctxt->u4_air);
           break;
@@ -742,8 +750,8 @@ void parse_argument(app_ctxt_t *ps_app_ctxt, CHAR *argument, CHAR *value)
         sscanf(value, "%d", &ps_app_ctxt->u4_idr_interval);
         break;
 
-      case B_FRMS:
-        sscanf(value, "%d", &ps_app_ctxt->u4_b_frames);
+      case NUM_B_FRMS:
+        sscanf(value, "%d", &ps_app_ctxt->u4_num_bframes);
         break;
 
       case DISABLE_DEBLOCK_LEVEL:
@@ -886,7 +894,13 @@ void validate_params(app_ctxt_t *ps_app_ctxt)
         sprintf(ac_error, "Invalid number of frames to be encoded: %d", ps_app_ctxt->u4_max_num_frms);
         invalid_argument_exit(ac_error);
     }
-
+    if ((0 != (WORD32)ps_app_ctxt->u4_entropy_coding_mode)
+                    && (1 != (WORD32)ps_app_ctxt->u4_entropy_coding_mode))
+    {
+        sprintf(ac_error, "Invalid entropy codeing mode: %d",
+                ps_app_ctxt->u4_entropy_coding_mode);
+        invalid_argument_exit(ac_error);
+    }
     return;
 }
 
@@ -944,6 +958,7 @@ void init_default_params(app_ctxt_t *ps_app_ctxt)
     ps_app_ctxt->u4_enable_alt_ref       = DEFAULT_ENABLE_ALT_REF;
     ps_app_ctxt->u4_rc                   = DEFAULT_RC;
     ps_app_ctxt->u4_max_bitrate          = DEFAULT_MAX_BITRATE;
+    ps_app_ctxt->u4_num_bframes          = DEFAULT_NUM_BFRAMES;
     ps_app_ctxt->u4_bitrate              = DEFAULT_BITRATE;
     ps_app_ctxt->u4_i_qp                 = DEFAULT_I_QP;
     ps_app_ctxt->u4_p_qp                 = DEFAULT_P_QP;
@@ -960,7 +975,6 @@ void init_default_params(app_ctxt_t *ps_app_ctxt)
     ps_app_ctxt->u4_srch_rng_y           = DEFAULT_SRCH_RNG_Y;
     ps_app_ctxt->u4_i_interval           = DEFAULT_I_INTERVAL;
     ps_app_ctxt->u4_idr_interval         = DEFAULT_IDR_INTERVAL;
-    ps_app_ctxt->u4_b_frames             = DEFAULT_B_FRAMES;
     ps_app_ctxt->u4_disable_deblk_level  = DEFAULT_DISABLE_DEBLK_LEVEL;
     ps_app_ctxt->u4_hpel                 = DEFAULT_HPEL;
     ps_app_ctxt->u4_qpel                 = DEFAULT_QPEL;
@@ -979,6 +993,7 @@ void init_default_params(app_ctxt_t *ps_app_ctxt)
     ps_app_ctxt->u4_psnr_cnt             = 0;
     ps_app_ctxt->pu1_psnr_buf            = NULL;
     ps_app_ctxt->u4_psnr_buf_size        = 0;
+    ps_app_ctxt->u4_entropy_coding_mode  = DEFAULT_ENTROPY_CODING_MODE;
 
     return;
 }
@@ -1334,7 +1349,6 @@ void set_gop_params(app_ctxt_t *ps_app_ctxt,
 
     s_gop_params_ip.s_ive_ip.u4_i_frm_interval = ps_app_ctxt->u4_i_interval;
     s_gop_params_ip.s_ive_ip.u4_idr_frm_interval = ps_app_ctxt->u4_idr_interval;
-    s_gop_params_ip.s_ive_ip.u4_num_b_frames = ps_app_ctxt->u4_b_frames;
 
     s_gop_params_ip.s_ive_ip.u4_timestamp_high = u4_timestamp_high;
     s_gop_params_ip.s_ive_ip.u4_timestamp_low = u4_timestamp_low;
@@ -1368,6 +1382,8 @@ void set_profile_params(app_ctxt_t *ps_app_ctxt,
 
     s_profile_params_ip.s_ive_ip.e_profile = ps_app_ctxt->e_profile;
 
+    s_profile_params_ip.s_ive_ip.u4_entropy_coding_mode = ps_app_ctxt->u4_entropy_coding_mode;
+
     s_profile_params_ip.s_ive_ip.u4_timestamp_high = u4_timestamp_high;
     s_profile_params_ip.s_ive_ip.u4_timestamp_low = u4_timestamp_low;
 
@@ -1433,7 +1449,7 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
 
     IV_STATUS_T status = IV_SUCCESS;
 
-    WORD32 i, read_failed = 0, is_last = 0, buff_size = 0, num_bytes = 0;
+    WORD32 i, is_last = 0, buff_size = 0, num_bytes = 0;
     UWORD32 u4_total_time = 0;
     UWORD8 *pu1_buf = NULL;
     UWORD32 u4_timestamp_low, u4_timestamp_high;
@@ -1449,6 +1465,7 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
     iv_raw_buf_t s_inp_buf, s_recon_buf;
     CHAR ac_error[STRLENGTH];
     WORD32 end_of_frames=0;
+    WORD32 i4_inp_done =0;
 
     u4_timestamp_low = 0;
     u4_timestamp_high = 0;
@@ -1498,18 +1515,6 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
         }
     }
 
-#if 0 //Input buffer dump
-    //if(1 == ps_app_ctxt->u4_psnr_enable)
-    {
-        ps_app_ctxt->fp_dump_op              = fopen("D:\\dump\\inp.yuv", "wb");
-        if(NULL == ps_app_ctxt->fp_dump_op)
-        {
-            sprintf(ac_error, "Unable to open output file for input dump: %s", "D:\\dump\\inp.yuv");
-            invalid_argument_exit(ac_error);
-        }
-    }
-#endif //Input buffer dump
-
     /* If PSNR is enabled, open input file again and hold a different file pointer
      * This makes it easy to compute PSNR without adding dependency between input and recon threads
      */
@@ -1548,10 +1553,6 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
     while(1)
     {
 
-
-
-
-
         /******************************************************************************/
         /****************** Input Initialization **************************************/
         /******************************************************************************/
@@ -1568,6 +1569,12 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
             }
         }
 
+        if (i == DEFAULT_MAX_INPUT_BUFS)
+        {
+            printf("\n Unable to find a free input buffer!!");
+            exit(0);
+        }
+
         ps_video_encode_ip->u4_size = sizeof(ih264e_video_encode_ip_t);
         ps_video_encode_op->u4_size = sizeof(ih264e_video_encode_op_t);
 
@@ -1637,11 +1644,19 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
             ps_inp_raw_buf->au4_strd[0] = ps_app_ctxt->u4_strd *2;
         }
 
+        /*
+         * Here we read input and other associated buffers. Regardless of success
+         * we will proceed from here as we will need extra calls to flush out
+         * input queue in encoder. Note that this is not necessary. You can just
+         * send encode calls with valid output and recon buffers till the
+         * queue is flushed.
+         */
         while(1)
         {
             IV_STATUS_T mb_info_status = IV_SUCCESS, pic_info_status = IV_SUCCESS;
-            read_failed = 0;
+
             status = read_input(ps_app_ctxt->fp_ip, ps_inp_raw_buf);
+
             if (ps_app_ctxt->u4_mb_info_type != 0)
             {
                 mb_info_status = read_mb_info(ps_app_ctxt, pv_mb_info);
@@ -1656,15 +1671,12 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
                 if(0 == ps_app_ctxt->u4_loopback)
                 {
                     is_last = 1;
-                    read_failed = 1;
-
                     break;
                 }
                 else
                     fseek(ps_app_ctxt->fp_ip, 0, SEEK_SET);
             }
-            else
-                break;
+            break;
         }
 
         /******************************************************************************/
@@ -1716,13 +1728,12 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
             ps_inp_raw_buf->apv_bufs[0] = NULL;
             ps_inp_raw_buf->apv_bufs[1] = NULL;
             ps_inp_raw_buf->apv_bufs[2] = NULL;
-            end_of_frames = 1;
         }
 
         ps_video_encode_ip->u4_is_last = is_last;
         ps_video_encode_ip->u4_mb_info_type = ps_app_ctxt->u4_mb_info_type;
         ps_video_encode_ip->u4_pic_info_type = ps_app_ctxt->u4_pic_info_type;;
-        ps_video_encode_op->s_out_buf.pv_buf= 0;
+        ps_video_encode_op->s_out_buf.pv_buf= NULL;
         ps_video_encode_ip->u4_timestamp_high = u4_timestamp_high;
         ps_video_encode_ip->u4_timestamp_low = u4_timestamp_low;
 
@@ -1766,11 +1777,7 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
         /****************** Writing Output ********************************************/
         /******************************************************************************/
         num_bytes = 0;
-        /* Break if all the encoded frames are taken from encoder */
-        if(1 == end_of_frames && 0 == ps_video_encode_op->output_present)
-        {
-            break;
-        }
+
         if(1 == ps_video_encode_op->output_present)
         {
             num_bytes = ps_video_encode_op->s_out_buf.u4_bytes;
@@ -1783,7 +1790,11 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
                 printf("Error: Unable to write to output file\n");
                 break;
             }
+        }
 
+        /* free input buffer if codec returns a valid input buffer */
+        if (ps_video_encode_op->s_inp_buf.apv_bufs[0])
+        {
             /* Reuse of freed input buffer */
             for(i = 0; i < DEFAULT_MAX_INPUT_BUFS; i++)
             {
@@ -1793,8 +1804,11 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
                     break;
                 }
             }
+        }
 
-            /* Reuse of freed output buffer */
+        /* free output buffer if codec returns a valid output buffer */
+        // if(ps_video_encode_op->s_out_buf.pv_buf)
+        {
             for(i = 0; i < DEFAULT_MAX_OUTPUT_BUFS; i++)
             {
                 if(ps_app_ctxt->as_output_buf[i].pu1_buf == ps_video_encode_op->s_out_buf.pv_buf)
@@ -1805,132 +1819,169 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
             }
         }
 
-        if (ps_video_encode_op->dump_recon == 1)
+        /**********************************************************************
+         *  Print stats
+         **********************************************************************/
         {
-            ps_app_ctxt->u4_pics_cnt++;
+            UWORD8 u1_pic_type[][5] =
+                { "IDR", "I", "P", "B", "NA" };
+            WORD32 lookup_idx = 0;
 
-            ps_app_ctxt->avg_time = u4_total_time / ps_app_ctxt->u4_pics_cnt;
-            if (ps_app_ctxt->u4_psnr_enable == 0)
+            if (ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type
+                            == IV_IDR_FRAME)
             {
-                UWORD8 u1_pic_type[][5] = { "IDR", "I", "P","NA" };
-                WORD32 lookup_idx = 0;
+                lookup_idx = 0;
+            }
+            else if(ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type
+                            == IV_I_FRAME)
+            {
+                lookup_idx = 1;
+            }
+            else if(ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type
+                            == IV_P_FRAME)
+            {
+                lookup_idx = 2;
+            }
+            else if(ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type
+                            == IV_B_FRAME)
+            {
+                lookup_idx = 3;
+            }
+            else if(ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type
+                            == IV_NA_FRAME)
+            {
+                lookup_idx = 4;
+            }
 
-                if (ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type == IV_IDR_FRAME)
-                {
-                    lookup_idx = 0;
-                }
-                else if (ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type == IV_I_FRAME)
-                {
-                    lookup_idx = 1;
-                }
-                else if (ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type == IV_P_FRAME)
-                {
-                    lookup_idx = 2;
-                }
-                else if (ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type == IV_NA_FRAME)
-                {
-                     lookup_idx = 3;
-                }
+            if (ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type
+                            != IV_NA_FRAME)
+            {
+                ps_app_ctxt->u4_pics_cnt++;
+                ps_app_ctxt->avg_time = u4_total_time / ps_app_ctxt->u4_pics_cnt;
+                ps_app_ctxt->u4_total_bytes += num_bytes;
+            }
 
-                printf("[%s] PicNum %4d Bytes Generated %6d TimeTaken(microsec): %6d AvgTime: %6d PeakAvgTimeMax: %6d\n", u1_pic_type[lookup_idx], ps_app_ctxt->u4_pics_cnt, num_bytes, timetaken, ps_app_ctxt->avg_time, peak_avg_max);
+            if (ps_app_ctxt->u4_psnr_enable == 0)
+            {
+                printf("[%s] PicNum %4d Bytes Generated %6d TimeTaken(microsec): %6d AvgTime: %6d PeakAvgTimeMax: %6d\n",
+                       u1_pic_type[lookup_idx], ps_app_ctxt->u4_pics_cnt,
+                       num_bytes, timetaken, ps_app_ctxt->avg_time,
+                       peak_avg_max);
             }
+        }
+
 
-            ps_app_ctxt->u4_total_bytes += num_bytes;
+        /* For psnr computation, we need to read the correct input frame and
+         * compare with recon. The difficulty with doing it is that we only know
+         * that the frame number of recon is monotonically increasing. There
+         * may be gaps in the recon if any pre or post enc skip happens. There are
+         * 3 scenarios
+         *  1) A frame is encoded -> returns the pic type
+         *  2) A frame is not encoded -> Encoder is waiting, the frame may get
+         *     encoded later
+         *  3) A frame is not encoded -> A post enc or pre enc skip happened. The
+         *     frame is not going to be encoded
+         *
+         *     The 1st and 2nd scenarios are easy, since we just need to increment
+         *     recon cnt whenever we get a valid recon. This cnt can be used to
+         *     sync the recon and input
+         *     3rd scenario in conjunction with 2nd will pose problems. Even if
+         *     the returning frame is NA, we do not know whether we should increment
+         *     the recon cnt or not because it can be case 2 or case 3.
+         *
+         *  Solutions:
+         *  -------------------------
+         *   One way to overcome this will be to return more information as of
+         *   the frame type. We can send if a frame was skipped as a part of the
+         *   return frame type.
+         *   This will not work. Since the output and recon are not in sync, we
+         *   cannot use the current output frame type to determine if a recon
+         *   is present currently or not. We need some other way to achieve this.
+         *
+         *   Another way to do this which is cleaner and maintains the separation
+         *   between recon and the output is to set the width [& height] of output recon
+         *   buffer to be zero. Hence we will in effect be saying :"look there
+         *   is a recon, but due to frame not being encoded it is having a width 0".
+         *   To be more clear we need to make height also to be zero.
+         *
+         *   But are we using these variables for allocating and deallocating
+         *   the buffers some where ? No we are not. The buffer gets re-init
+         *   at every encode call
+         *
+         *   Fixes
+         *   ------------------------
+         *   Currently the recon buff width and height are set in the encoder.
+         *   This will not work now because recon and input are not
+         *   in sync. Hence a recon buff sent at time stamp x will get used to
+         *   fill recon of input at time stamp y (x > y). If we reduced the
+         *   frame dimensions in between, the recon buffer will not have enough
+         *   space. Hence we need to set the width and height appropriately inside
+         *   lib itself.
+         */
 
-            /******************************************************************************/
-            /****************** Writing Recon  ********************************************/
-            /******************************************************************************/
-            if(1 == ps_video_encode_op->output_present)
+        if (ps_app_ctxt->u4_recon_enable || ps_app_ctxt->u4_chksum_enable
+                        || ps_app_ctxt->u4_psnr_enable)
+        {
+            if (ps_video_encode_op->dump_recon)
             {
                 s_recon_buf = ps_video_encode_op->s_recon_buf;
 
-                /* Dump recon when enabled, and output bytes != 0*/
-                if(ps_app_ctxt->u4_recon_enable)
-                {
-                    status = write_recon(ps_app_ctxt->fp_recon, &s_recon_buf);
-                    if(IV_SUCCESS != status)
-                    {
-                        printf("Error: Unable to write to recon file\n");
-                        break;
-                    }
-                }
-
-
-                if(ps_app_ctxt->u4_psnr_enable)
-                {
+                /* Read input for psnr computation */
+                if (ps_app_ctxt->u4_psnr_enable)
                     read_input(ps_app_ctxt->fp_psnr_ip, &s_inp_buf);
-                    compute_psnr(ps_app_ctxt, &s_recon_buf, &s_inp_buf);
-                }
-
 
-                if(ps_app_ctxt->u4_chksum_enable)
+                /* if we have a valid recon buffer do the associated tasks */
+                if (s_recon_buf.au4_wd[0])
                 {
-                    WORD32 comp;
-                    WORD32 num_comp;
-                    num_comp = 2;
-                    if(IV_YUV_420P == s_recon_buf.e_color_fmt)
-                        num_comp = 3;
+                    /* Dump recon when enabled, and output bytes != 0 */
+                    if (ps_app_ctxt->u4_recon_enable)
+                    {
+                        status = write_recon(ps_app_ctxt->fp_recon, &s_recon_buf);
+                        if (IV_SUCCESS != status)
+                        {
+                            printf("Error: Unable to write to recon file\n");
+                            break;
+                        }
+                    }
 
-                    for(comp = 0; comp < num_comp; comp++ )
+                    if (ps_app_ctxt->u4_psnr_enable)
                     {
-                        UWORD8 au1_chksum[16];
+                        compute_psnr(ps_app_ctxt, &s_recon_buf, &s_inp_buf);
+                    }
 
-                        calc_md5_cksum((UWORD8 *)s_recon_buf.apv_bufs[comp],
-                                       s_recon_buf.au4_strd[comp],
-                                       s_recon_buf.au4_wd[comp],
-                                       s_recon_buf.au4_ht[comp],
-                                       au1_chksum);
 
-                        fwrite(au1_chksum, sizeof(UWORD8), 16, ps_app_ctxt->fp_chksum);
+                    if (ps_app_ctxt->u4_chksum_enable)
+                    {
+                        WORD32 comp, num_comp = 2;
+
+                        if (IV_YUV_420P == s_recon_buf.e_color_fmt)
+                            num_comp = 3;
+
+                        for (comp = 0; comp < num_comp; comp++)
+                        {
+                            UWORD8 au1_chksum[16];
+                            calc_md5_cksum((UWORD8 *)s_recon_buf.apv_bufs[comp],
+                                           s_recon_buf.au4_strd[comp],
+                                           s_recon_buf.au4_wd[comp],
+                                           s_recon_buf.au4_ht[comp],
+                                           au1_chksum);
+                            fwrite(au1_chksum, sizeof(UWORD8), 16, ps_app_ctxt->fp_chksum);
+                        }
                     }
                 }
-
-
             }
         }
-        else
-        {
-            if (ps_app_ctxt->u4_psnr_enable == 0)
-            {
-                UWORD8 u1_pic_type[][5] = { "IDR", "I", "P", "NA" };
-                WORD32 lookup_idx = 0;
 
-                if (ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type == IV_IDR_FRAME)
-                {
-                    lookup_idx = 0;
-                }
-                else if (ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type == IV_I_FRAME)
-                {
-                    lookup_idx = 1;
-                }
-                else if (ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type == IV_P_FRAME)
-                {
-                    lookup_idx = 2;
-                }
-                else if (ih264e_video_encode_op.s_ive_op.u4_encoded_frame_type == IV_NA_FRAME)
-                {
-                    lookup_idx = 3;
-                }
-
-                printf("[%s] PicNum %4d Bytes Generated %6d TimeTaken(microsec): %6d AvgTime: %6d PeakAvgTimeMax: %6d\n", u1_pic_type[lookup_idx], ps_app_ctxt->u4_pics_cnt, num_bytes, timetaken, ps_app_ctxt->avg_time, peak_avg_max);
-            }
-            else
-            {
-                read_input(ps_app_ctxt->fp_psnr_ip, &s_inp_buf);
-            }
-        }
-#if 0 //Input buffer dump
-        /*Dump input buffers to a file*/
-        dump_input(ps_app_ctxt->fp_dump_op, ps_inp_raw_buf);
-#endif //Input buffer dump
+        u4_timestamp_low++;
 
-        if(is_last)
+        /* Break if all the encoded frames are taken from encoder */
+        if (1 == ps_video_encode_op->u4_is_last)
+        {
             break;
-
-        u4_timestamp_low++;
+        }
     }
 
-    /* Pic count is 1 more than actual num frames encoded, beacause last call is to just get the output  */
+    /* Pic count is 1 more than actual num frames encoded, because last call is to just get the output  */
     ps_app_ctxt->u4_pics_cnt--;
 
     if(ps_app_ctxt->u4_psnr_enable)
@@ -2008,9 +2059,10 @@ int main(int argc, char *argv[])
 
     /* error status */
     IV_STATUS_T status = IV_SUCCESS;
-
+#ifdef IOS
     /* temp var */
     CHAR filename_with_path[STRLENGTH];
+#endif
     WORD32 num_mem_recs;
     iv_obj_t *ps_enc;
     WORD32 i;
@@ -2036,6 +2088,11 @@ int main(int argc, char *argv[])
     }
     else if(argc == 2)
     {
+        if (!strcmp(argv[1], "--help"))
+        {
+            print_usage();
+            exit(-1);
+        }
         strcpy(ac_cfg_fname, argv[1]);
     }
 
@@ -2053,7 +2110,7 @@ int main(int argc, char *argv[])
     /* Read command line arguments */
     if(argc > 2)
     {
-        for(i = 1; i < argc; i += 2)
+        for(i = 1; i + 1 < argc; i += 2)
         {
             if(CONFIG == get_argument(argv[i]))
             {
@@ -2247,7 +2304,7 @@ int main(int argc, char *argv[])
         s_init_ip.s_ive_ip.e_rc_mode            = s_app_ctxt.u4_rc;
         s_init_ip.s_ive_ip.u4_max_framerate     = s_app_ctxt.u4_max_frame_rate;
         s_init_ip.s_ive_ip.u4_max_bitrate       = s_app_ctxt.u4_max_bitrate;
-        s_init_ip.s_ive_ip.u4_max_num_bframes   = DEFAULT_B_FRAMES;
+        s_init_ip.s_ive_ip.u4_num_bframes       = s_app_ctxt.u4_num_bframes;
         s_init_ip.s_ive_ip.e_content_type       = IV_PROGRESSIVE;
         s_init_ip.s_ive_ip.u4_max_srch_rng_x    = DEFAULT_MAX_SRCH_RANGE_X;
         s_init_ip.s_ive_ip.u4_max_srch_rng_y    = DEFAULT_MAX_SRCH_RANGE_Y;
@@ -2443,10 +2500,13 @@ int main(int argc, char *argv[])
         WORD32 achieved_bitrate;
 
         if(s_app_ctxt.u4_pics_cnt != 0)
+        {
             bytes_per_frame = (s_app_ctxt.u4_total_bytes) / (s_app_ctxt.u4_pics_cnt);
+        }
         else
+        {
             bytes_per_frame = 0;
-
+        }
         bytes_per_second = (bytes_per_frame * s_app_ctxt.u4_tgt_frame_rate);
 
         achieved_bitrate = bytes_per_second * 8;
diff --git a/test/encoder/output.c b/test/encoder/output.c
index e0f27dd..8438869 100644
--- a/test/encoder/output.c
+++ b/test/encoder/output.c
@@ -28,7 +28,6 @@
 #include <stdio.h>
 #include <assert.h>
 #include <string.h>
-#include <sys/time.h>
 /* User include files */
 
 #include "ih264_typedefs.h"
diff --git a/test/encoder/psnr.c b/test/encoder/psnr.c
index c9bb6a1..6913cb3 100644
--- a/test/encoder/psnr.c
+++ b/test/encoder/psnr.c
@@ -26,7 +26,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
-#include <sys/time.h>
 
 /* User include files */
 #include "ih264_typedefs.h"
diff --git a/test/encoder/recon.c b/test/encoder/recon.c
index ed63aac..d177a62 100644
--- a/test/encoder/recon.c
+++ b/test/encoder/recon.c
@@ -28,7 +28,6 @@
 #include <stdio.h>
 #include <assert.h>
 #include <string.h>
-#include <sys/time.h>
 /* User include files */
 
 #include "ih264_typedefs.h"
@@ -54,7 +53,7 @@
 IV_STATUS_T write_recon(FILE *fp, iv_raw_buf_t *ps_raw_buf)
 {
     WORD32 bytes;
-    WORD32 wd, ht, strd;
+    WORD32 wd, ht;
     UWORD8 *pu1_buf;
     WORD32 i;
     WORD32 comp;
@@ -68,7 +67,6 @@ IV_STATUS_T write_recon(FILE *fp, iv_raw_buf_t *ps_raw_buf)
     {
         wd = ps_raw_buf->au4_wd[comp];
         ht = ps_raw_buf->au4_ht[comp];
-        strd = ps_raw_buf->au4_strd[comp];
         pu1_buf = ps_raw_buf->apv_bufs[comp];
         for(i = 0; i < ht; i++)
         {
@@ -163,7 +161,7 @@ void init_raw_buf_descr(app_ctxt_t *ps_app_ctxt, iv_raw_buf_t *ps_raw_buf, UWORD
 
     /* All the pointers and dimensions are initialized here
      * to support change in resolution from the application */
-    luma_size = ALIGN16(ps_app_ctxt->u4_wd) * ALIGN16(ps_app_ctxt->u4_ht);
+    luma_size = ps_app_ctxt->u4_max_wd * ps_app_ctxt->u4_max_ht;
     chroma_size = (luma_size) / 4;
 
     ps_raw_buf->apv_bufs[0] = pu1_buf;