diff options
Diffstat (limited to 'common')
38 files changed, 13706 insertions, 0 deletions
diff --git a/common/arm/impeg2_format_conv.s b/common/arm/impeg2_format_conv.s
new file mode 100644
index 0000000..c07edda
--- /dev/null
+++ b/common/arm/impeg2_format_conv.s
@@ -0,0 +1,391 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+
+@/*
+@//----------------------------------------------------------------------------
+@// File Name : impeg2_format_conv.s
+@//
+@// Description : This file has the YUV420P to YUV420SP format-conversion
+@// implementations for the MPEG2 SP decoder on neon platform.
+@// +@// Reference Document : +@// +@// Revision History : +@// Date Author Detail Description +@// ------------ ---------------- ---------------------------------- +@// Jul 07, 2008 Naveen Kumar T Created +@// +@//------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Include Files +@// ---------------------------------------------------------------------------- +@*/ +.text +.p2align 2 +.equ log2_16 , 4 +.equ log2_2 , 1 +@/* +@// ---------------------------------------------------------------------------- +@// Struct/Union Types and Define +@// ---------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Static Global Data section variables +@// ---------------------------------------------------------------------------- +@*/ +@//--------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Static Prototype Functions +@// ---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Exported functions +@// ---------------------------------------------------------------------------- +@*/ + +@/***************************************************************************** +@* * +@* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q() * +@* * +@* Description : This function conversts the image from YUV420P color * +@* space to 420SP color space(UV interleaved). 
* +@* * +@* Arguments : R0 pu1_y * +@* R1 pu1_u * +@* R2 pu1_v * +@* R3 pu1_dest_y * +@* [R13 #40] pu1_dest_uv * +@* [R13 #44] u2_height * +@* [R13 #48] u2_width * +@* [R13 #52] u2_stridey * +@* [R13 #56] u2_strideu * +@* [R13 #60] u2_stridev * +@* [R13 #64] u2_dest_stride_y * +@* [R13 #68] u2_dest_stride_uv * +@* [R13 #72] convert_uv_only * +@* * +@* Values Returned : None * +@* * +@* Register Usage : R0 - R8, Q0 * +@* * +@* Stack Usage : 24 Bytes * +@* * +@* Interruptibility : Interruptible * +@* * +@* Known Limitations * +@* Assumptions: Image Width: Assumed to be multiple of 16 and * +@* greater than or equal to 16 * +@* Image Height: Assumed to be even. * +@* * +@* Revision History : * +@* DD MM YYYY Author(s) Changes (Describe the changes made) * +@* 07 06 2010 Varshita Draft * +@* 07 06 2010 Naveen Kr T Completed * +@* * +@*****************************************************************************/ + .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q +impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q: + + @// push the registers on the stack + stmfd sp!, {r4-r8, lr} + + ldr r4, [sp, #56] @// Load convert_uv_only + + cmp r4, #1 + beq yuv420sp_uv_chroma + @/* Do the preprocessing before the main loops start */ + @// Load the parameters from stack + ldr r4, [sp, #28] @// Load u2_height from stack + + ldr r5, [sp, #32] @// Load u2_width from stack + + ldr r7, [sp, #36] @// Load u2_stridey from stack + + ldr r8, [sp, #48] @// Load u2_dest_stride_y from stack + + sub r7, r7, r5 @// Source increment + + sub r8, r8, r5 @// Destination increment + + +yuv420sp_uv_row_loop_y: + mov r6, r5 + +yuv420sp_uv_col_loop_y: + pld [r0, #128] + vld1.8 {q0}, [r0]! + vst1.8 {q0}, [r3]! + sub r6, r6, #16 + cmp r6, #15 + bgt yuv420sp_uv_col_loop_y + + cmp r6, #0 + beq yuv420sp_uv_row_loop_end_y + @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + @//Ex if width is 162, above loop will process 160 pixels. 
And + @//Both source and destination will point to 146th pixel and then 16 bytes will be read + @// and written using VLD1 and VST1 + rsb r6, r6, #16 + sub r0, r0, r6 + sub r3, r3, r6 + + vld1.8 {q0}, [r0]! + vst1.8 {q0}, [r3]! + +yuv420sp_uv_row_loop_end_y: + add r0, r0, r7 + add r3, r3, r8 + subs r4, r4, #1 + bgt yuv420sp_uv_row_loop_y + +yuv420sp_uv_chroma: + + ldr r3, [sp, #24] @// Load pu1_dest_uv from stack + + ldr r4, [sp, #28] @// Load u2_height from stack + + ldr r5, [sp, #32] @// Load u2_width from stack + + + ldr r7, [sp, #40] @// Load u2_strideu from stack + + ldr r8, [sp, #52] @// Load u2_dest_stride_uv from stack + + sub r7, r7, r5, lsr #1 @// Source increment + + sub r8, r8, r5 @// Destination increment + + mov r5, r5, lsr #1 + mov r4, r4, lsr #1 + ldr r3, [sp, #24] @// Load pu1_dest_uv from stack +yuv420sp_uv_row_loop_uv: + mov r6, r5 + + +yuv420sp_uv_col_loop_uv: + pld [r1, #128] + pld [r2, #128] + vld1.8 d0, [r1]! + vld1.8 d1, [r2]! + vst2.8 {d0, d1}, [r3]! + sub r6, r6, #8 + cmp r6, #7 + bgt yuv420sp_uv_col_loop_uv + + cmp r6, #0 + beq yuv420sp_uv_row_loop_end_uv + @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + @//Ex if width is 162, above loop will process 160 pixels. And + @//Both source and destination will point to 146th pixel and then 16 bytes will be read + @// and written using VLD1 and VST1 + rsb r6, r6, #8 + sub r1, r1, r6 + sub r2, r2, r6 + sub r3, r3, r6, lsl #1 + + vld1.8 d0, [r1]! + vld1.8 d1, [r2]! + vst2.8 {d0, d1}, [r3]! + +yuv420sp_uv_row_loop_end_uv: + add r1, r1, r7 + add r2, r2, r7 + add r3, r3, r8 + subs r4, r4, #1 + bgt yuv420sp_uv_row_loop_uv + @//POP THE REGISTERS + ldmfd sp!, {r4-r8, pc} + + + + + +@/***************************************************************************** +@* * +@* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q() * +@* * +@* Description : This function conversts the image from YUV420P color * +@* space to 420SP color space(VU interleaved). 
* +@* This function is similar to above function * +@* IMP4D_CXA8_YUV420toYUV420SP_VU with a difference in * +@* VLD1.8 for chroma - order of registers is different * +@* * +@* Arguments : R0 pu1_y * +@* R1 pu1_u * +@* R2 pu1_v * +@* R3 pu1_dest_y * +@* [R13 #40] pu1_dest_uv * +@* [R13 #44] u2_height * +@* [R13 #48] u2_width * +@* [R13 #52] u2_stridey * +@* [R13 #56] u2_strideu * +@* [R13 #60] u2_stridev * +@* [R13 #64] u2_dest_stride_y * +@* [R13 #68] u2_dest_stride_uv * +@* [R13 #72] convert_uv_only * +@* * +@* Values Returned : None * +@* * +@* Register Usage : R0 - R8, Q0 * +@* * +@* Stack Usage : 24 Bytes * +@* * +@* Interruptibility : Interruptible * +@* * +@* Known Limitations * +@* Assumptions: Image Width: Assumed to be multiple of 16 and * +@* greater than or equal to 16 * +@* Image Height: Assumed to be even. * +@* * +@* Revision History : * +@* DD MM YYYY Author(s) Changes (Describe the changes made) * +@* 07 06 2010 Varshita Draft * +@* 07 06 2010 Naveen Kr T Completed * +@* * +@*****************************************************************************/ + + .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q +impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q: + + @// push the registers on the stack + stmfd sp!, {r4-r8, lr} + + ldr r4, [sp, #56] @// Load convert_uv_only + + cmp r4, #1 + beq yuv420sp_vu_chroma + + @/* Do the preprocessing before the main loops start */ + @// Load the parameters from stack + ldr r4, [sp, #28] @// Load u2_height from stack + + ldr r5, [sp, #32] @// Load u2_width from stack + + ldr r7, [sp, #36] @// Load u2_stridey from stack + + ldr r8, [sp, #48] @// Load u2_dest_stride_y from stack + + sub r7, r7, r5 @// Source increment + + sub r8, r8, r5 @// Destination increment + + +yuv420sp_vu_row_loop_y: + mov r6, r5 + +yuv420sp_vu_col_loop_y: + pld [r0, #128] + vld1.8 {q0}, [r0]! + vst1.8 {q0}, [r3]! 
+ sub r6, r6, #16 + cmp r6, #15 + bgt yuv420sp_vu_col_loop_y + + cmp r6, #0 + beq yuv420sp_vu_row_loop_end_y + @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + @//Ex if width is 162, above loop will process 160 pixels. And + @//Both source and destination will point to 146th pixel and then 16 bytes will be read + @// and written using VLD1 and VST1 + rsb r6, r6, #16 + sub r0, r0, r6 + sub r3, r3, r6 + + vld1.8 {q0}, [r0]! + vst1.8 {q0}, [r3]! + +yuv420sp_vu_row_loop_end_y: + add r0, r0, r7 + add r3, r3, r8 + subs r4, r4, #1 + bgt yuv420sp_vu_row_loop_y + +yuv420sp_vu_chroma: + + ldr r3, [sp, #24] @// Load pu1_dest_uv from stack + + ldr r4, [sp, #28] @// Load u2_height from stack + + ldr r5, [sp, #32] @// Load u2_width from stack + + + ldr r7, [sp, #40] @// Load u2_strideu from stack + + ldr r8, [sp, #52] @// Load u2_dest_stride_uv from stack + + sub r7, r7, r5, lsr #1 @// Source increment + + sub r8, r8, r5 @// Destination increment + + mov r5, r5, lsr #1 + mov r4, r4, lsr #1 + ldr r3, [sp, #24] @// Load pu1_dest_uv from stack +yuv420sp_vu_row_loop_uv: + mov r6, r5 + + +yuv420sp_vu_col_loop_uv: + pld [r1, #128] + pld [r2, #128] + vld1.8 d1, [r1]! + vld1.8 d0, [r2]! + vst2.8 {d0, d1}, [r3]! + sub r6, r6, #8 + cmp r6, #7 + bgt yuv420sp_vu_col_loop_uv + + cmp r6, #0 + beq yuv420sp_vu_row_loop_end_uv + @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + @//Ex if width is 162, above loop will process 160 pixels. And + @//Both source and destination will point to 146th pixel and then 16 bytes will be read + @// and written using VLD1 and VST1 + rsb r6, r6, #8 + sub r1, r1, r6 + sub r2, r2, r6 + sub r3, r3, r6, lsl #1 + + vld1.8 d1, [r1]! + vld1.8 d0, [r2]! + vst2.8 {d0, d1}, [r3]! 
+ +yuv420sp_vu_row_loop_end_uv: + add r1, r1, r7 + add r2, r2, r7 + add r3, r3, r8 + subs r4, r4, #1 + bgt yuv420sp_vu_row_loop_uv + @//POP THE REGISTERS + ldmfd sp!, {r4-r8, pc} + + + + + diff --git a/common/arm/impeg2_idct.s b/common/arm/impeg2_idct.s new file mode 100644 index 0000000..22225bf --- /dev/null +++ b/common/arm/impeg2_idct.s @@ -0,0 +1,1204 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ + +@/* +@//---------------------------------------------------------------------------- +@// File Name : impeg2_idct.s +@// +@// Description : This file has the Idct Implementations for the +@// MPEG2 SP decoder on neon platform. 
+@// +@// Reference Document : +@// +@// Revision History : +@// Date Author Detail Description +@// ------------ ---------------- ---------------------------------- +@// Feb 22, 2008 Naveen Kumar T Created +@// +@//------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Include Files +@// ---------------------------------------------------------------------------- +@*/ + +.text +.p2align 2 +.equ idct_stg1_shift , 12 +.equ idct_stg2_shift , 16 +.equ idct_stg1_round , (1 << (idct_stg1_shift - 1)) +.equ idct_stg2_round , (1 << (idct_stg2_shift - 1)) +@/* +@// ---------------------------------------------------------------------------- +@// Struct/Union Types and Define +@// ---------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Static Global Data section variables +@// ---------------------------------------------------------------------------- +@*/ +@//--------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Static Prototype Functions +@// ---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Exported functions +@// ---------------------------------------------------------------------------- +@*/ + + .extern gai2_impeg2_idct_q15 +.hidden gai2_impeg2_idct_q15 + .extern gai2_impeg2_idct_q11 +.hidden gai2_impeg2_idct_q11 + .extern gai2_impeg2_idct_first_col_q15 +.hidden gai2_impeg2_idct_first_col_q15 + .extern gai2_impeg2_idct_first_col_q11 +.hidden gai2_impeg2_idct_first_col_q11 + .extern 
gai2_impeg2_mismatch_stg2_additive +.hidden gai2_impeg2_mismatch_stg2_additive + +gai2_impeg2_idct_q15_addr1: + .long gai2_impeg2_idct_q15 - q15lbl1 - 8 +gai2_impeg2_idct_q15_addr2: + .long gai2_impeg2_idct_q15 - q15lbl2 - 8 +gai2_impeg2_idct_q11_addr1: + .long gai2_impeg2_idct_q11 - q11lbl1 - 8 +gai2_impeg2_idct_q11_addr2: + .long gai2_impeg2_idct_q11 - q11lbl2 - 8 +gai2_impeg2_idct_first_col_q15_addr1: + .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl1 - 8 +gai2_impeg2_idct_first_col_q15_addr2: + .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl2 - 8 +gai2_impeg2_idct_first_col_q15_addr3: + .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl3 - 8 +gai2_impeg2_mismatch_stg2_additive_addr: + .long gai2_impeg2_mismatch_stg2_additive - additive_lbl - 8 +gai2_impeg2_idct_first_col_q11_addr1: + .long gai2_impeg2_idct_first_col_q11 - fcq11_lbl1 - 8 +gai2_impeg2_idct_first_col_q11_addr2: + .long gai2_impeg2_idct_first_col_q11 - fcq11_lbl2 - 8 + + .global impeg2_idct_recon_dc_a9q +impeg2_idct_recon_dc_a9q: + stmfd sp!, {r4, r6, r12, lr} + @//r0: pi2_src + @//r1: pi2_tmp - not used, used as pred_strd + @//r2: pu1_pred + @//r3: pu1_dst + @//r4: used as scratch + @//r5: + + ldr r1, [sp, #20] @//pred_strd + ldr r6, [sp, #24] @//dst_strd + + ldr r14, gai2_impeg2_idct_q15_addr1 +q15lbl1: + add r14, r14, pc + ldrsh r12, [r14] + ldrsh r4, [r0] + + vld1.8 d0, [r2], r1 + mul r4, r4, r12 + + vld1.8 d1, [r2], r1 + add r4, #idct_stg1_round + + vld1.8 d2, [r2], r1 + asr r4, r4, #idct_stg1_shift + + ldr r14, gai2_impeg2_idct_q11_addr1 +q11lbl1: + add r14, r14, pc + ldrsh r12, [r14] + + vld1.8 d3, [r2], r1 + mul r4, r4, r12 + + vld1.8 d4, [r2], r1 + add r4, #idct_stg2_round + + vld1.8 d5, [r2], r1 + asr r4, r4, #idct_stg2_shift + + vld1.8 d6, [r2], r1 + vdup.s16 q15, r4 + + + vld1.8 d7, [r2], r1 + + vaddw.u8 q4, q15, d0 + + vaddw.u8 q5, q15, d1 + vqmovun.s16 d0, q4 + + vaddw.u8 q6, q15, d2 + vqmovun.s16 d1, q5 + vst1.8 d0, [r3], r6 + + vaddw.u8 q7, q15, d3 + vqmovun.s16 d2, q6 + vst1.8 d1, 
[r3], r6 + + vaddw.u8 q8, q15, d4 + vqmovun.s16 d3, q7 + vst1.8 d2, [r3], r6 + + vaddw.u8 q9, q15, d5 + vqmovun.s16 d4, q8 + vst1.8 d3, [r3], r6 + + vaddw.u8 q10, q15, d6 + vqmovun.s16 d5, q9 + vst1.8 d4, [r3], r6 + + vaddw.u8 q11, q15, d7 + vqmovun.s16 d6, q10 + vst1.8 d5, [r3], r6 + + vqmovun.s16 d7, q11 + vst1.8 d6, [r3], r6 + + + vst1.8 d7, [r3], r6 + + ldmfd sp!, {r4, r6, r12, pc} + + + + + .global impeg2_idct_recon_dc_mismatch_a9q +impeg2_idct_recon_dc_mismatch_a9q: + stmfd sp!, {r4-r12, lr} + + ldr r1, [sp, #44] @//pred_strd + ldr r6, [sp, #48] @//dst_strd + + ldr r14, gai2_impeg2_idct_q15_addr2 +q15lbl2: + add r14, r14, pc + ldrsh r12, [r14] + ldrsh r4, [r0] + + mul r4, r4, r12 + add r4, #idct_stg1_round + asr r4, r4, #idct_stg1_shift + + ldr r14, gai2_impeg2_idct_q11_addr2 +q11lbl2: + add r14, r14, pc + ldrsh r12, [r14] + mul r4, r4, r12 + vdup.s32 q0, r4 + + mov r14, #16 @//Increment for table read + ldr r4, gai2_impeg2_mismatch_stg2_additive_addr +additive_lbl: + add r4, r4, pc + + vld1.16 {q1}, [r4], r14 + + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, 
q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + + ldmfd sp!, {r4-r12, pc} + + + + +@/** +@ ******************************************************************************* +@ * +@ * ;brief +@ * This function performs Inverse transform and reconstruction for 8x8 +@ * input block +@ * +@ * ;par Description: +@ * Performs inverse transform and adds the prediction data and clips output +@ * to 8 bit +@ * +@ * ;param[in] pi2_src +@ * Input 8x8 coefficients +@ * +@ * ;param[in] pi2_tmp +@ * Temporary 8x8 buffer for storing inverse +@ * +@ * transform +@ * 1st stage output +@ * +@ * ;param[in] pu1_pred +@ * Prediction 8x8 block +@ * +@ * ;param[out] pu1_dst +@ * Output 8x8 block +@ * +@ * ;param[in] src_strd +@ * Input stride +@ * +@ * ;param[in] pred_strd +@ * Prediction stride +@ * +@ * ;param[in] dst_strd +@ * Output Stride +@ * +@ * ;param[in] shift +@ * Output shift +@ * +@ * ;param[in] zero_cols +@ * Zero columns in pi2_src +@ * +@ * ;returns Void +@ * +@ * ;remarks +@ * None +@ * +@ ******************************************************************************* +@ */ + +@void impeg2_itrans_recon_8x8(WORD16 *pi2_src, +@ WORD16 *pi2_tmp, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 pred_strd, +@ WORD32 dst_strd, +@ WORD32 zero_cols +@ WORD32 zero_rows ) + +@**************Variables Vs Registers************************* 
+@ r0 => *pi2_src +@ r1 => *pi2_tmp +@ r2 => *pu1_pred +@ r3 => *pu1_dst +@ src_strd +@ pred_strd +@ dst_strd +@ zero_cols + + + + .global impeg2_idct_recon_a9q +impeg2_idct_recon_a9q: +@//Register Usage Reference - loading and Until IDCT of columns +@// Cosine Constants - D0 +@// Sine Constants - D1 +@// Row 0 First Half - D2 - y0 +@// Row 1 First Half - D6 - y1 +@// Row 2 First Half - D3 - y2 +@// Row 3 First Half - D7 - y3 +@// Row 4 First Half - D10 - y4 +@// Row 5 First Half - D14 - y5 +@// Row 6 First Half - D11 - y6 +@// Row 7 First Half - D15 - y7 + +@// Row 0 Second Half - D4 - y0 +@// Row 1 Second Half - D8 - y1 +@// Row 2 Second Half - D5 - y2 +@// Row 3 Second Half - D9 - y3 +@// Row 4 Second Half - D12 - y4 +@// Row 5 Second Half - D16 - y5 +@// Row 6 Second Half - D13 - y6 +@// Row 7 Second Half - D17 - y7 + + @// Copy the input pointer to another register + @// Step 1 : load all constants + stmfd sp!, {r4-r12, lr} + add sp, sp, #40 + ldr r8, [sp, #4] @ prediction stride + ldr r7, [sp, #8] @ destination stride + ldr r6, [sp] @ src stride + ldr r12, [sp, #12] + ldr r11, [sp, #16] + mov r6, r6, lsl #1 @ x sizeof(word16) + add r9, r0, r6, lsl #1 @ 2 rows + + add r10, r6, r6, lsl #1 @ 3 rows + + sub r10, r10, #8 @ - 4 cols * sizeof(WORD16) + sub r5, r6, #8 @ src_strd - 4 cols * sizeof(WORD16) + + + ldr r14, gai2_impeg2_idct_first_col_q15_addr1 +fcq15_lbl1: + add r14, r14, pc + vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data + + @//Step 2 Load all the input data + @//Step 3 Operate first 4 colums at a time + + and r11, r11, #0xff + and r12, r12, #0xff + + cmp r11, #0xf0 + bge skip_last4_rows + + + vld1.16 d2, [r0]! + vld1.16 d3, [r9]! + vld1.16 d4, [r0], r5 + vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) + vld1.16 d5, [r9], r5 + vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) + vld1.16 d6, [r0]! + vld1.16 d7, [r9]! 
+ vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) + vld1.16 d8, [r0], r10 + vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) + vld1.16 d9, [r9], r10 + vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) + vld1.16 d10, [r0]! + vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) + vld1.16 d11, [r9]! + vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vld1.16 d12, [r0], r5 + vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vld1.16 d13, [r9], r5 + vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vld1.16 d14, [r0]! + vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + vld1.16 d15, [r9]! + vmull.s16 q11, d10, d0[0] @// y4 * cos4(part of c0 and c1) + vld1.16 d16, [r0], r10 + vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) + vld1.16 d17, [r9], r10 + + @/* This following was activated when alignment is not there */ +@// VLD1.16 D2,[r0]! +@// VLD1.16 D3,[r2]! +@// VLD1.16 D4,[r0]! +@// VLD1.16 D5,[r2]! +@// VLD1.16 D6,[r0]! +@// VLD1.16 D7,[r2]! +@// VLD1.16 D8,[r0],r3 +@// VLD1.16 D9,[r2],r3 +@// VLD1.16 D10,[r0]! +@// VLD1.16 D11,[r2]! +@// VLD1.16 D12,[r0]! +@// VLD1.16 D13,[r2]! +@// VLD1.16 D14,[r0]! +@// VLD1.16 D15,[r2]! 
+@// VLD1.16 D16,[r0],r3 +@// VLD1.16 D17,[r2],r3 + + + + + vmlal.s16 q12, d14, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + vmlsl.s16 q13, d14, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + vmlal.s16 q14, d14, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + vmlal.s16 q15, d14, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + vmlsl.s16 q9, d11, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + vmlal.s16 q3, d11, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + vadd.s32 q5, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + vmlal.s16 q12, d15, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7) + vmlsl.s16 q13, d15, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6) + vmlal.s16 q14, d15, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5) + vmlsl.s16 q15, d15, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4) + + vadd.s32 q7, q5, q3 @// a0 = c0 + d0(part of r0,r7) + vsub.s32 q5, q5, q3 @// a3 = c0 - d0(part of r3,r4) + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + vadd.s32 q10, q7, q12 @// a0 + b0(part of r0) + vsub.s32 q3, q7, q12 @// a0 - b0(part of r7) + + vadd.s32 q12, q11, q14 @// a2 + b2(part of r2) + vsub.s32 q11, q11, q14 @// a2 - b2(part of r5) + + vadd.s32 q14, q9, q13 @// a1 + b1(part of r1) + vsub.s32 q9, q9, q13 @// a1 - b1(part of r6) + + vadd.s32 q13, q5, q15 @// a3 + b3(part of r3) + vsub.s32 q15, q5, q15 @// a3 - b3(part of r4) + + vqrshrn.s32 d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d14, q11, #idct_stg1_shift @// r5 = 
(a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + + + b last4_cols + + + +skip_last4_rows: + + + ldr r14, gai2_impeg2_idct_first_col_q15_addr2 +fcq15_lbl2: + add r14, r14, pc + vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data + + vld1.16 d2, [r0]! + vld1.16 d3, [r9]! + vld1.16 d4, [r0], r5 + vld1.16 d5, [r9], r5 + vld1.16 d6, [r0]! + vld1.16 d7, [r9]! + vld1.16 d8, [r0], r10 + vld1.16 d9, [r9], r10 + + + + vmov.s16 q6, #0 + vmov.s16 q8, #0 + + + + + vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) + vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + + vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) + vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) + + vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) + + + vadd.s32 q7, q10, q3 @// a0 = c0 + d0(part of r0,r7) + vsub.s32 q5, q10, q3 @// a3 = c0 - d0(part of r3,r4) + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + vadd.s32 q10, q7, q12 @// a0 + b0(part of r0) + vsub.s32 q3, q7, q12 @// a0 - b0(part of r7) + + vadd.s32 q12, q11, q14 @// a2 + b2(part of r2) + vsub.s32 q11, q11, q14 @// a2 - b2(part of r5) + + vadd.s32 q14, q9, q13 @// a1 + b1(part of r1) + vsub.s32 q9, q9, q13 @// a1 - b1(part of 
r6) + + vadd.s32 q13, q5, q15 @// a3 + b3(part of r3) + vsub.s32 q15, q5, q15 @// a3 - b3(part of r4) + + vqrshrn.s32 d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d14, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + + +last4_cols: + + + cmp r12, #0xf0 + bge skip_last4cols + + ldr r14, gai2_impeg2_idct_first_col_q15_addr3 +fcq15_lbl3: + add r14, r14, pc + vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data + + vmull.s16 q12, d8, d0[1] @// y1 * cos1(part of b0) + vmull.s16 q13, d8, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d8, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d8, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d9, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vmlsl.s16 q13, d9, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vmlsl.s16 q14, d9, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vmlsl.s16 q15, d9, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + + vmull.s16 q9, d5, d1[2] @// y2 * sin2 (Q4 is freed by this time)(part of d1) + vmull.s16 q4, d5, d0[2] @// y2 * cos2(part of d0) + + vmull.s16 q10, d4, d0[0] @// y0 * cos4(part of c0 and c1) + vmull.s16 q11, d12, d0[0] @// y4 * cos4(part of c0 and c1) + + vmlal.s16 q12, d16, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + vmlsl.s16 q13, d16, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + vmlal.s16 q14, d16, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + vmlal.s16 
q15, d16, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + vmlsl.s16 q9, d13, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + vmlal.s16 q4, d13, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + vadd.s32 q6, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + vmlal.s16 q12, d17, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7) + vmlsl.s16 q13, d17, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6) + vmlal.s16 q14, d17, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5) + vmlsl.s16 q15, d17, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4) + + vadd.s32 q8, q6, q4 @// a0 = c0 + d0(part of e0,e7) + vsub.s32 q6, q6, q4 @// a3 = c0 - d0(part of e3,e4) + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of e2,e5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of e1,e6) + + vadd.s32 q10, q8, q12 @// a0 + b0(part of e0) + vsub.s32 q4, q8, q12 @// a0 - b0(part of e7) + + vadd.s32 q12, q11, q14 @// a2 + b2(part of e2) + vsub.s32 q11, q11, q14 @// a2 - b2(part of e5) + + vadd.s32 q14, q9, q13 @// a1 + b1(part of e1) + vsub.s32 q9, q9, q13 @// a1 - b1(part of e6) + + vadd.s32 q13, q6, q15 @// a3 + b3(part of e3) + vsub.s32 q15, q6, q15 @// a3 - b3(part of r4) + + vqrshrn.s32 d4, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d17, q4, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d5, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d16, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d8, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d13, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d9, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + 
vqrshrn.s32 d12, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + b end_skip_last4cols + + + +skip_last4cols: + + + + ldr r14, gai2_impeg2_idct_first_col_q11_addr1 +fcq11_lbl1: + add r14, r14, pc + vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data + + + + vtrn.16 q1, q3 @//[r3,r1],[r2,r0] first qudrant transposing + + vtrn.16 q5, q7 @//[r7,r5],[r6,r4] third qudrant transposing + + + vtrn.32 d6, d7 @//r0,r1,r2,r3 first qudrant transposing continued..... + vtrn.32 d2, d3 @//r0,r1,r2,r3 first qudrant transposing continued..... + + vtrn.32 d10, d11 @//r4,r5,r6,r7 third qudrant transposing continued..... + vtrn.32 d14, d15 @//r4,r5,r6,r7 third qudrant transposing continued..... + + + vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) + vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + + vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) +@ VMULL.S16 Q11,D4,D0[0] ;// y4 * cos4(part of c0 and c1) + + vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) + vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) + + + + + vsub.s32 q11, q10, q3 @// a3 = c0 - d0(part of r3,r4) + vadd.s32 q2, q10, q3 @// a0 = c0 + d0(part of r0,r7) + + + vadd.s32 q1, q2, q12 + + vsub.s32 q3, q2, q12 + + vadd.s32 q4, q11, q15 + + vsub.s32 q12, q11, q15 + + vqrshrn.s32 d5, q4, #idct_stg2_shift + vqrshrn.s32 d2, q1, #idct_stg2_shift + vqrshrn.s32 d9, q3, #idct_stg2_shift + vqrshrn.s32 d6, q12, #idct_stg2_shift + + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + + vadd.s32 q15, q11, q14 + + vsub.s32 
q12, q11, q14 + + vadd.s32 q14, q9, q13 + + vsub.s32 q11, q9, q13 + vqrshrn.s32 d4, q15, #idct_stg2_shift + vqrshrn.s32 d7, q12, #idct_stg2_shift + vqrshrn.s32 d3, q14, #idct_stg2_shift + vqrshrn.s32 d8, q11, #idct_stg2_shift + + + + + + + + + + + vmull.s16 q12, d14, d0[1] @// y1 * cos1(part of b0) + + vmull.s16 q13, d14, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d14, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d14, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d15, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vtrn.16 d2, d3 + vmlsl.s16 q13, d15, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vtrn.16 d4, d5 + vmlsl.s16 q14, d15, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vtrn.16 d6, d7 + vmlsl.s16 q15, d15, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + vtrn.16 d8, d9 + vmull.s16 q10, d10, d0[0] @// y0 * cos4(part of c0 and c1) + vtrn.32 d2, d4 + + vtrn.32 d3, d5 + vmull.s16 q9, d11, d1[2] @// y2 * sin2 (Q7 is freed by this time)(part of d1) + vtrn.32 d6, d8 + vmull.s16 q7, d11, d0[2] @// y2 * cos2(part of d0) + vtrn.32 d7, d9 + + + add r4, r2, r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data + + + add r5, r8, r8, lsl #1 @ + + + add r0, r3, r7, lsl #1 @ r0 points to 3rd row of dest data + + + add r10, r7, r7, lsl #1 @ + + + vswp d3, d6 + + + vswp d5, d8 + + + vsub.s32 q11, q10, q7 @// a3 = c0 - d0(part of r3,r4) + vadd.s32 q6, q10, q7 @// a0 = c0 + d0(part of r0,r7) + + + vadd.s32 q0, q6, q12 + + + vsub.s32 q12, q6, q12 + + + vadd.s32 q6, q11, q15 + + + vsub.s32 q7, q11, q15 + + vqrshrn.s32 d10, q0, #idct_stg2_shift + vqrshrn.s32 d17, q12, #idct_stg2_shift + vqrshrn.s32 d13, q6, #idct_stg2_shift + vqrshrn.s32 d14, q7, #idct_stg2_shift + + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + + vadd.s32 q0, q11, q14 + + + vsub.s32 q12, q11, q14 + + + vadd.s32 q14, q9, q13 + + + vsub.s32 q13, q9, q13 + vld1.8 d18, [r2], r8 + + vqrshrn.s32 d12, q0, #idct_stg2_shift + 
vld1.8 d20, [r2], r5 + + + vqrshrn.s32 d15, q12, #idct_stg2_shift + vld1.8 d19, [r2], r8 + + + + + vqrshrn.s32 d11, q14, #idct_stg2_shift + vld1.8 d22, [r4], r8 + + + + + vqrshrn.s32 d16, q13, #idct_stg2_shift + vld1.8 d21, [r2], r5 + + + b pred_buff_addition +end_skip_last4cols: + + ldr r14, gai2_impeg2_idct_first_col_q11_addr2 +fcq11_lbl2: + add r14, r14, pc + vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data + + +@/* Now the Idct of columns is done, transpose so that row idct done efficiently(step5) */ + vtrn.16 q1, q3 @//[r3,r1],[r2,r0] first qudrant transposing + vtrn.16 q2, q4 @//[r3,r1],[r2,r0] second qudrant transposing + vtrn.16 q5, q7 @//[r7,r5],[r6,r4] third qudrant transposing + vtrn.16 q6, q8 @//[r7,r5],[r6,r4] fourth qudrant transposing + + vtrn.32 d6, d7 @//r0,r1,r2,r3 first qudrant transposing continued..... + vtrn.32 d2, d3 @//r0,r1,r2,r3 first qudrant transposing continued..... + vtrn.32 d4, d5 @//r0,r1,r2,r3 second qudrant transposing continued..... + vtrn.32 d8, d9 @//r0,r1,r2,r3 second qudrant transposing continued..... + vtrn.32 d10, d11 @//r4,r5,r6,r7 third qudrant transposing continued..... + vtrn.32 d14, d15 @//r4,r5,r6,r7 third qudrant transposing continued..... + vtrn.32 d12, d13 @//r4,r5,r6,r7 fourth qudrant transposing continued..... + vtrn.32 d16, d17 @//r4,r5,r6,r7 fourth qudrant transposing continued..... 
+ + @//step6 Operate on first four rows and find their idct + @//Register Usage Reference - storing and IDCT of rows +@// Cosine Constants - D0 +@// Sine Constants - D1 +@// Element 0 First four - D2 - y0 +@// Element 1 First four - D6 - y1 +@// Element 2 First four - D3 - y2 +@// Element 3 First four - D7 - y3 +@// Element 4 First four - D4 - y4 +@// Element 5 First four - D8 - y5 +@// Element 6 First four - D5 - y6 +@// Element 7 First four - D9 - y7 +@// Element 0 Second four - D10 - y0 +@// Element 1 Second four - D14 - y1 +@// Element 2 Second four - D11 - y2 +@// Element 3 Second four - D15 - y3 +@// Element 4 Second four - D12 - y4 +@// Element 5 Second four - D16 - y5 +@// Element 6 Second four - D13 - y6 +@// Element 7 Second four - D17 - y7 + + @// Map between first kernel code seq and current +@// D2 -> D2 +@// D6 -> D6 +@// D3 -> D3 +@// D7 -> D7 +@// D10 -> D4 +@// D14 -> D8 +@// D11 -> D5 +@// D15 -> D9 +@// Q3 -> Q3 +@// Q5 -> Q2 +@// Q7 -> Q4 + + vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) + vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + + vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) + vmull.s16 q11, d4, d0[0] @// y4 * cos4(part of c0 and c1) + + vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) + vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) + + + vmlal.s16 q12, d8, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + vmlsl.s16 q13, d8, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + vmlal.s16 q14, d8, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + vmlal.s16 q15, d8, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * 
cos3(part of b3) + + vmlsl.s16 q9, d5, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + vmlal.s16 q3, d5, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + vadd.s32 q1, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + vmlal.s16 q12, d9, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7) + vmlsl.s16 q13, d9, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6) + vmlal.s16 q14, d9, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5) + vmlsl.s16 q15, d9, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4) + + vsub.s32 q11, q1, q3 @// a3 = c0 - d0(part of r3,r4) + vadd.s32 q2, q1, q3 @// a0 = c0 + d0(part of r0,r7) + + + vadd.s32 q1, q2, q12 + + vsub.s32 q3, q2, q12 + + vadd.s32 q4, q11, q15 + + vsub.s32 q12, q11, q15 + + vqrshrn.s32 d5, q4, #idct_stg2_shift + vqrshrn.s32 d2, q1, #idct_stg2_shift + vqrshrn.s32 d9, q3, #idct_stg2_shift + vqrshrn.s32 d6, q12, #idct_stg2_shift + + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + + vadd.s32 q15, q11, q14 + + vsub.s32 q12, q11, q14 + + vadd.s32 q14, q9, q13 + + vsub.s32 q11, q9, q13 + vqrshrn.s32 d4, q15, #idct_stg2_shift + vqrshrn.s32 d7, q12, #idct_stg2_shift + vqrshrn.s32 d3, q14, #idct_stg2_shift + vqrshrn.s32 d8, q11, #idct_stg2_shift + + + + + + + + + + + vmull.s16 q12, d14, d0[1] @// y1 * cos1(part of b0) + + vmull.s16 q13, d14, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d14, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d14, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d15, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vtrn.16 d2, d3 + vmlsl.s16 q13, d15, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vtrn.16 d4, d5 + vmlsl.s16 q14, d15, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vtrn.16 d6, d7 + vmlsl.s16 q15, d15, d1[1] @// y1 * sin1 
- y3 * sin3(part of b3) + vtrn.16 d8, d9 + vmull.s16 q10, d10, d0[0] @// y0 * cos4(part of c0 and c1) + vtrn.32 d2, d4 + vmull.s16 q11, d12, d0[0] @// y4 * cos4(part of c0 and c1) + vtrn.32 d3, d5 + vmull.s16 q9, d11, d1[2] @// y2 * sin2 (Q7 is freed by this time)(part of d1) + vtrn.32 d6, d8 + vmull.s16 q7, d11, d0[2] @// y2 * cos2(part of d0) + vtrn.32 d7, d9 + vmlal.s16 q12, d16, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + + add r4, r2, r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data + vmlsl.s16 q13, d16, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + + add r5, r8, r8, lsl #1 @ + vmlal.s16 q14, d16, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + + add r0, r3, r7, lsl #1 @ r0 points to 3rd row of dest data + vmlal.s16 q15, d16, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + add r10, r7, r7, lsl #1 @ + vmlsl.s16 q9, d13, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + + + vmlal.s16 q7, d13, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + vadd.s32 q6, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + vmlal.s16 q12, d17, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7) + vswp d3, d6 + vmlsl.s16 q13, d17, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6) + + vswp d5, d8 + vmlal.s16 q14, d17, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5) + vmlsl.s16 q15, d17, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4) + + vsub.s32 q11, q6, q7 @// a3 = c0 - d0(part of r3,r4) + vadd.s32 q6, q6, q7 @// a0 = c0 + d0(part of r0,r7) + + + vadd.s32 q0, q6, q12 + + + vsub.s32 q12, q6, q12 + + + vadd.s32 q6, q11, q15 + + + vsub.s32 q7, q11, q15 + + vqrshrn.s32 d10, q0, #idct_stg2_shift + vqrshrn.s32 d17, q12, #idct_stg2_shift + vqrshrn.s32 d13, q6, #idct_stg2_shift + vqrshrn.s32 d14, q7, 
#idct_stg2_shift + + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + + vadd.s32 q0, q11, q14 + + + vsub.s32 q12, q11, q14 + + + vadd.s32 q14, q9, q13 + + + vsub.s32 q13, q9, q13 + vld1.8 d18, [r2], r8 + + vqrshrn.s32 d12, q0, #idct_stg2_shift + vld1.8 d20, [r2], r5 + + + vqrshrn.s32 d15, q12, #idct_stg2_shift + vld1.8 d19, [r2], r8 + + + + + vqrshrn.s32 d11, q14, #idct_stg2_shift + vld1.8 d22, [r4], r8 + + + + + vqrshrn.s32 d16, q13, #idct_stg2_shift + vld1.8 d21, [r2], r5 + + + + +pred_buff_addition: + + + vtrn.16 d10, d11 + vld1.8 d24, [r4], r5 + + vtrn.16 d12, d13 + vld1.8 d23, [r4], r8 + + vaddw.u8 q1, q1, d18 + vld1.8 d25, [r4], r5 + + vtrn.16 d14, d15 + vaddw.u8 q2, q2, d22 + + vtrn.16 d16, d17 + vaddw.u8 q3, q3, d20 + + vtrn.32 d10, d12 + vaddw.u8 q4, q4, d24 + + vtrn.32 d11, d13 + vtrn.32 d14, d16 + vtrn.32 d15, d17 + + vswp d11, d14 + vswp d13, d16 + +@ Row values stored in the q register. + +@Q1 :r0 +@Q3: r1 +@Q2: r2 +@Q4: r3 +@Q5: r4 +@Q7: r5 +@Q6: r6 +@Q8: r7 + + + +@/// Adding the prediction buffer + + + + + + + + + + @ Load prediction data + + + + + + @Adding recon with prediction + + + + + + vaddw.u8 q5, q5, d19 + vqmovun.s16 d2, q1 + vaddw.u8 q7, q7, d21 + vqmovun.s16 d4, q2 + vaddw.u8 q6, q6, d23 + vqmovun.s16 d6, q3 + vaddw.u8 q8, q8, d25 + vqmovun.s16 d8, q4 + + + + + + + + vst1.8 {d2}, [r3], r7 + vqmovun.s16 d10, q5 + vst1.8 {d6}, [r3], r10 + vqmovun.s16 d14, q7 + vst1.8 {d4}, [r0], r7 + vqmovun.s16 d12, q6 + vst1.8 {d8}, [r0], r10 + vqmovun.s16 d16, q8 + + + + + + + + vst1.8 {d10}, [r3], r7 + vst1.8 {d14}, [r3], r10 + vst1.8 {d12}, [r0], r7 + vst1.8 {d16}, [r0], r10 + + + + + sub sp, sp, #40 + ldmfd sp!, {r4-r12, pc} + + + diff --git a/common/arm/impeg2_inter_pred.s b/common/arm/impeg2_inter_pred.s new file mode 100644 index 0000000..f1b3dde --- /dev/null +++ b/common/arm/impeg2_inter_pred.s @@ -0,0 +1,801 @@ 
+@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ + +@/* +@//---------------------------------------------------------------------------- +@// File Name : impeg2_inter_pred.s +@// +@// Description : This file has motion compensation related +@// interpolation functions on Neon + CortexA-8 platform +@// +@// Reference Document : +@// +@// Revision History : +@// Date Author Detail Description +@// ------------ ---------------- ---------------------------------- +@// 18 jun 2010 S Hamsalekha Created +@// +@//------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Include Files +@// ---------------------------------------------------------------------------- +@*/ +.text +.p2align 2 + + +@/* +@// ---------------------------------------------------------------------------- +@// Struct/Union Types and Define +@// ---------------------------------------------------------------------------- +@*/ + + +@/* +@// ---------------------------------------------------------------------------- +@// Static Global Data section variables +@// 
---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + + +@/* +@// ---------------------------------------------------------------------------- +@// Static Prototype Functions +@// ---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Exported functions +@// ---------------------------------------------------------------------------- +@*/ + +@//--------------------------------------------------------------------------- +@// Function Name : impeg2_copy_mb_a9q() +@// +@// Detail Description : Copies one MB worth of data from src to the dst +@// +@// Inputs : r0 - pointer to src +@// r1 - pointer to dst +@// r2 - source width +@// r3 - destination width +@// Registers Used : r4, r5, d0, d1 +@// +@// Stack Usage : 12 bytes +@// +@// Outputs : +@// +@// Return Data : None +@// +@// Programming Note : <program limitation> +@//----------------------------------------------------------------------------- +@*/ + + + + .global impeg2_copy_mb_a9q + + +impeg2_copy_mb_a9q: + + stmfd r13!, {r4, r5, r14} + + + ldr r4, [r0] @src->y + ldr r5, [r1] @dst->y + @Read one row of data from the src + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + + @//Repeat 15 times for y + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and 
increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + + mov r2, r2, lsr #1 @src_offset /= 2 + mov r3, r3, lsr #1 @dst_offset /= 2 + + ldr r4, [r0, #4] @src->u + ldr r5, [r1, #4] @dst->u + @Read one row of data from the src + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + + @//Repeat 7 times for u + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + 
vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + + ldr r4, [r0, #8] @src->v + ldr r5, [r1, #8] @dst->v + @Read one row of data from the src + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + + @//Repeat 7 times for v + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + + ldmfd r13!, {r4, r5, pc} + + + + +@/* +@//--------------------------------------------------------------------------- +@// Function Name : impeg2_mc_fullx_halfy_8x8_a9q() +@// +@// Detail Description : This function pastes the reference block in the +@// current frame buffer.This function is called for +@// blocks that are not coded and have motion vectors +@// with a half pel resolution. 
+@//
+@// Inputs : r0 - out : Current Block Pointer
+@// r1 - ref : Reference Block Pointer
+@// r2 - ref_wid : Reference Block Width
+@// r3 - out_wid : Current Block Width
+@//
+@// Registers Used : D0-D9
+@//
+@// Stack Usage : 4 bytes
+@//
+@// Outputs : The Motion Compensated Block
+@//
+@// Return Data : None
+@//
+@// Programming Note : <program limitation>
+@//-----------------------------------------------------------------------------
+@*/
+
+ .global impeg2_mc_fullx_halfy_8x8_a9q
+
+impeg2_mc_fullx_halfy_8x8_a9q:
+
+ stmfd r13!, {r14} @// save return address (only callee-saved reg touched)
+ add r14, r1, r2 @// r14 = ref + ref_wid: r1/r14 walk even/odd source rows
+ mov r2, r2, lsl #1 @// both source pointers now step by 2 * ref_wid
+
+@/* Load 8 + 1 rows from reference block */
+@/* Do the addition without rounding off as rounding value is 1 */
+@/* Each output row i is vrhadd (rounding halving add) of input rows i and i+1 */
+ vld1.8 {d0}, [r1], r2 @// first row hence r1 = D0
+ vld1.8 {d2}, [r14], r2 @// second row hence r2 = D2
+ vld1.8 {d4}, [r1], r2 @// third row hence r3 = D4
+ vld1.8 {d6}, [r14], r2 @// fourth row hence r4 = D6
+ vld1.8 {d1}, [r1], r2 @// fifth row hence r5 = D1
+ vld1.8 {d3}, [r14], r2 @// sixth row hence r6 = D3
+ vrhadd.u8 d9, d1, d6 @// estimated row 4 = D9 (must precede the Q0 average below, which overwrites D1)
+ vld1.8 {d5}, [r1], r2 @// seventh row hence r7 = D5
+ vrhadd.u8 q0, q0, q1 @// estimated row 1 = D0, row 5 = D1
+ vld1.8 {d7}, [r14], r2 @// eighth row hence r8 = D7
+ vrhadd.u8 q1, q1, q2 @// estimated row 2 = D2, row 6 = D3
+ vld1.8 {d8}, [r1], r2 @// ninth row hence r9 = D8
+ vrhadd.u8 q2, q2, q3 @// estimated row 3 = D4, row 7 = D5
+
+ add r14, r0, r3 @// r14 = out + out_wid: r0/r14 walk even/odd destination rows
+ mov r3, r3, lsl #1 @// both destination pointers now step by 2 * out_wid
+
+@/* Store the eight rows calculated above */
+ vst1.8 {d2}, [r14], r3 @// second row hence D2
+ vrhadd.u8 d7, d7, d8 @// estimated row 8 = D7
+ vst1.8 {d0}, [r0], r3 @// first row hence D0
+ vst1.8 {d9}, [r14], r3 @// fourth row hence D9
+ vst1.8 {d4}, [r0], r3 @// third row hence D4
+ vst1.8 {d3}, [r14], r3 @// sixth row hence r6 = D3
+ vst1.8 {d1}, [r0], r3 @// fifth row hence r5 = D1
+ vst1.8 {d7}, [r14], r3 @// eighth row hence r8 = D7
+ vst1.8 {d5}, [r0], r3 @// seventh row hence r7 = D5
+
+ ldmfd sp!, {pc} @// pop saved lr straight into pc: return
+
+
+
+
+
+
+@/*
+@//--------------------------------------------------------------------------- +@// Function Name : impeg2_mc_halfx_fully_8x8_a9q() +@// +@// Detail Description : This function pastes the reference block in the +@// current frame buffer.This function is called for +@// blocks that are not coded and have motion vectors +@// with a half pel resolutionand VopRoundingType is 0 .. +@// +@// Inputs : r0 - out : Current Block Pointer +@// r1 - ref : Refernce Block Pointer +@// r2 - ref_wid : Refernce Block Width +@// r3 - out_wid ; Current Block Width +@// +@// Registers Used : r12, r14, d0-d10, d12-d14, d16-d18, d20-d22 + +@// +@// Stack Usage : 8 bytes +@// +@// Outputs : The Motion Compensated Block +@// +@// Return Data : None +@// +@// Programming Note : <program limitation> +@//----------------------------------------------------------------------------- +@*/ + + + + .global impeg2_mc_halfx_fully_8x8_a9q + + + +impeg2_mc_halfx_fully_8x8_a9q: + + stmfd sp!, {r12, lr} + + add r14, r1, r2, lsl #2 + + add r12, r0, r3, lsl#2 + + vld1.8 {d0, d1}, [r1], r2 @load 16 pixels of row1 + + vld1.8 {d2, d3}, [r14], r2 @ row5 + + + vld1.8 {d4, d5}, [r1], r2 @load 16 pixels row2 + + vld1.8 {d6, d7}, [r14], r2 @row6 + + + vext.8 d8, d0, d1, #1 @Extract pixels (1-8) of row1 + + vext.8 d12, d2, d3, #1 @Extract pixels (1-8) of row5 + + vext.8 d16, d4, d5, #1 @Extract pixels (1-8) of row2 + + vext.8 d20, d6, d7, #1 @Extract pixels (1-8) of row6 + + + vld1.8 {d9, d10}, [r1], r2 @load row3 + + vld1.8 {d13, d14}, [r14], r2 @load row7 + + vld1.8 {d17, d18}, [r1], r2 @load row4 + + vld1.8 {d21, d22}, [r14], r2 @load row8 + + + vext.8 d1, d9, d10, #1 @Extract pixels (1-8) of row3 + + vext.8 d3, d13, d14, #1 @Extract pixels (1-8) of row7 + + + + vext.8 d5, d17, d18, #1 @Extract pixels (1-8) of row4 + + vext.8 d7, d21, d22, #1 @Extract pixels (1-8) of row8 + + + vrhadd.u8 q0, q0, q4 @operate on row1 and row3 + + vrhadd.u8 q1, q1, q6 @operate on row5 and row7 + + + vrhadd.u8 q2, q2, q8 @operate 
on row2 and row4 + + + + vrhadd.u8 q3, q3, q10 @operate on row6 and row8 + + vst1.8 d0, [r0], r3 @store row1 + + vst1.8 d2, [r12], r3 @store row5 + + vst1.8 d4, [r0], r3 @store row2 + + vst1.8 d6, [r12], r3 @store row6 + + vst1.8 d1, [r0], r3 @store row3 + + vst1.8 d3, [r12], r3 @store row7 + + vst1.8 d5, [r0], r3 @store row4 + + vst1.8 d7, [r12], r3 @store row8 + + + + ldmfd sp!, {r12, pc} + + + + + + + + +@/* +@//--------------------------------------------------------------------------- +@// Function Name : impeg2_mc_halfx_halfy_8x8_a9q() +@// +@// Detail Description : This function pastes the reference block in the +@// current frame buffer.This function is called for +@// blocks that are not coded and have motion vectors +@// with a half pel resolutionand VopRoundingType is 0 .. +@// +@// Inputs : r0 - out : Current Block Pointer +@// r1 - ref : Refernce Block Pointer +@// r2 - ref_wid : Refernce Block Width +@// r3 - out_wid ; Current Block Width +@// +@// Registers Used : r14, q0-q15 + +@// +@// Stack Usage : 4 bytes +@// +@// Outputs : The Motion Compensated Block +@// +@// Return Data : None +@// +@// Programming Note : <program limitation> +@//----------------------------------------------------------------------------- +@*/ + + + .global impeg2_mc_halfx_halfy_8x8_a9q + +impeg2_mc_halfx_halfy_8x8_a9q: + + stmfd sp!, {r14} + + add r14, r1, r2, lsl #2 + + vld1.8 {d0, d1}, [r1], r2 @load 16 pixels of row1 + + vld1.8 {d2, d3}, [r14], r2 @ row5 + + vld1.8 {d4, d5}, [r1], r2 @load 16 pixels row2 + + vld1.8 {d6, d7}, [r14], r2 @row6 + + vext.8 d1, d0, d1, #1 @Extract pixels (1-8) of row1 + + + + vext.8 d3, d2, d3, #1 @Extract pixels (1-8) of row5 + + + + vext.8 d5, d4, d5, #1 @Extract pixels (1-8) of row2 + + vext.8 d7, d6, d7, #1 @Extract pixels (1-8) of row6 + + + + + vld1.8 {d8, d9}, [r1], r2 @load row3 + + + + vld1.8 {d10, d11}, [r14], r2 @load row7 + + vld1.8 {d12, d13}, [r1], r2 @load row4 + + vld1.8 {d14, d15}, [r14], r2 @load row8 + + vext.8 d9, d8, d9, 
#1 @Extract pixels (1-8) of row3 + + vld1.8 {d16, d17}, [r14], r2 @load row9 + + + + + + vext.8 d11, d10, d11, #1 @Extract pixels (1-8) of row7 + + + + vext.8 d13, d12, d13, #1 @Extract pixels (1-8) of row4 + + + + vext.8 d15, d14, d15, #1 @Extract pixels (1-8) of row8 + + vext.8 d17, d16, d17, #1 @Extract pixels (1-8) of row9 + + + @interpolation in x direction + + vaddl.u8 q0, d0, d1 @operate row1 + + vaddl.u8 q1, d2, d3 @operate row5 + + vaddl.u8 q2, d4, d5 @operate row2 + + vaddl.u8 q3, d6, d7 @operate row6 + + vaddl.u8 q4, d8, d9 @operate row3 + + vaddl.u8 q5, d10, d11 @operate row7 + + vaddl.u8 q6, d12, d13 @operate row4 + + vaddl.u8 q7, d14, d15 @operate row8 + + vaddl.u8 q8, d16, d17 @operate row9 + + @interpolation in y direction + + add r14, r0, r3, lsl #2 + + + + vadd.u16 q9, q0, q2 @operate row1 and row2 + + vadd.u16 q13, q1, q3 @operate row5 and row6 + + vadd.u16 q10, q2, q4 @operate row2 and row3 + + vadd.u16 q14, q3, q5 @operate row6 and row7 + + vrshrn.u16 d18, q9, #2 @row1 + + vrshrn.u16 d26, q13, #2 @row5 + + vrshrn.u16 d20, q10, #2 @row2 + + vrshrn.u16 d28, q14, #2 @row6 + + vadd.u16 q11, q4, q6 @operate row3 and row4 + + vst1.8 d18, [r0], r3 @store row1 + + vadd.u16 q15, q5, q7 @operate row7 and row8 + + vst1.8 d26, [r14], r3 @store row5 + + vadd.u16 q12, q6, q1 @operate row4 and row5 + + vst1.8 d20, [r0], r3 @store row2 + + vadd.u16 q7, q7, q8 @operate row8 and row9 + + vst1.8 d28, [r14], r3 @store row6 + + + + vrshrn.u16 d22, q11, #2 @row3 + + vrshrn.u16 d30, q15, #2 @row7 + + vrshrn.u16 d24, q12, #2 @row4 + + vrshrn.u16 d14, q7, #2 @row8 + + + vst1.8 d22, [r0], r3 @store row3 + vst1.8 d30, [r14], r3 @store row7 + vst1.8 d24, [r0], r3 @store row4 + vst1.8 d14, [r14], r3 @store row8 + + + + ldmfd sp!, {pc} + + + + + +@/* +@//--------------------------------------------------------------------------- +@// Function Name : impeg2_mc_fullx_fully_8x8_a9q() +@// +@// Detail Description : This function pastes the reference block in the +@// current 
frame buffer.This function is called for
+@// blocks that are not coded and have motion vectors
+@// with a full pel resolution (plain 8x8 copy, no interpolation).
+@//
+@// Inputs : r0 - out : Current Block Pointer
+@// r1 - ref : Reference Block Pointer
+@// r2 - ref_wid : Reference Block Width
+@// r3 - out_wid : Current Block Width
+@//
+@// Registers Used : r12, r14, d0-d3
+
+@//
+@// Stack Usage : 8 bytes
+@//
+@// Outputs : The Motion Compensated Block
+@//
+@// Return Data : None
+@//
+@// Programming Note : <program limitation>
+@//-----------------------------------------------------------------------------
+@*/
+
+
+ .global impeg2_mc_fullx_fully_8x8_a9q
+impeg2_mc_fullx_fully_8x8_a9q:
+
+
+ stmfd sp!, {r12, lr}
+
+ add r14, r1, r2, lsl #2 @ r14 = ref + 4*ref_wid: second source stream (rows 5-8)
+
+ add r12, r0, r3, lsl #2 @ r12 = out + 4*out_wid: second destination stream (rows 5-8)
+
+
+ vld1.8 d0, [r1], r2 @load row1
+
+ vld1.8 d1, [r14], r2 @load row5
+
+ vld1.8 d2, [r1], r2 @load row2
+
+ vld1.8 d3, [r14], r2 @load row6
+
+
+ vst1.8 d0, [r0], r3 @store row1
+
+ vst1.8 d1, [r12], r3 @store row5
+
+ vst1.8 d2, [r0], r3 @store row2
+
+ vst1.8 d3, [r12], r3 @store row6
+
+
+ vld1.8 d0, [r1], r2 @load row3
+
+ vld1.8 d1, [r14], r2 @load row7
+
+ vld1.8 d2, [r1], r2 @load row4
+
+ vld1.8 d3, [r14], r2 @load row8
+
+
+ vst1.8 d0, [r0], r3 @store row3
+
+ vst1.8 d1, [r12], r3 @store row7
+
+ vst1.8 d2, [r0], r3 @store row4
+
+ vst1.8 d3, [r12], r3 @store row8
+
+
+ ldmfd sp!, {r12, pc} @ restore r12, pop saved lr into pc: return
+
+
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_interpolate_a9q()
+@//
+@// Detail Description : interpolates two buffers and adds pred
+@//
+@// Inputs : r0 - pointer to src1
+@// r1 - pointer to src2
+@// r2 - dest buf
+@// r3 - dst stride
+@// Registers Used : r4, r5, r7, r14, d0-d15
+@//
+@// Stack Usage : 20 bytes
+@//
+@// Outputs : The Motion Compensated Block
+@//
+@// Return Data : None
+@//
+@// Programming Note : <program limitation>
+@//-----------------------------------------------------------------------------
+@*/
+
+
+ .global
impeg2_interpolate_a9q + + +impeg2_interpolate_a9q: + + stmfd r13!, {r4, r5, r7, r12, r14} + + ldr r4, [r0, #0] @ptr_y src1 + + ldr r5, [r1, #0] @ptr_y src2 + + ldr r7, [r2, #0] @ptr_y dst buf + + mov r12, #4 @counter for number of blocks + + +interp_lumablocks_stride: + + vld1.8 {d0, d1}, [r4]! @row1 src1 + + vld1.8 {d2, d3}, [r4]! @row2 src1 + + vld1.8 {d4, d5}, [r4]! @row3 src1 + + vld1.8 {d6, d7}, [r4]! @row4 src1 + + + vld1.8 {d8, d9}, [r5]! @row1 src2 + + vld1.8 {d10, d11}, [r5]! @row2 src2 + + vld1.8 {d12, d13}, [r5]! @row3 src2 + + vld1.8 {d14, d15}, [r5]! @row4 src2 + + + + + vrhadd.u8 q0, q0, q4 @operate on row1 + + vrhadd.u8 q1, q1, q5 @operate on row2 + + vrhadd.u8 q2, q2, q6 @operate on row3 + + vrhadd.u8 q3, q3, q7 @operate on row4 + + + + vst1.8 {d0, d1}, [r7], r3 @row1 + + vst1.8 {d2, d3}, [r7], r3 @row2 + + vst1.8 {d4, d5}, [r7], r3 @row3 + + vst1.8 {d6, d7}, [r7], r3 @row4 + + subs r12, r12, #1 + + bne interp_lumablocks_stride + + + mov r3, r3, lsr #1 @stride >> 1 + + ldr r4, [r0, #4] @ptr_u src1 + + ldr r5, [r1, #4] @ptr_u src2 + + ldr r7 , [r2, #4] @ptr_u dst buf + + mov r12, #2 @counter for number of blocks + + + +@chroma blocks + +interp_chromablocks_stride: + + vld1.8 {d0, d1}, [r4]! @row1 & 2 src1 + + vld1.8 {d2, d3}, [r4]! @row3 & 4 src1 + + vld1.8 {d4, d5}, [r4]! @row5 & 6 src1 + + vld1.8 {d6, d7}, [r4]! @row7 & 8 src1 + + + vld1.8 {d8, d9}, [r5]! @row1 & 2 src2 + + vld1.8 {d10, d11}, [r5]! @row3 & 4 src2 + + vld1.8 {d12, d13}, [r5]! @row5 & 6 src2 + + vld1.8 {d14, d15}, [r5]! 
@row7 & 8 src2 + + + + + vrhadd.u8 q0, q0, q4 @operate on row1 & 2 + + vrhadd.u8 q1, q1, q5 @operate on row3 & 4 + + vrhadd.u8 q2, q2, q6 @operate on row5 & 6 + + vrhadd.u8 q3, q3, q7 @operate on row7 & 8 + + + vst1.8 {d0}, [r7], r3 @row1 + + vst1.8 {d1}, [r7], r3 @row2 + + vst1.8 {d2}, [r7], r3 @row3 + + vst1.8 {d3}, [r7], r3 @row4 + + vst1.8 {d4}, [r7], r3 @row5 + + vst1.8 {d5}, [r7], r3 @row6 + + vst1.8 {d6}, [r7], r3 @row7 + + vst1.8 {d7}, [r7], r3 @row8 + + + + ldr r4, [r0, #8] @ptr_v src1 + + ldr r5, [r1, #8] @ptr_v src2 + + ldr r7, [r2, #8] @ptr_v dst buf + + subs r12, r12, #1 + + bne interp_chromablocks_stride + + + ldmfd r13!, {r4, r5, r7, r12, pc} + + + + + diff --git a/common/arm/impeg2_mem_func.s b/common/arm/impeg2_mem_func.s new file mode 100755 index 0000000..869b7d7 --- /dev/null +++ b/common/arm/impeg2_mem_func.s @@ -0,0 +1,177 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ + +@/* +@//---------------------------------------------------------------------------- +@// File Name : impeg2_mem_func.s +@// +@// Description : This file has motion compensation related +@// interpolation functions on Neon + CortexA-8 platform +@// +@// Reference Document : +@// +@// Revision History : +@// Date Author Detail Description +@// ------------ ---------------- ---------------------------------- +@// 18 jun 2010 S Hamsalekha Created +@// +@//------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Include Files +@// ---------------------------------------------------------------------------- +@*/ +.text +.p2align 2 + + +@/* +@// ---------------------------------------------------------------------------- +@// Struct/Union Types and Define +@// ---------------------------------------------------------------------------- +@*/ + + +@/* +@// ---------------------------------------------------------------------------- +@// Static Global Data section variables +@// ---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + + +@/* +@// ---------------------------------------------------------------------------- +@// Static Prototype Functions +@// ---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Exported functions +@// ---------------------------------------------------------------------------- +@*/ + +@/* +@//--------------------------------------------------------------------------- +@// Function Name : impeg2_memset_8bit_8x8_block_a9q() +@// +@// Detail Description : This routine intialises the Block matrix 
buffer contents to a
+@// particular Value. This function also assumes the buffer size
+@// to be set is 64 Bytes fixed. It also assumes that blk matrix
+@// used is 64 bit aligned.
+@//
+@// Inputs : r0: pi2_blk_mat : Block Pointer
+@// r1: u2_val : Value with which the block is initialized
+@// r2: u4_dst_width: Destination Width
+@//
+@// Registers Used : q0
+@//
+@// Stack Usage : 4 bytes
+@//
+@// Outputs : Block Matrix Initialized to given value
+@//
+@// Return Data : None
+@//
+@// Programming Note : None
+@//-----------------------------------------------------------------------------
+@*/
+ .global impeg2_memset_8bit_8x8_block_a9q
+impeg2_memset_8bit_8x8_block_a9q:
+ str lr, [sp, #-4]! @//save return address
+
+ vdup.8 d0, r1 @//r1 is the 8-bit value to be set into (low byte of r1 replicated across all 8 lanes of d0)
+
+ @//write the 8-byte fill pattern to 8 rows, advancing by the destination width each time
+ vst1.8 {d0}, [r0], r2 @//Store the row 1
+ vst1.8 {d0}, [r0], r2 @//Store the row 2
+ vst1.8 {d0}, [r0], r2 @//Store the row 3
+ vst1.8 {d0}, [r0], r2 @//Store the row 4
+ vst1.8 {d0}, [r0], r2 @//Store the row 5
+ vst1.8 {d0}, [r0], r2 @//Store the row 6
+ vst1.8 {d0}, [r0], r2 @//Store the row 7
+ vst1.8 {d0}, [r0], r2 @//Store the row 8
+
+ ldr pc, [sp], #4 @//pop saved lr into pc: return
+
+
+
+
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_memset0_16bit_8x8_linear_block_a9q()
+@//
+@// Detail Description : memsets 128 byte long linear buf to 0
+@//
+@// Inputs : r0 - Buffer
+@// Registers Used : q0
+
+@//
+@// Stack Usage : 4 bytes
+@//
+@// Outputs : None
+@//
+@// Return Data : None
+@//
+@// Programming Note : <program limitation>
+@//-----------------------------------------------------------------------------
+@*/
+
+
+
+ .global impeg2_memset0_16bit_8x8_linear_block_a9q
+
+
+impeg2_memset0_16bit_8x8_linear_block_a9q:
+
+ stmfd r13!, {r14} @save return address
+
+ vmov.i16 q0, #0 @q0 = sixteen zero bytes (eight zero 16-bit lanes)
+
+@Y data
+
+ @zero 8 rows of 8 16-bit coefficients each (128 bytes, contiguous)
+ vst1.16 {d0, d1} , [r0]! @row1
+
+ vst1.16 {d0, d1} , [r0]! @row2
+
+ vst1.16 {d0, d1} , [r0]! @row3
+
+ vst1.16 {d0, d1} , [r0]! @row4
+
+ vst1.16 {d0, d1} , [r0]!
@row5 + + vst1.16 {d0, d1} , [r0]! @row6 + + vst1.16 {d0, d1} , [r0]! @row7 + + vst1.16 {d0, d1} , [r0]! @row8 + + + + ldmfd r13!, {pc} + + + + diff --git a/common/arm/impeg2_platform_macros.h b/common/arm/impeg2_platform_macros.h new file mode 100644 index 0000000..11db302 --- /dev/null +++ b/common/arm/impeg2_platform_macros.h @@ -0,0 +1,75 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +#ifndef __IMPEG2_PLATFORM_MACROS_H__ +#define __IMPEG2_PLATFORM_MACROS_H__ + + +#define CONV_LE_TO_BE(u4_temp2,u4_temp1) u4_temp2 = \ + (u4_temp1 << 24) | \ + ((u4_temp1 & 0xff00) << 8) | \ + ((u4_temp1 & 0xff0000) >> 8) | \ + (u4_temp1 >> 24); + +static __inline UWORD32 CLZ(UWORD32 u4_word) +{ + if(u4_word) + return (__builtin_clz(u4_word)); + else + return 32; +} +static __inline WORD32 CLIP_U8(WORD32 x) +{ + asm("usat %0, #8, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S8(WORD32 x) +{ + asm("ssat %0, #8, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U12(WORD32 x) +{ + asm("usat %0, #12, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S12(WORD32 x) +{ + asm("ssat %0, #12, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U16(WORD32 x) +{ + asm("usat %0, #16, %1" : "=r"(x) : "r"(x)); + return x; +} +static __inline WORD32 CLIP_S16(WORD32 x) +{ + asm("ssat %0, #16, %1" : "=r"(x) : "r"(x)); + return x; +} + +#define INLINE +#define PLD(x) __pld(x) + +#endif /* __IMPEG2_PLATFORM_MACROS_H__ */ diff --git a/common/armv8/impeg2_format_conv.s b/common/armv8/impeg2_format_conv.s new file mode 100644 index 0000000..48baf04 --- /dev/null +++ b/common/armv8/impeg2_format_conv.s @@ -0,0 +1,409 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ + +///* +////---------------------------------------------------------------------------- +//// File Name : impeg2_format_conv.s +//// +//// Description : This file has the Idct Implementations for the +//// MPEG4 SP decoder on neon platform. +//// +//// Reference Document : +//// +//// Revision History : +//// Date Author Detail Description +//// ------------ ---------------- ---------------------------------- +//// Jul 07, 2008 Naveen Kumar T Created +//// +////------------------------------------------------------------------------- +//*/ + +///* +//// ---------------------------------------------------------------------------- +//// Include Files +//// ---------------------------------------------------------------------------- +//*/ +.set log2_16 , 4 +.set log2_2 , 1 + +.text +.include "impeg2_neon_macros.s" +///* +//// ---------------------------------------------------------------------------- +//// Struct/Union Types and Define +//// ---------------------------------------------------------------------------- +//*/ + +///* +//// ---------------------------------------------------------------------------- +//// Static Global Data section variables +//// ---------------------------------------------------------------------------- +//*/ +////--------------------------- NONE -------------------------------------------- + +///* +//// ---------------------------------------------------------------------------- +//// Static Prototype Functions +//// ---------------------------------------------------------------------------- +//*/ +//// -------------------------- NONE -------------------------------------------- + +///* +//// 
---------------------------------------------------------------------------- +//// Exported functions +//// ---------------------------------------------------------------------------- +//*/ + + +///***************************************************************************** +//* * +//* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8() * +//* * +//* Description : This function conversts the image from YUV420P color * +//* space to 420SP color space(UV interleaved). * +//* * +//* Arguments : x0 pu1_y * +//* x1 pu1_u * +//* x2 pu1_v * +//* x3 pu1_dest_y * +//* x4 pu1_dest_uv * +//* x5 u2_height * +//* x6 u2_width * +//* x7 u2_stridey * +//* sp, #80 u2_strideu * +//* sp, #88 u2_stridev * +//* sp, #96 u2_dest_stride_y * +//* sp, #104 u2_dest_stride_uv * +//* sp, #112 convert_uv_only * +//* * +//* Values Returned : None * +//* * +//* Register Usage : x8, x10, x16, x20, v0, v1 * +//* * +//* Stack Usage : 80 Bytes * +//* * +//* Interruptibility : Interruptible * +//* * +//* Known Limitations * +//* Assumptions: Image Width: Assumed to be multiple of 16 and * +//* greater than or equal to 16 * +//* Image Height: Assumed to be even. * +//* * +//* Revision History : * +//* DD MM YYYY Author(s) Changes (Describe the changes made) * +//* 07 06 2010 Varshita Draft * +//* 07 06 2010 Naveen Kr T Completed * +//* * +//*****************************************************************************/ +.global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8 +impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8: + + //// push the registers on the stack + // pu1_y, - x0 + // pu1_u, - x1 + // pu1_v, - x2 + // pu1_dest_y, - x3 + // pu1_dest_uv, - x4 + // u2_height, - x5 + // u2_width, - x6 + // u2_stridey, - x7 + // u2_strideu, - sp, #80 + // u2_stridev, - sp, #88 + // u2_dest_stride_y, - sp, #96 + // u2_dest_stride_uv, - sp, #104 + // convert_uv_only - sp, #112 + // STMFD sp!,{x4-x12,x14} + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + ldr w14, [sp, #112] //// Load convert_uv_only + + cmp w14, #1 + beq yuv420sp_uv_chroma + ///* Do the preprocessing before the main loops start */ + //// Load the parameters from stack + + ldr w8, [sp, #96] //// Load u2_dest_stride_y from stack + uxtw x8, w8 + + sub x7, x7, x6 //// Source increment + + sub x8, x8, x6 //// Destination increment + + +yuv420sp_uv_row_loop_y: + mov x16, x6 + +yuv420sp_uv_col_loop_y: + prfm pldl1keep, [x0, #128] + ld1 {v0.8b, v1.8b}, [x0], #16 + st1 {v0.8b, v1.8b}, [x3], #16 + sub x16, x16, #16 + cmp x16, #15 + bgt yuv420sp_uv_col_loop_y + + cmp x16, #0 + beq yuv420sp_uv_row_loop__y + ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + ////Ex if width is 162, above loop will process 160 pixels. And + ////Both source and destination will point to 146th pixel and then 16 bytes will be read + //// and written using VLD1 and VST1 + sub x20, x16, #16 + neg x16, x20 + sub x0, x0, x16 + sub x3, x3, x16 + + ld1 {v0.8b, v1.8b}, [x0], #16 + st1 {v0.8b, v1.8b}, [x3], #16 + +yuv420sp_uv_row_loop__y: + add x0, x0, x7 + add x3, x3, x8 + subs x5, x5, #1 + bgt yuv420sp_uv_row_loop_y + +yuv420sp_uv_chroma: + ldr w7, [sp, #88] //// Load u2_strideu from stack + sxtw x7, w7 + + ldr w8, [sp, #104] //// Load u2_dest_stride_uv from stack + sxtw x8, w8 + + sub x7, x7, x6, lsr #1 //// Source increment + + sub x8, x8, x6 //// Destination increment + + lsr x6, x6, #1 + lsr x5, x5, #1 +yuv420sp_uv_row_loop_uv: + mov x16, x6 + + +yuv420sp_uv_col_loop_uv: + prfm pldl1keep, [x1, #128] + prfm pldl1keep, [x2, #128] + + ld1 {v0.8b}, [x1], #8 + ld1 {v1.8b}, [x2], #8 + st2 {v0.8b, v1.8b}, [x4], #16 + + sub x16, x16, #8 + cmp x16, #7 + bgt yuv420sp_uv_col_loop_uv + + cmp x16, #0 + beq yuv420sp_uv_row_loop__uv + ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + ////Ex if width is 162, above loop will process 160 pixels. 
And + ////Both source and destination will point to 146th pixel and then 16 bytes will be read + //// and written using VLD1 and VST1 + sub x20, x16, #8 + neg x16, x20 + sub x1, x1, x16 + sub x2, x2, x16 + sub x4, x4, x16, lsl #1 + + ld1 {v0.8b}, [x1], #8 + ld1 {v1.8b}, [x2], #8 + st2 {v0.8b, v1.8b}, [x4], #16 + +yuv420sp_uv_row_loop__uv: + add x1, x1, x7 + add x2, x2, x7 + add x4, x4, x8 + subs x5, x5, #1 + bgt yuv420sp_uv_row_loop_uv + ////POP THE REGISTERS + // LDMFD sp!,{x4-x12,PC} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + + +///***************************************************************************** +//* * +//* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8() * +//* * +//* Description : This function conversts the image from YUV420P color * +//* space to 420SP color space(VU interleaved). * +//* This function is similar to above function * +//* IMP4D_CXA8_YUV420toYUV420SP_VU with a difference in * +//* VLD1.8 for chroma - order of registers is different * +//* * +//* Arguments : x0 pu1_y * +//* x1 pu1_u * +//* x2 pu1_v * +//* x3 pu1_dest_y * +//* x4 pu1_dest_uv * +//* x5 u2_height * +//* x6 u2_width * +//* x7 u2_stridey * +//* sp, #80 u2_strideu * +//* sp, #88 u2_stridev * +//* sp, #96 u2_dest_stride_y * +//* sp, #104 u2_dest_stride_uv * +//* sp, #112 convert_uv_only * +//* * +//* Values Returned : None * +//* * +//* Register Usage : x8, x14, x16, x20, v0, v1 * +//* * +//* Stack Usage : 80 Bytes * +//* * +//* Interruptibility : Interruptible * +//* * +//* Known Limitations * +//* Assumptions: Image Width: Assumed to be multiple of 16 and * +//* greater than or equal to 16 * +//* Image Height: Assumed to be even. 
* +//* * +//* Revision History : * +//* DD MM YYYY Author(s) Changes (Describe the changes made) * +//* 07 06 2010 Varshita Draft * +//* 07 06 2010 Naveen Kr T Completed * +//* * +//*****************************************************************************/ + +.global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8 +impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8: + + //// push the registers on the stack + // pu1_y, - x0 + // pu1_u, - x1 + // pu1_v, - x2 + // pu1_dest_y, - x3 + // pu1_dest_uv, - x4 + // u2_height, - x5 + // u2_width, - x6 + // u2_stridey, - x7 + // u2_strideu, - sp, #80 + // u2_stridev, - sp, #88 + // u2_dest_stride_y, - sp, #96 + // u2_dest_stride_uv, - sp, #104 + // convert_uv_only - sp, #112 + // STMFD sp!,{x4-x12,x14} + push_v_regs + stp x19, x20, [sp, #-16]! + + ldr w14, [sp, #112] //// Load convert_uv_only + + cmp w14, #1 + beq yuv420sp_vu_chroma + + ///* Do the preprocessing before the main loops start */ + //// Load the parameters from stack + + ldr w8, [sp, #96] //// Load u2_dest_stride_y from stack + uxtw x8, w8 + + sub x7, x7, x6 //// Source increment + + sub x8, x8, x6 //// Destination increment + + +yuv420sp_vu_row_loop_y: + mov x16, x6 + +yuv420sp_vu_col_loop_y: + prfm pldl1keep, [x0, #128] + ld1 {v0.8b, v1.8b}, [x0], #16 + st1 {v0.8b, v1.8b}, [x3], #16 + sub x16, x16, #16 + cmp x16, #15 + bgt yuv420sp_vu_col_loop_y + + cmp x16, #0 + beq yuv420sp_vu_row_loop__y + ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + ////Ex if width is 162, above loop will process 160 pixels. 
And + ////Both source and destination will point to 146th pixel and then 16 bytes will be read + //// and written using VLD1 and VST1 + sub x20, x16, #16 + neg x16, x20 + sub x0, x0, x16 + sub x3, x3, x16 + + ld1 {v0.8b, v1.8b}, [x0], #16 + st1 {v0.8b, v1.8b}, [x3], #16 + +yuv420sp_vu_row_loop__y: + add x0, x0, x7 + add x3, x3, x8 + subs x5, x5, #1 + bgt yuv420sp_vu_row_loop_y + +yuv420sp_vu_chroma: + ldr w7, [sp, #80] //// Load u2_strideu from stack + sxtw x7, w7 + + ldr w8, [sp, #104] //// Load u2_dest_stride_uv from stack + sxtw x8, w8 + + sub x7, x7, x6, lsr #1 //// Source increment + + sub x8, x8, x6 //// Destination increment + + lsr x6, x6, #1 + lsr x5, x5, #1 +yuv420sp_vu_row_loop_uv: + mov x16, x6 + + +yuv420sp_vu_col_loop_uv: + prfm pldl1keep, [x1, #128] + prfm pldl1keep, [x2, #128] + ld1 {v1.8b}, [x1], #8 + ld1 {v0.8b}, [x2], #8 + st2 {v0.8b, v1.8b}, [x4], #16 + sub x16, x16, #8 + cmp x16, #7 + bgt yuv420sp_vu_col_loop_uv + + cmp x16, #0 + beq yuv420sp_vu_row_loop__uv + ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + ////Ex if width is 162, above loop will process 160 pixels. 
And + ////Both source and destination will point to 146th pixel and then 16 bytes will be read + //// and written using VLD1 and VST1 + sub x20, x16, #8 + neg x16, x20 + sub x1, x1, x16 + sub x2, x2, x16 + sub x4, x4, x16, lsl #1 + + ld1 {v1.8b}, [x1], #8 + ld1 {v0.8b}, [x2], #8 + st2 {v0.8b, v1.8b}, [x4], #16 + +yuv420sp_vu_row_loop__uv: + add x1, x1, x7 + add x2, x2, x7 + add x4, x4, x8 + subs x5, x5, #1 + bgt yuv420sp_vu_row_loop_uv + ////POP THE REGISTERS + // LDMFD sp!,{x4-x12,PC} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + diff --git a/common/armv8/impeg2_idct.s b/common/armv8/impeg2_idct.s new file mode 100644 index 0000000..4956e54 --- /dev/null +++ b/common/armv8/impeg2_idct.s @@ -0,0 +1,1247 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +//*/ +///** +// ******************************************************************************* +// * @file +// * impeg2_idct.s +// * +// * @brief +// * contains function definitions for single stage inverse transform +// * +// * @author +// * anand s +// * +// * @par list of functions: +// * - impeg2_idct_recon_dc_av8() +// * +// * @remarks +// * none +// * +// ******************************************************************************* +//*/ + +///** +// ******************************************************************************* +// * +// * @brief +// * this function performs inverse transform and reconstruction for 8x8 +// * input block +// * +// * @par description: +// * performs inverse transform and adds the prediction data and clips output +// * to 8 bit +// * +// * @param[in] pi2_src +// * input 8x8 coefficients +// * +// * @param[in] pi2_tmp +// * temporary 8x8 buffer for storing inverse +// * +// * transform +// * 1st stage output +// * +// * @param[in] pu1_pred +// * prediction 8x8 block +// * +// * @param[out] pu1_dst +// * output 8x8 block +// * +// * @param[in] src_strd +// * input stride +// * +// * @param[in] pred_strd +// * prediction stride +// * +// * @param[in] dst_strd +// * output stride +// * +// * @param[in] shift +// * output shift +// * +// * @param[in] zero_cols +// * zero columns in pi2_src +// * +// * @returns void +// * +// * @remarks +// * none +// * +// ******************************************************************************* +// */ + +//void impeg2_itrans_recon_8x8(word16 *pi2_src, +// word16 *pi2_tmp, +// uword8 *pu1_pred, +// uword8 *pu1_dst, +// word32 src_strd, +// word32 pred_strd, +// word32 dst_strd, +// word32 zero_cols +// word32 zero_rows ) + +//**************variables vs registers************************* +// x0 => *pi2_src +// x1 => *pi2_tmp +// x2 => *pu1_pred +// x3 => *pu1_dst +// src_strd +// pred_strd +// dst_strd +// zero_cols + + + +.text +.align 4 +.include "impeg2_neon_macros.s" + 
+.set idct_stg1_shift , 12 +.set idct_stg2_shift , 16 +.set idct_stg1_round , (1 << (idct_stg1_shift - 1)) +.set idct_stg2_round , (1 << (idct_stg2_shift - 1)) + +.extern gai2_impeg2_idct_q15 +.extern gai2_impeg2_idct_q11 +.extern gai2_impeg2_idct_first_col_q15 +.extern gai2_impeg2_idct_first_col_q11 +.extern gai2_impeg2_mismatch_stg2_additive + +.global impeg2_idct_recon_dc_av8 +impeg2_idct_recon_dc_av8: + // STMFD sp!,{x4,x6,x12,x14} + push_v_regs + ////x0: pi2_src + ////x1: pi2_tmp - not used, used as pred_strd + ////x2: pu1_pred + ////x3: pu1_dst + ////x4: used as scratch + ////x5: pred_strd + ////x6: dst_strd + + ldrsh x4, [x0] + adrp x14, :got:gai2_impeg2_idct_q15 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q15] + ldrsh x12, [x14] + + ld1 {v0.8b}, [x2], x5 + mul x4, x4, x12 + + ld1 {v1.8b}, [x2], x5 + add x4, x4, #idct_stg1_round + + ld1 {v2.8b}, [x2], x5 + asr x4, x4, #idct_stg1_shift + + adrp x14, :got:gai2_impeg2_idct_q11 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q11] + ldrsh x12, [x14] + + ld1 {v3.8b}, [x2], x5 + mul x4, x4, x12 + + ld1 {v4.8b}, [x2], x5 + add x4, x4, #idct_stg2_round + + ld1 {v5.8b}, [x2], x5 + asr x4, x4, #idct_stg2_shift + + ld1 {v6.8b}, [x2], x5 + dup v30.8h, w4 + + + ld1 {v7.8b}, [x2], x5 + + uaddw v8.8h, v30.8h , v0.8b + + uaddw v10.8h, v30.8h , v1.8b + sqxtun v0.8b, v8.8h + + uaddw v12.8h, v30.8h , v2.8b + sqxtun v1.8b, v10.8h + st1 {v0.8b}, [x3], x6 + + uaddw v14.8h, v30.8h , v3.8b + sqxtun v2.8b, v12.8h + st1 {v1.8b}, [x3], x6 + + uaddw v16.8h, v30.8h , v4.8b + sqxtun v3.8b, v14.8h + st1 {v2.8b}, [x3], x6 + + uaddw v18.8h, v30.8h , v5.8b + sqxtun v4.8b, v16.8h + st1 {v3.8b}, [x3], x6 + + uaddw v20.8h, v30.8h , v6.8b + sqxtun v5.8b, v18.8h + st1 {v4.8b}, [x3], x6 + + uaddw v22.8h, v30.8h , v7.8b + sqxtun v6.8b, v20.8h + st1 {v5.8b}, [x3], x6 + + sqxtun v7.8b, v22.8h + st1 {v6.8b}, [x3], x6 + + + st1 {v7.8b}, [x3], x6 + + // LDMFD sp!,{x4,x6,x12,pc} + pop_v_regs + ret + + + +.global impeg2_idct_recon_dc_mismatch_av8 +.extern 
gai2_impeg2_idct_last_row_q11 +.extern gai2_impeg2_mismatch_stg1_outp +impeg2_idct_recon_dc_mismatch_av8: + // STMFD sp!,{x4-x12,x14} + push_v_regs + + ldrsh x4, [x0] + adrp x14, :got:gai2_impeg2_idct_q15 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q15] + ldrsh x12, [x14] + + mul x4, x4, x12 + add x4, x4, #idct_stg1_round + asr x4, x4, #idct_stg1_shift + + adrp x14, :got:gai2_impeg2_idct_q11 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q11] + ldrsh x12, [x14] + mul x4, x4, x12 + dup v0.4s, w4 + + mov x14, #16 ////Increment for table read + adrp x4, :got:gai2_impeg2_mismatch_stg2_additive + ldr x4, [x4, #:got_lo12:gai2_impeg2_mismatch_stg2_additive] + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, 
v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + + // LDMFD sp!,{x4-x12,pc} + pop_v_regs + ret + +.globl impeg2_idct_recon_av8 + +.type impeg2_idct_recon_av8, %function + +impeg2_idct_recon_av8: +////register usage.extern - loading and until idct of columns +//// cosine constants - d0 +//// sine constants - d1 +//// row 0 first half - d2 - y0 +//// row 1 first half - d6 - y1 +//// row 2 first half - d3 - y2 +//// row 3 first half - d7 - y3 +//// row 4 first half - d10 - y4 +//// row 5 first half - d14 - y5 +//// row 6 first half - d11 - y6 +//// row 7 first half - d15 - y7 + +//// row 0 second half - d4 - y0 +//// row 1 second half - d8 - y1 +//// row 2 second half - d5 - y2 +//// row 3 second half - d9 - y3 +//// row 4 second half - d12 - y4 +//// row 5 second half - d16 - y5 +//// row 6 second half - d13 - y6 +//// row 7 second half - d17 - y7 + + //// copy the input pointer to another register + //// step 1 : load all constants + // stmfd sp!,{x4-x12,x14} + + ldr w11, [sp] // zero rows + + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + mov x12, x7 // zero columns + mov x8, x5 // prediction stride + mov x7, x6 // destination stride + mov x6, x4 // src stride + lsl x6, x6, #1 // x sizeof(word16) + add x9, x0, x6, lsl #1 // 2 rows + + add x10, x6, x6, lsl #1 // 3 rows + + sub x10, x10, #8 // - 4 cols * sizeof(word16) + sub x5, x6, #8 // src_strd - 4 cols * sizeof(word16) + + adrp x14, :got:gai2_impeg2_idct_first_col_q15 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15] + ld1 {v0.4h, v1.4h}, [x14] ////d0,d1 are used for storing the constant data + + ////step 2 load all the input data + ////step 3 operate first 4 colums at a time + + and x11, x11, #0xff + and x12, x12, #0xff + + cmp x11, #0xf0 + bge skip_last4_rows + + + ld1 {v2.4h}, [x0], #8 + ld1 {v3.4h}, [x9], #8 + ld1 {v4.4h}, [x0], x5 + smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + ld1 {v5.4h}, [x9], x5 + smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + ld1 {v6.4h}, [x0], #8 + ld1 {v7.4h}, [x9], #8 + smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) + ld1 {v8.4h}, [x0], x10 + smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1) + ld1 {v9.4h}, [x9], x10 + smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) + ld1 {v10.4h}, [x0], #8 + smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + ld1 {v11.4h}, [x9], #8 + smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + ld1 {v12.4h}, [x0], x5 + smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + ld1 {v13.4h}, [x9], x5 + smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + ld1 {v14.4h}, [x0], #8 + smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + ld1 {v15.4h}, [x9], #8 + smull v22.4s, v10.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) + ld1 {v16.4h}, [x0], x10 + smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0) + ld1 {v17.4h}, [x9], x10 + + ///* this following was activated when alignment is not there */ +//// vld1.16 d2,[x0]! 
+//// vld1.16 d3,[x2]! +//// vld1.16 d4,[x0]! +//// vld1.16 d5,[x2]! +//// vld1.16 d6,[x0]! +//// vld1.16 d7,[x2]! +//// vld1.16 d8,[x0],x3 +//// vld1.16 d9,[x2],x3 +//// vld1.16 d10,[x0]! +//// vld1.16 d11,[x2]! +//// vld1.16 d12,[x0]! +//// vld1.16 d13,[x2]! +//// vld1.16 d14,[x0]! +//// vld1.16 d15,[x2]! +//// vld1.16 d16,[x0],x3 +//// vld1.16 d17,[x2],x3 + + + + + smlal v24.4s, v14.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + smlsl v26.4s, v14.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + smlal v28.4s, v14.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + smlal v30.4s, v14.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + smlsl v18.4s, v11.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + smlal v6.4s, v11.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + add v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + smlal v24.4s, v15.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) + smlsl v26.4s, v15.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) + smlal v28.4s, v15.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) + smlsl v30.4s, v15.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) + + add v14.4s, v10.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) + sub v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0) + sub v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7) + + add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2) + sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5) + + add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1) + sub 
v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6) + + add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3) + sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4) + + sqrshrn v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + + + b last4_cols + + + +skip_last4_rows: + adrp x14, :got:gai2_impeg2_idct_first_col_q15 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15] + ld1 {v0.4h, v1.4h}, [x14] + + ld1 {v2.4h}, [x0], #8 + ld1 {v3.4h}, [x9], #8 + ld1 {v4.4h}, [x0], x5 + ld1 {v5.4h}, [x9], x5 + ld1 {v6.4h}, [x0], #8 + ld1 {v7.4h}, [x9], #8 + ld1 {v8.4h}, [x0], x10 + ld1 {v9.4h}, [x9], x10 + + + + movi v12.4h, #0 + movi v13.4h, #0 + movi v16.4h, #0 + movi v17.4h, #0 + + + + + smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + + smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + + smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + smull v6.4s, v3.4h, v0.4h[2] //// y2 * 
cos2(part of d0) + + smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + + + add v14.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) + sub v10.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0) + sub v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7) + + add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2) + sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5) + + add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1) + sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6) + + add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3) + sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4) + + sqrshrn v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + + +last4_cols: + adrp x14, :got:gai2_impeg2_idct_first_col_q15 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15] + ld1 {v0.4h, v1.4h}, [x14] + + + cmp x12, #0xf0 + bge skip_last4cols + + smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) + + smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * 
cos3(part of b0) + smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + + smull v18.4s, v5.4h, v1.4h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1) + smull v8.4s, v5.4h, v0.4h[2] //// y2 * cos2(part of d0) + + smull v20.4s, v4.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) + + smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + smlal v8.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7) + smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6) + smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5) + smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4) + + add v16.4s, v12.4s , v8.4s //// a0 = c0 + d0(part of e0,e7) + sub v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4) + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of e2,e5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of e1,e6) + + add v20.4s, v16.4s , v24.4s //// a0 + b0(part of e0) + sub v8.4s, v16.4s , v24.4s //// a0 - b0(part of e7) + + add v24.4s, v22.4s , v28.4s //// 
a2 + b2(part of e2) + sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of e5) + + add v28.4s, v18.4s , v26.4s //// a1 + b1(part of e1) + sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of e6) + + add v26.4s, v12.4s , v30.4s //// a3 + b3(part of e3) + sub v30.4s, v12.4s , v30.4s //// a3 - b3(part of x4) + + sqrshrn v4.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v17.4h, v8.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v5.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v16.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v8.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v13.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v9.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v12.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + b end_skip_last4cols + + + +skip_last4cols: + adrp x14, :got:gai2_impeg2_idct_first_col_q11 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11] + ld1 {v0.4h, v1.4h}, [x14] + + umov x15, v25.d[0] + + trn1 v25.4h, v2.4h, v6.4h + trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first qudrant transposing + + trn1 v27.4h, v3.4h, v7.4h + trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first qudrant transposing + + trn1 v6.2s, v29.2s, v31.2s + trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first qudrant transposing continued..... + trn1 v2.2s, v25.2s, v27.2s + trn2 v3.2s, v25.2s, v27.2s ////x0,x1,x2,x3 first qudrant transposing continued..... 
+ + + trn1 v25.4h, v10.4h, v14.4h + trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third qudrant transposing + + trn1 v27.4h, v11.4h, v15.4h + trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third qudrant transposing + + trn1 v10.2s, v25.2s, v27.2s + trn2 v11.2s, v25.2s, v27.2s ////x4,x5,x6,x7 third qudrant transposing continued..... + trn1 v14.2s, v29.2s, v31.2s + trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third qudrant transposing continued..... + + mov v25.d[0], x15 + + smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + + smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + + smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) +// vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1) + + smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0) + + + + + sub v22.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) + add v4.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) + + + add v2.4s, v4.4s , v24.4s + + sub v6.4s, v4.4s , v24.4s + + add v8.4s, v22.4s , v30.4s + + sub v24.4s, v22.4s , v30.4s + + sqrshrn v5.4h, v8.4s, #idct_stg2_shift + sqrshrn v2.4h, v2.4s, #idct_stg2_shift + sqrshrn v9.4h, v6.4s, #idct_stg2_shift + sqrshrn v6.4h, v24.4s, #idct_stg2_shift + + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + + add v30.4s, v22.4s , v28.4s + + sub v24.4s, v22.4s , v28.4s + + add v28.4s, v18.4s , v26.4s + + sub v22.4s, v18.4s , v26.4s + sqrshrn v4.4h, v30.4s, #idct_stg2_shift + 
sqrshrn v7.4h, v24.4s, #idct_stg2_shift + sqrshrn v3.4h, v28.4s, #idct_stg2_shift + sqrshrn v8.4h, v22.4s, #idct_stg2_shift + + + + umov x19, v25.d[0] + umov x20, v25.d[1] + + trn1 v27.4h, v2.4h, v3.4h + trn2 v29.4h, v2.4h, v3.4h + trn1 v25.4h, v4.4h, v5.4h + trn2 v31.4h, v4.4h, v5.4h + + trn1 v2.2s, v27.2s, v25.2s + trn2 v4.2s, v27.2s, v25.2s + trn1 v3.2s, v29.2s, v31.2s + trn2 v5.2s, v29.2s, v31.2s + + trn1 v27.4h, v6.4h, v7.4h + trn2 v29.4h, v6.4h, v7.4h + trn1 v25.4h, v8.4h, v9.4h + trn2 v31.4h, v8.4h, v9.4h + + trn1 v6.2s, v27.2s, v25.2s + trn2 v8.2s, v27.2s, v25.2s + trn1 v7.2s, v29.2s, v31.2s + trn2 v9.2s, v29.2s, v31.2s + + mov v25.d[0], x19 + mov v25.d[1], x20 + + smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0) + + smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3) + + smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1) + smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0) + + + add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data + + + add x5, x8, x8, lsl #1 // + + + add x0, x3, x7, lsl #1 // x0 points to 3rd row of dest data + + + add x10, x7, x7, lsl #1 // + + // swapping v3 and v6 + mov v31.d[0], v3.d[0] + mov v3.d[0], v6.d[0] + mov v6.d[0], v31.d[0] + + // swapping v5 and v8 + mov v31.d[0], v5.d[0] + mov v5.d[0], v8.d[0] + mov v8.d[0], v31.d[0] + + + sub v22.4s, v20.4s , v14.4s //// a3 = c0 - d0(part of x3,x4) + add v12.4s, v20.4s , v14.4s //// a0 = c0 + d0(part of x0,x7) + + + add v0.4s, v12.4s , 
v24.4s + + + sub v24.4s, v12.4s , v24.4s + + + add v12.4s, v22.4s , v30.4s + + + sub v14.4s, v22.4s , v30.4s + + sqrshrn v10.4h, v0.4s, #idct_stg2_shift + sqrshrn v17.4h, v24.4s, #idct_stg2_shift + sqrshrn v13.4h, v12.4s, #idct_stg2_shift + sqrshrn v14.4h, v14.4s, #idct_stg2_shift + + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + + add v0.4s, v22.4s , v28.4s + + + sub v24.4s, v22.4s , v28.4s + + + add v28.4s, v18.4s , v26.4s + + + sub v26.4s, v18.4s , v26.4s + ld1 {v18.8b}, [x2], x8 + + sqrshrn v12.4h, v0.4s, #idct_stg2_shift + ld1 {v20.8b}, [x2], x5 + + + sqrshrn v15.4h, v24.4s, #idct_stg2_shift + ld1 {v19.8b}, [x2], x8 + + + + + sqrshrn v11.4h, v28.4s, #idct_stg2_shift + ld1 {v22.8b}, [x4], x8 + + + + + sqrshrn v16.4h, v26.4s, #idct_stg2_shift + ld1 {v21.8b}, [x2], x5 + + + b pred_buff_addition +end_skip_last4cols: + adrp x14, :got:gai2_impeg2_idct_first_col_q11 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11] + ld1 {v0.4h, v1.4h}, [x14] + + + umov x19, v25.d[0] + umov x20, v25.d[1] + +///* now the idct of columns is done, transpose so that row idct done efficiently(step5) */ + trn1 v27.4h, v2.4h, v6.4h + trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first qudrant transposing + trn1 v25.4h, v3.4h, v7.4h + trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first qudrant transposing + + trn1 v2.2s, v27.2s, v25.2s + trn2 v3.2s, v27.2s, v25.2s ////x0,x1,x2,x3 first qudrant transposing continued..... + trn1 v6.2s, v29.2s, v31.2s + trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first qudrant transposing continued..... + + trn1 v27.4h, v4.4h, v8.4h + trn2 v29.4h, v4.4h, v8.4h ////[x3,x1],[x2,x0] second qudrant transposing + trn1 v25.4h, v5.4h, v9.4h + trn2 v31.4h, v5.4h, v9.4h ////[x3,x1],[x2,x0] second qudrant transposing + + trn1 v4.2s, v27.2s, v25.2s + trn2 v5.2s, v27.2s, v25.2s ////x0,x1,x2,x3 second qudrant transposing continued..... 
+ trn1 v8.2s, v29.2s, v31.2s + trn2 v9.2s, v29.2s, v31.2s ////x0,x1,x2,x3 second qudrant transposing continued..... + + trn1 v27.4h, v10.4h, v14.4h + trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third qudrant transposing + trn1 v25.4h, v11.4h, v15.4h + trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third qudrant transposing + + trn1 v10.2s, v27.2s, v25.2s + trn2 v11.2s, v27.2s, v25.2s ////x4,x5,x6,x7 third qudrant transposing continued..... + trn1 v14.2s, v29.2s, v31.2s + trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third qudrant transposing continued..... + + trn1 v27.4h, v12.4h, v16.4h + trn2 v29.4h, v12.4h, v16.4h ////[x7,x5],[x6,x4] fourth qudrant transposing + trn1 v25.4h, v13.4h, v17.4h + trn2 v31.4h, v13.4h, v17.4h ////[x7,x5],[x6,x4] fourth qudrant transposing + + trn1 v12.2s, v27.2s, v25.2s + trn2 v13.2s, v27.2s, v25.2s ////x4,x5,x6,x7 fourth qudrant transposing continued..... + trn1 v16.2s, v29.2s, v31.2s + trn2 v17.2s, v29.2s, v31.2s ////x4,x5,x6,x7 fourth qudrant transposing continued..... 
+ + mov v25.d[0], x19 + mov v25.d[1], x20 + + ////step6 operate on first four rows and find their idct + ////register usage.extern - storing and idct of rows +//// cosine constants - d0 +//// sine constants - d1 +//// element 0 first four - d2 - y0 +//// element 1 first four - d6 - y1 +//// element 2 first four - d3 - y2 +//// element 3 first four - d7 - y3 +//// element 4 first four - d4 - y4 +//// element 5 first four - d8 - y5 +//// element 6 first four - d5 - y6 +//// element 7 first four - d9 - y7 +//// element 0 second four - d10 - y0 +//// element 1 second four - d14 - y1 +//// element 2 second four - d11 - y2 +//// element 3 second four - d15 - y3 +//// element 4 second four - d12 - y4 +//// element 5 second four - d16 - y5 +//// element 6 second four - d13 - y6 +//// element 7 second four - d17 - y7 + + //// map between first kernel code seq and current +//// d2 -> d2 +//// d6 -> d6 +//// d3 -> d3 +//// d7 -> d7 +//// d10 -> d4 +//// d14 -> d8 +//// d11 -> d5 +//// d15 -> d9 +//// q3 -> q3 +//// q5 -> q2 +//// q7 -> q4 + + smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + + smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + + smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + smull v22.4s, v4.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) + + smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0) + + + smlal v24.4s, v8.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 
* cos1(part of b1) + smlal v28.4s, v8.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + smlsl v18.4s, v5.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + smlal v6.4s, v5.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + add v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + smlal v24.4s, v9.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) + smlsl v26.4s, v9.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) + smlal v28.4s, v9.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) + smlsl v30.4s, v9.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) + + sub v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) + add v4.4s, v2.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) + + + add v2.4s, v4.4s , v24.4s + + sub v6.4s, v4.4s , v24.4s + + add v8.4s, v22.4s , v30.4s + + sub v24.4s, v22.4s , v30.4s + + sqrshrn v5.4h, v8.4s, #idct_stg2_shift + sqrshrn v2.4h, v2.4s, #idct_stg2_shift + sqrshrn v9.4h, v6.4s, #idct_stg2_shift + sqrshrn v6.4h, v24.4s, #idct_stg2_shift + + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + + add v30.4s, v22.4s , v28.4s + + sub v24.4s, v22.4s , v28.4s + + add v28.4s, v18.4s , v26.4s + + sub v22.4s, v18.4s , v26.4s + sqrshrn v4.4h, v30.4s, #idct_stg2_shift + sqrshrn v7.4h, v24.4s, #idct_stg2_shift + sqrshrn v3.4h, v28.4s, #idct_stg2_shift + sqrshrn v8.4h, v22.4s, #idct_stg2_shift + + + + umov x19, v25.d[0] + umov x20, v25.d[1] + + trn1 v27.4h, v2.4h, v3.4h + trn2 v29.4h, v2.4h, v3.4h + trn1 v25.4h, v4.4h, v5.4h + trn2 v31.4h, v4.4h, v5.4h + + trn1 v2.2s, v27.2s, v25.2s + trn2 v4.2s, v27.2s, v25.2s + trn1 
v3.2s, v29.2s, v31.2s + trn2 v5.2s, v29.2s, v31.2s + + trn1 v27.4h, v6.4h, v7.4h + trn2 v29.4h, v6.4h, v7.4h + trn1 v25.4h, v8.4h, v9.4h + trn2 v31.4h, v8.4h, v9.4h + + trn1 v6.2s, v27.2s, v25.2s + trn2 v8.2s, v27.2s, v25.2s + trn1 v7.2s, v29.2s, v31.2s + trn2 v9.2s, v29.2s, v31.2s + + mov v25.d[0], x19 + mov v25.d[1], x20 + + + + smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3) + smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) + smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1) + smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0) + smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + + add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data + smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + + add x5, x8, x8, lsl #1 // + smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + + add x0, x3, x7, lsl #1 // x0 points to 3rd row of dest data + smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + add x10, x7, x7, lsl #1 // + smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + + + smlal v14.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - 
y4 * cos4(part of a0 and a1) + + smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) + + // swapping v3 and v6 + mov v31.d[0], v3.d[0] + mov v3.d[0], v6.d[0] + mov v6.d[0], v31.d[0] + + smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) + // swapping v5 and v8 + mov v31.d[0], v5.d[0] + mov v5.d[0], v8.d[0] + mov v8.d[0], v31.d[0] + + smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) + smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) + + sub v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4) + add v12.4s, v12.4s , v14.4s //// a0 = c0 + d0(part of x0,x7) + + + add v0.4s, v12.4s , v24.4s + + + sub v24.4s, v12.4s , v24.4s + + + add v12.4s, v22.4s , v30.4s + + + sub v14.4s, v22.4s , v30.4s + + sqrshrn v10.4h, v0.4s, #idct_stg2_shift + sqrshrn v17.4h, v24.4s, #idct_stg2_shift + sqrshrn v13.4h, v12.4s, #idct_stg2_shift + sqrshrn v14.4h, v14.4s, #idct_stg2_shift + + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + + add v0.4s, v22.4s , v28.4s + + + sub v24.4s, v22.4s , v28.4s + + + add v28.4s, v18.4s , v26.4s + + + sub v26.4s, v18.4s , v26.4s + ld1 {v18.8b}, [x2], x8 + + sqrshrn v12.4h, v0.4s, #idct_stg2_shift + ld1 {v20.8b}, [x2], x5 + + + sqrshrn v15.4h, v24.4s, #idct_stg2_shift + ld1 {v19.8b}, [x2], x8 + + + + + sqrshrn v11.4h, v28.4s, #idct_stg2_shift + ld1 {v22.8b}, [x4], x8 + + + + + sqrshrn v16.4h, v26.4s, #idct_stg2_shift + ld1 {v21.8b}, [x2], x5 + + + + +pred_buff_addition: + + umov x19, v25.d[0] + umov x20, v25.d[1] + + trn1 v27.4h, v10.4h, v11.4h + trn2 v29.4h, v10.4h, v11.4h + trn1 v25.4h, v12.4h, v13.4h + trn2 v31.4h, v12.4h, v13.4h + + trn1 v10.2s, v27.2s, v25.2s + trn2 v12.2s, v27.2s, v25.2s + trn1 v11.2s, v29.2s, v31.2s + trn2 v13.2s, v29.2s, v31.2s + + trn1 v27.4h, 
v14.4h, v15.4h + trn2 v29.4h, v14.4h, v15.4h + trn1 v25.4h, v16.4h, v17.4h + trn2 v31.4h, v16.4h, v17.4h + + trn1 v14.2s, v27.2s, v25.2s + trn2 v16.2s, v27.2s, v25.2s + trn1 v15.2s, v29.2s, v31.2s + trn2 v17.2s, v29.2s, v31.2s + + + mov v25.d[0], x19 + mov v25.d[1], x20 + + + ld1 {v24.8b}, [x4], x5 + ld1 {v23.8b}, [x4], x8 + ld1 {v25.8b}, [x4], x5 + mov v2.d[1], v3.d[0] + mov v4.d[1], v5.d[0] + mov v6.d[1], v7.d[0] + mov v8.d[1], v9.d[0] + uaddw v2.8h, v2.8h , v18.8b + uaddw v4.8h, v4.8h , v22.8b + uaddw v6.8h, v6.8h , v20.8b + uaddw v8.8h, v8.8h , v24.8b + + // swapping v11 and v14 + mov v31.d[0], v11.d[0] + mov v11.d[0], v14.d[0] + mov v14.d[0], v31.d[0] + + // swapping v13 and v16 + mov v31.d[0], v13.d[0] + mov v13.d[0], v16.d[0] + mov v16.d[0], v31.d[0] +// row values stored in the q register. + +//q1 :x0 +//q3: x1 +//q2: x2 +//q4: x3 +//q5: x4 +//q7: x5 +//q6: x6 +//q8: x7 + + + +///// adding the prediction buffer + + + + + + + + + + // load prediction data + + + + + + //adding recon with prediction + + + + + mov v10.d[1], v11.d[0] + mov v12.d[1], v13.d[0] + mov v14.d[1], v15.d[0] + mov v16.d[1], v17.d[0] + uaddw v10.8h, v10.8h , v19.8b + sqxtun v2.8b, v2.8h + uaddw v14.8h, v14.8h , v21.8b + sqxtun v4.8b, v4.8h + uaddw v12.8h, v12.8h , v23.8b + sqxtun v6.8b, v6.8h + uaddw v16.8h, v16.8h , v25.8b + sqxtun v8.8b, v8.8h + + + + + + + + st1 {v2.8b}, [x3], x7 + sqxtun v10.8b, v10.8h + st1 {v6.8b}, [x3], x10 + sqxtun v14.8b, v14.8h + st1 {v4.8b}, [x0], x7 + sqxtun v12.8b, v12.8h + st1 {v8.8b}, [x0], x10 + sqxtun v16.8b, v16.8h + + + + + + + + st1 {v10.8b}, [x3], x7 + st1 {v14.8b}, [x3], x10 + st1 {v12.8b}, [x0], x7 + st1 {v16.8b}, [x0], x10 + + + + + // ldmfd sp!,{x4-x12,pc} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + diff --git a/common/armv8/impeg2_inter_pred.s b/common/armv8/impeg2_inter_pred.s new file mode 100644 index 0000000..98ade45 --- /dev/null +++ b/common/armv8/impeg2_inter_pred.s @@ -0,0 +1,814 @@ 
+//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ + +///* +////---------------------------------------------------------------------------- +//// File Name : impeg2_inter_pred.s +//// +//// Description : This file has motion compensation related +//// interpolation functions on Neon + CortexA-8 platform +//// +//// Reference Document : +//// +//// Revision History : +//// Date Author Detail Description +//// ------------ ---------------- ---------------------------------- +//// 18 jun 2010 S Hamsalekha Created +//// +////------------------------------------------------------------------------- +//*/ + +///* +//// ---------------------------------------------------------------------------- +//// Include Files +//// ---------------------------------------------------------------------------- +//*/ +// PRESERVE8 +.text +.include "impeg2_neon_macros.s" + +///* +//// ---------------------------------------------------------------------------- +//// Struct/Union Types and Define +//// ---------------------------------------------------------------------------- +//*/ + + +///* +//// 
//// ----------------------------------------------------------------------------
//// Static Global Data section variables
//// ----------------------------------------------------------------------------
//// -------------------------- NONE --------------------------------------------


//// ----------------------------------------------------------------------------
//// Static Prototype Functions
//// ----------------------------------------------------------------------------
//// -------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Exported functions
//// ----------------------------------------------------------------------------
//*/


///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_copy_mb_av8()
////
//// Detail Description : Copies one macroblock (4:2:0) from src to dst:
////                      sixteen 16-byte luma rows, then eight 8-byte rows
////                      for each chroma plane at half the luma stride.
////
//// Inputs             : x0 - pointer to src buffer descriptor
////                           (y/u/v plane pointers at offsets 0/8/16)
////                      x1 - pointer to dst buffer descriptor (same layout)
////                      x2 - source luma stride (bytes)
////                      x3 - destination luma stride (bytes)
//// Registers Used     : x4, x5, x6, v0, v1
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : dst macroblock holds a copy of the src macroblock
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/



.global impeg2_copy_mb_av8


impeg2_copy_mb_av8:

    push_v_regs

    ldr     x4, [x0]                        //x4 = src->y
    ldr     x5, [x1]                        //x5 = dst->y

    //Copy the 16 luma rows, 16 bytes per row
    mov     x6, #16                         //x6 = remaining luma rows
copy_mb_y_loop:
    ld1     {v0.8b, v1.8b}, [x4], x2        //load one 16-byte row, advance src
    st1     {v0.8b, v1.8b}, [x5], x3        //store the row, advance dst
    subs    x6, x6, #1
    b.ne    copy_mb_y_loop

    lsr     x2, x2, #1                      //chroma stride = luma stride / 2
    lsr     x3, x3, #1

    ldr     x4, [x0, #8]                    //x4 = src->u
    ldr     x5, [x1, #8]                    //x5 = dst->u

    //Copy the 8 rows of the U plane, 8 bytes per row
    mov     x6, #8                          //x6 = remaining U rows
copy_mb_u_loop:
    ld1     {v0.8b}, [x4], x2               //load one 8-byte row, advance src
    st1     {v0.8b}, [x5], x3               //store the row, advance dst
    subs    x6, x6, #1
    b.ne    copy_mb_u_loop

    ldr     x4, [x0, #16]                   //x4 = src->v
    ldr     x5, [x1, #16]                   //x5 = dst->v

    //Copy the 8 rows of the V plane, 8 bytes per row
    mov     x6, #8                          //x6 = remaining V rows
copy_mb_v_loop:
    ld1     {v0.8b}, [x4], x2               //load one 8-byte row, advance src
    st1     {v0.8b}, [x5], x3               //store the row, advance dst
    subs    x6, x6, #1
    b.ne    copy_mb_v_loop

    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_fullx_halfy_8x8_av8()
////
//// Detail Description : This function pastes the reference block in the
////                      current frame buffer. This function is called for
////                      blocks that are not coded and have motion vectors
////                      with a half pel resolution.
////
//// Inputs             : x0 - out     : current (destination) block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference block stride (bytes)
////                      x3 - out_wid : current block stride (bytes)
////
//// Registers Used     : x14, v0-v9
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The motion compensated 8x8 block
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/

.global impeg2_mc_fullx_halfy_8x8_av8

impeg2_mc_fullx_halfy_8x8_av8:

//STMFD x13!,{x12,x14}
    push_v_regs
    add x14, x1, x2                     //x14 = ref + stride: second load stream, interleaved with x1
    lsl x2, x2, #1                      //both streams now step two source rows at a time

///* Load 8 + 1 reference rows; out row n = rounding average of ref rows n and n+1 */
///* urhadd computes (a + b + 1) >> 1, i.e. the half-pel average with rounding   */
    ld1 {v0.8b}, [x1], x2               //// v0 = ref row 1
    ld1 {v2.8b}, [x14], x2              //// v2 = ref row 2
    ld1 {v4.8b}, [x1], x2               //// v4 = ref row 3
    ld1 {v6.8b}, [x14], x2              //// v6 = ref row 4
    ld1 {v1.8b}, [x1], x2               //// v1 = ref row 5
    ld1 {v3.8b}, [x14], x2              //// v3 = ref row 6
    urhadd v9.8b, v1.8b , v6.8b         //// v9 = out row 4 = avg(ref rows 4,5)
    ld1 {v5.8b}, [x1], x2               //// v5 = ref row 7
    //NOTE(review): .16b ops on regs loaded with .8b — only the low 64 bits are
    //meaningful; the upper halves hold garbage but are never stored (all stores
    //below are .8b), so the result is unaffected.
    urhadd v0.16b, v0.16b , v2.16b      //// v0 = out row 1 = avg(ref rows 1,2)
    urhadd v1.16b, v1.16b , v3.16b      //// v1 = out row 5 = avg(ref rows 5,6)
    ld1 {v7.8b}, [x14], x2              //// v7 = ref row 8
    urhadd v2.16b, v2.16b , v4.16b      //// v2 = out row 2 = avg(ref rows 2,3)
    urhadd v3.16b, v3.16b , v5.16b      //// v3 = out row 6 = avg(ref rows 6,7)
    ld1 {v8.8b}, [x1], x2               //// v8 = ref row 9
    urhadd v4.16b, v4.16b , v6.16b      //// v4 = out row 3 = avg(ref rows 3,4)
    urhadd v5.16b, v5.16b , v7.16b      //// v5 = out row 7 = avg(ref rows 7,8)

    add x14, x0, x3                     //x14 = out + stride: even-row store stream
    lsl x3, x3, #1                      //both store streams step two rows at a time

///* Store the eight interpolated rows */
    st1 {v2.8b}, [x14], x3              //// store out row 2
    urhadd v7.8b, v7.8b , v8.8b         //// v7 = out row 8 = avg(ref rows 8,9)
    st1 {v0.8b}, [x0], x3               //// store out row 1
    st1 {v9.8b}, [x14], x3              //// store out row 4
    st1 {v4.8b}, [x0], x3               //// store out row 3
    st1 {v3.8b}, [x14], x3              //// store out row 6
    st1 {v1.8b}, [x0], x3               //// store out row 5
    st1 {v7.8b}, [x14], x3              //// store out row 8
    st1 {v5.8b}, [x0], x3               //// store out row 7

// LDMFD sp!,{x12,pc}
    pop_v_regs
    ret





///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_halfx_fully_8x8_av8()
////
//// Detail Description : This function pastes the reference block in the
////                      current frame buffer. This function is called for
////                      blocks that are not coded and have motion vectors
////                      with a half pel resolution and VopRoundingType is 0.
////
//// Inputs             : x0 - out     : current (destination) block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference block stride (bytes)
////                      x3 - out_wid : current block stride (bytes)
////
//// Registers Used     : x12, x14, v0-v10, v12-v14, v16-v18, v20-v22
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The motion compensated 8x8 block
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/



.global impeg2_mc_halfx_fully_8x8_av8



impeg2_mc_halfx_fully_8x8_av8:

    // STMFD sp!,{x12,x14}
    push_v_regs

    add x14, x1, x2, lsl #2             //x14 = ref + 4*stride: loads rows 5..8

    add x12, x0, x3, lsl#2              //x12 = out + 4*stride: stores rows 5..8

///* Each output pixel is the rounding average of a reference pixel and its   */
///* right neighbour: 9 source bytes per row -> 16 loaded, ext shifts by one. */
    ld1 {v0.8b, v1.8b}, [x1], x2        //v0,v1 = ref row 1 (16 pixels)

    ld1 {v2.8b, v3.8b}, [x14], x2       //v2,v3 = ref row 5


    ld1 {v4.8b, v5.8b}, [x1], x2        //v4,v5 = ref row 2

    ld1 {v6.8b, v7.8b}, [x14], x2       //v6,v7 = ref row 6


    ext v8.8b, v0.8b , v1.8b , #1       //v8 = row 1 shifted right by one pixel

    ext v12.8b, v2.8b , v3.8b , #1      //v12 = row 5 shifted

    ext v16.8b, v4.8b , v5.8b , #1      //v16 = row 2 shifted

    ext v20.8b, v6.8b , v7.8b , #1      //v20 = row 6 shifted


    ld1 {v9.8b, v10.8b}, [x1], x2       //v9,v10 = ref row 3

    ld1 {v13.8b, v14.8b}, [x14], x2     //v13,v14 = ref row 7

    ld1 {v17.8b, v18.8b}, [x1], x2      //v17,v18 = ref row 4

    ld1 {v21.8b, v22.8b}, [x14], x2     //v21,v22 = ref row 8


    //NOTE(review): the shifted rows 3/7/4/8 overwrite v1/v3/v5/v7 (the high
    //halves of rows 1/5/2/6), which are no longer needed at this point.
    ext v1.8b, v9.8b , v10.8b , #1      //v1 = row 3 shifted

    ext v3.8b, v13.8b , v14.8b , #1     //v3 = row 7 shifted



    ext v5.8b, v17.8b , v18.8b , #1     //v5 = row 4 shifted

    ext v7.8b, v21.8b , v22.8b , #1     //v7 = row 8 shifted


    //.16b ops on .8b data: upper halves are garbage but only .8b is stored
    urhadd v0.16b, v0.16b , v8.16b      //v0 = out row 1 = avg(row 1, row 1 shifted)
    urhadd v1.16b, v1.16b , v9.16b      //v1 = out row 3 = avg(row 3 shifted, row 3)

    urhadd v2.16b, v2.16b , v12.16b     //v2 = out row 5 = avg(row 5, row 5 shifted)
    urhadd v3.16b, v3.16b , v13.16b     //v3 = out row 7 = avg(row 7 shifted, row 7)


    urhadd v4.16b, v4.16b , v16.16b     //v4 = out row 2 = avg(row 2, row 2 shifted)
    urhadd v5.16b, v5.16b , v17.16b     //v5 = out row 4 = avg(row 4 shifted, row 4)


    urhadd v6.16b, v6.16b , v20.16b     //v6 = out row 6 = avg(row 6, row 6 shifted)
    urhadd v7.16b, v7.16b , v21.16b     //v7 = out row 8 = avg(row 8 shifted, row 8)

    st1 {v0.8b}, [x0], x3               //store out row 1

    st1 {v2.8b}, [x12], x3              //store out row 5

    st1 {v4.8b}, [x0], x3               //store out row 2

    st1 {v6.8b}, [x12], x3              //store out row 6

    st1 {v1.8b}, [x0], x3               //store out row 3

    st1 {v3.8b}, [x12], x3              //store out row 7

    st1 {v5.8b}, [x0], x3               //store out row 4

    st1 {v7.8b}, [x12], x3              //store out row 8



    // LDMFD sp!,{x12,pc}
    pop_v_regs
    ret








///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_halfx_halfy_8x8_av8()
////
//// Detail Description : This function pastes the reference block in the
////                      current frame buffer. This function is called for
////                      blocks that are not coded and have motion vectors
////                      with a half pel resolution and VopRoundingType is 0.
////
//// Inputs             : x0 - out     : current (destination) block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference block stride (bytes)
////                      x3 - out_wid : current block stride (bytes)
////
//// Registers Used     : x14, v0-v18, v22, v24, v26, v28, v30
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The motion compensated 8x8 block
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/


.global impeg2_mc_halfx_halfy_8x8_av8

impeg2_mc_halfx_halfy_8x8_av8:

    // STMFD sp!,{x12,x14}
    push_v_regs

///* out(r,c) = (ref(r,c) + ref(r,c+1) + ref(r+1,c) + ref(r+1,c+1) + 2) >> 2 */
///* Needs 9 reference rows of 9 pixels each: x1 loads rows 1..4,            */
///* x14 (= ref + 4*stride) loads rows 5..9.                                 */
    add x14, x1, x2, lsl #2

    ld1 {v0.8b, v1.8b}, [x1], x2        //v0,v1 = ref row 1 (16 pixels)

    ld1 {v2.8b, v3.8b}, [x14], x2       //v2,v3 = ref row 5

    ld1 {v4.8b, v5.8b}, [x1], x2        //v4,v5 = ref row 2

    ld1 {v6.8b, v7.8b}, [x14], x2       //v6,v7 = ref row 6

    ext v1.8b, v0.8b , v1.8b , #1       //v1 = row 1 shifted right by one pixel



    ext v3.8b, v2.8b , v3.8b , #1       //v3 = row 5 shifted



    ext v5.8b, v4.8b , v5.8b , #1       //v5 = row 2 shifted

    ext v7.8b, v6.8b , v7.8b , #1       //v7 = row 6 shifted




    ld1 {v8.8b, v9.8b}, [x1], x2        //v8,v9 = ref row 3



    ld1 {v10.8b, v11.8b}, [x14], x2     //v10,v11 = ref row 7

    ld1 {v12.8b, v13.8b}, [x1], x2      //v12,v13 = ref row 4

    ld1 {v14.8b, v15.8b}, [x14], x2     //v14,v15 = ref row 8

    ext v9.8b, v8.8b , v9.8b , #1       //v9 = row 3 shifted

    ld1 {v16.8b, v17.8b}, [x14], x2     //v16,v17 = ref row 9





    ext v11.8b, v10.8b , v11.8b , #1    //v11 = row 7 shifted



    ext v13.8b, v12.8b , v13.8b , #1    //v13 = row 4 shifted



    ext v15.8b, v14.8b , v15.8b , #1    //v15 = row 8 shifted

    ext v17.8b, v16.8b , v17.8b , #1    //v17 = row 9 shifted


    //Horizontal interpolation: widen to 16 bits, sum pixel + right neighbour

    uaddl v0.8h, v0.8b, v1.8b           //row 1 horizontal sums

    uaddl v2.8h, v2.8b, v3.8b           //row 5 horizontal sums

    uaddl v4.8h, v4.8b, v5.8b           //row 2 horizontal sums

    uaddl v6.8h, v6.8b, v7.8b           //row 6 horizontal sums

    uaddl v8.8h, v8.8b, v9.8b           //row 3 horizontal sums

    uaddl v10.8h, v10.8b, v11.8b        //row 7 horizontal sums

    uaddl v12.8h, v12.8b, v13.8b        //row 4 horizontal sums

    uaddl v14.8h, v14.8b, v15.8b        //row 8 horizontal sums

    uaddl v16.8h, v16.8b, v17.8b        //row 9 horizontal sums

    //Vertical interpolation: sum adjacent rows, then (sum + 2) >> 2 via rshrn

    add x14, x0, x3, lsl #2             //x14 = out + 4*stride: stores rows 5..8



    add v18.8h, v0.8h , v4.8h           //out row 1 = rows 1+2

    add v26.8h, v2.8h , v6.8h           //out row 5 = rows 5+6

    add v20.8h, v4.8h , v8.8h           //out row 2 = rows 2+3

    add v28.8h, v6.8h , v10.8h          //out row 6 = rows 6+7

    rshrn v18.8b, v18.8h, #2            //narrow out row 1 with rounding

    rshrn v26.8b, v26.8h, #2            //narrow out row 5

    rshrn v20.8b, v20.8h, #2            //narrow out row 2

    rshrn v28.8b, v28.8h, #2            //narrow out row 6

    add v22.8h, v8.8h , v12.8h          //out row 3 = rows 3+4

    st1 {v18.8b}, [x0], x3              //store out row 1

    add v30.8h, v10.8h , v14.8h         //out row 7 = rows 7+8

    st1 {v26.8b}, [x14], x3             //store out row 5

    add v24.8h, v12.8h , v2.8h          //out row 4 = rows 4+5

    st1 {v20.8b}, [x0], x3              //store out row 2

    add v14.8h, v14.8h , v16.8h         //out row 8 = rows 8+9

    st1 {v28.8b}, [x14], x3             //store out row 6



    rshrn v22.8b, v22.8h, #2            //narrow out row 3

    rshrn v30.8b, v30.8h, #2            //narrow out row 7

    rshrn v24.8b, v24.8h, #2            //narrow out row 4

    rshrn v14.8b, v14.8h, #2            //narrow out row 8


    st1 {v22.8b}, [x0], x3              //store out row 3
    st1 {v30.8b}, [x14], x3             //store out row 7
    st1 {v24.8b}, [x0], x3              //store out row 4
    st1 {v14.8b}, [x14], x3             //store out row 8



    // LDMFD sp!,{x12,pc}
    pop_v_regs
    ret




///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_fullx_fully_8x8_av8()
////
//// Detail Description : This function pastes the reference block in the
////                      current frame buffer. This function is called for
////                      blocks that are not coded and have motion vectors
////                      with full pel resolution.
////
//// Inputs             : x0 - out     : current (destination) block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference block stride (bytes)
////                      x3 - out_wid : current block stride (bytes)
////
//// Registers Used     : x12, x14, v0-v3
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The motion compensated 8x8 block
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/


.global impeg2_mc_fullx_fully_8x8_av8
impeg2_mc_fullx_fully_8x8_av8:


    // STMFD sp!,{x12,x14}
    push_v_regs

///* Plain 8x8 copy: x1/x0 handle rows 1..4, x14/x12 handle rows 5..8 */
    add x14, x1, x2, lsl #2             //x14 = ref + 4*stride

    add x12, x0, x3, lsl #2             //x12 = out + 4*stride


    ld1 {v0.8b}, [x1], x2               //load row 1

    ld1 {v1.8b}, [x14], x2              //load row 5

    ld1 {v2.8b}, [x1], x2               //load row 2

    ld1 {v3.8b}, [x14], x2              //load row 6


    st1 {v0.8b}, [x0], x3               //store row 1

    st1 {v1.8b}, [x12], x3              //store row 5

    st1 {v2.8b}, [x0], x3               //store row 2

    st1 {v3.8b}, [x12], x3              //store row 6


    ld1 {v0.8b}, [x1], x2               //load row 3

    ld1 {v1.8b}, [x14], x2              //load row 7

    ld1 {v2.8b}, [x1], x2               //load row 4

    ld1 {v3.8b}, [x14], x2              //load row 8


    st1 {v0.8b}, [x0], x3               //store row 3

    st1 {v1.8b}, [x12], x3              //store row 7

    st1 {v2.8b}, [x0], x3               //store row 4

    st1 {v3.8b}, [x12], x3              //store row 8


    // LDMFD sp!,{x12,pc}
    pop_v_regs
    ret




///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_interpolate_av8()
////
//// Detail Description : Interpolates two buffers and adds pred
////
//// Inputs             : x0 - pointer to src1
////                      x1 - pointer to src2
////                      x2 - dest buf
////                      x3 - dst stride
//// Registers Used     : x12, v0-v15
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The motion compensated block
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/


.global impeg2_interpolate_av8
impeg2_interpolate_av8:

    push_v_regs

    ldr x4, [x0, #0]                        // x4 = src1 luma pointer
    ldr x5, [x1, #0]                        // x5 = src2 luma pointer
    ldr x7, [x2, #0]                        // x7 = dst luma pointer

    mov x12, #4                             // 4 iterations x 4 rows = 16 luma rows


interp_lumablocks_stride:
    ld1 {v0.16b}, [x4], #16                 // src1 row1 (16 contiguous pixels)
    ld1 {v2.16b}, [x4], #16                 // src1 row2
    ld1 {v4.16b}, [x4], #16                 // src1 row3
    ld1 {v6.16b}, [x4], #16                 // src1 row4

    ld1 {v8.16b}, [x5], #16                 // src2 row1
    ld1 {v10.16b}, [x5], #16                // src2 row2
    ld1 {v12.16b}, [x5], #16                // src2 row3
    ld1 {v14.16b}, [x5], #16                // src2 row4

    urhadd v0.16b, v0.16b , v8.16b          // row1: (src1 + src2 + 1) >> 1
    urhadd v2.16b, v2.16b , v10.16b         // row2
    urhadd v4.16b, v4.16b , v12.16b         // row3
    urhadd v6.16b, v6.16b , v14.16b         // row4

    st1 {v0.16b}, [x7], x3                  // store row1 (strided)
    st1 {v2.16b}, [x7], x3                  // store row2
    st1 {v4.16b}, [x7], x3                  // store row3
    st1 {v6.16b}, [x7], x3                  // store row4

    subs x12, x12, #1
    bne interp_lumablocks_stride


    lsr x3, x3, #1                          // chroma stride = luma stride >> 1

    ldr x4, [x0, #8]                        // x4 = src1 U-plane pointer
    ldr x5, [x1, #8]                        // x5 = src2 U-plane pointer
    ldr x7 , [x2, #8]                       // x7 = dst U-plane pointer

    mov x12, #2                             // first pass U, second pass V


// chroma blocks: 8x8 plane, two 8-byte rows per load

interp_chromablocks_stride:
    ld1 {v0.8b, v1.8b}, [x4], #16           // src1 rows 1 & 2
    ld1 {v2.8b, v3.8b}, [x4], #16           // src1 rows 3 & 4
    ld1 {v4.8b, v5.8b}, [x4], #16           // src1 rows 5 & 6
    ld1 {v6.8b, v7.8b}, [x4], #16           // src1 rows 7 & 8

    ld1 {v8.8b, v9.8b}, [x5], #16           // src2 rows 1 & 2
    ld1 {v10.8b, v11.8b}, [x5], #16         // src2 rows 3 & 4
    ld1 {v12.8b, v13.8b}, [x5], #16         // src2 rows 5 & 6
    ld1 {v14.8b, v15.8b}, [x5], #16         // src2 rows 7 & 8

    // the .16b form also averages the upper register halves, but only the
    // low 8 bytes of each register are stored below
    urhadd v0.16b, v0.16b , v8.16b          // rows 1 & 2
    urhadd v1.16b, v1.16b , v9.16b
    urhadd v2.16b, v2.16b , v10.16b         // rows 3 & 4
    urhadd v3.16b, v3.16b , v11.16b
    urhadd v4.16b, v4.16b , v12.16b         // rows 5 & 6
    urhadd v5.16b, v5.16b , v13.16b
    urhadd v6.16b, v6.16b , v14.16b         // rows 7 & 8
    urhadd v7.16b, v7.16b , v15.16b

    st1 {v0.8b}, [x7], x3                   // row1
    st1 {v1.8b}, [x7], x3                   // row2
    st1 {v2.8b}, [x7], x3                   // row3
    st1 {v3.8b}, [x7], x3                   // row4
    st1 {v4.8b}, [x7], x3                   // row5
    st1 {v5.8b}, [x7], x3                   // row6
    st1 {v6.8b}, [x7], x3                   // row7
    st1 {v7.8b}, [x7], x3                   // row8

    // advance the plane pointers to V for the second pass (the extra
    // reload after the V pass is harmless - the loop exits)
    ldr x4, [x0, #16]                       // src1 V-plane pointer
    ldr x5, [x1, #16]                       // src2 V-plane pointer
    ldr x7, [x2, #16]                       // dst V-plane pointer

    subs x12, x12, #1
    bne interp_chromablocks_stride

    pop_v_regs
    ret




diff --git a/common/armv8/impeg2_mem_func.s b/common/armv8/impeg2_mem_func.s
new file mode 100644
index 0000000..f0bb590
--- /dev/null
+++ b/common/armv8/impeg2_mem_func.s
@@ -0,0 +1,181 @@
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore
//*/

///*
////----------------------------------------------------------------------------
//// File Name          : impeg2_mem_func.s
////
//// Description        : This file has memory initialisation (memset)
////                      routines for the MPEG2 decoder on the armv8
////                      (Neon) platform
////
//// Reference Document :
////
//// Revision History   :
////        Date            Author                 Detail Description
////   ------------    ----------------    ----------------------------------
////   18 jun 2010     S Hamsalekha        Created
////
////-------------------------------------------------------------------------
//*/

///*
//// ----------------------------------------------------------------------------
//// Include Files
//// ----------------------------------------------------------------------------
//*/
// PRESERVE8
.text
.include "impeg2_neon_macros.s"
///*
//// ----------------------------------------------------------------------------
//// Struct/Union Types and Define
//// ----------------------------------------------------------------------------
//*/


///*
//// ----------------------------------------------------------------------------
//// Static Global Data section variables
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------


///*
//// ----------------------------------------------------------------------------
//// Static Prototype Functions
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Exported functions
//// ----------------------------------------------------------------------------
//*/

///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_memset_8bit_8x8_block_av8()
+//// +//// Detail Description : This routine intialises the Block matrix buffer contents to a +//// particular Value. This function also assumes the buffer size +//// to be set is 64 Bytes fixed. It also assumes that blk matrix +//// used is 64 bit aligned. +//// +//// Inputs : pi2_blk_mat : Block Pointer +//// u2_val : Value with which the block is initialized +//// +//// Registers Used : v0 +//// +//// Stack Usage : 64 bytes +//// +//// Outputs : Block Matrix Iniliazed to given value +//// +//// Return Data : None +//// +//// Programming Note : This implementation assumes that blk matrix buffer +//// is 128 bit aligned +////----------------------------------------------------------------------------- +//*/ +.global impeg2_memset_8bit_8x8_block_av8 +impeg2_memset_8bit_8x8_block_av8: + push_v_regs + +// ADD x3,x0,#WIDTH_X_SIZE @//x3 is another copy address offsetted + + dup v0.8b, w1 ////x1 is the 8-bit value to be set into + + st1 {v0.8b}, [x0], x2 ////Store the row 1 + st1 {v0.8b}, [x0], x2 ////Store the row 2 + st1 {v0.8b}, [x0], x2 ////Store the row 3 + st1 {v0.8b}, [x0], x2 ////Store the row 4 + st1 {v0.8b}, [x0], x2 ////Store the row 5 + st1 {v0.8b}, [x0], x2 ////Store the row 6 + st1 {v0.8b}, [x0], x2 ////Store the row 7 + st1 {v0.8b}, [x0], x2 ////Store the row 8 + + pop_v_regs + ret + + + + + + +///* +////--------------------------------------------------------------------------- +//// Function Name : impeg2_memset0_16bit_8x8_linear_block_av8() +//// +//// Detail Description : memsets resudual buf to 0 +//// +//// Inputs : x0 - pointer to y +//// x1 - pointer to u +//// x2 - pointer to v +//// Registers Used : v0 + +//// +//// Stack Usage : 64 bytes +//// +//// Outputs : The Motion Compensated Block +//// +//// Return Data : None +//// +//// Programming Note : <program limitation> +////----------------------------------------------------------------------------- +//*/ + + + +.global impeg2_memset0_16bit_8x8_linear_block_av8 + + 
+impeg2_memset0_16bit_8x8_linear_block_av8: + + push_v_regs + + movi v0.8h, #0 + + //Y data + + st1 {v0.8h} , [x0], #16 //row1 + + st1 {v0.8h} , [x0], #16 //row2 + + st1 {v0.8h} , [x0], #16 //row3 + + st1 {v0.8h} , [x0], #16 //row4 + + st1 {v0.8h} , [x0], #16 //row5 + + st1 {v0.8h} , [x0], #16 //row6 + + st1 {v0.8h} , [x0], #16 //row7 + + st1 {v0.8h} , [x0], #16 //row8 + + + + pop_v_regs + ret + + + + diff --git a/common/armv8/impeg2_neon_macros.s b/common/armv8/impeg2_neon_macros.s new file mode 100644 index 0000000..452ba45 --- /dev/null +++ b/common/armv8/impeg2_neon_macros.s @@ -0,0 +1,58 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +//******************************************************************************* +//* @file +//* impeg2_neon_macros.s +//* +//* @brief +//* Contains assembly macros +//* +//* @author +//* Naveen SR +//* +//* @par List of Functions: +//* +//* +//* @remarks +//* None +//* +//******************************************************************************* + + +.macro push_v_regs + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! 
.endm
// Restores d8-d15 in the reverse order of push_v_regs.
.macro pop_v_regs
    ldp d14, d15, [sp], #16
    ldp d12, d13, [sp], #16
    ldp d10, d11, [sp], #16
    ldp d8, d9, [sp], #16
.endm

// Swap two general-purpose registers in place using the XOR trick.
// NOTE(review): if \reg1 and \reg2 name the same register, both end up 0 —
// callers must pass distinct registers.
.macro swp reg1, reg2
    eor \reg1, \reg1, \reg2
    eor \reg2, \reg1, \reg2
    eor \reg1, \reg1, \reg2
.endm

diff --git a/common/armv8/impeg2_platform_macros.h b/common/armv8/impeg2_platform_macros.h
new file mode 100644
index 0000000..ff31034
--- /dev/null
+++ b/common/armv8/impeg2_platform_macros.h
@@ -0,0 +1,49 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
#ifndef __IMPEG2_PLATFORM_MACROS_H__
#define __IMPEG2_PLATFORM_MACROS_H__

/* Reverse the byte order of a 32-bit word (little <-> big endian). */
#define CONV_LE_TO_BE(u4_temp2,u4_temp1) u4_temp2 = \
                    (u4_temp1 << 24) |               \
                    ((u4_temp1 & 0xff00) << 8) |     \
                    ((u4_temp1 & 0xff0000) >> 8) |   \
                    (u4_temp1 >> 24);

/* Count leading zeros; defined as 32 for an input of 0
   (__builtin_clz(0) alone would be undefined). */
static __inline UWORD32 CLZ(UWORD32 u4_word)
{
    if(u4_word)
        return (__builtin_clz(u4_word));
    else
        return 32;
}

/* NOTE(review): these CLIP macros evaluate x multiple times and the
   ternary is not wrapped in outer parentheses — use them only in simple
   assignments (e.g. y = CLIP_U8(x);), never inside larger expressions. */
#define CLIP_U8(x) ((x) > 255) ? (255) : (((x) < 0) ? (0) : (x))
#define CLIP_S8(x) ((x) > 127) ? (127) : (((x) < -128) ? (-128) : (x))

#define CLIP_U12(x) ((x) > 4095) ? (4095) : (((x) < 0) ? (0) : (x))
#define CLIP_S12(x) ((x) > 2047) ? (2047) : (((x) < -2048) ? (-2048) : (x))

#define CLIP_U16(x) ((x) > 65535) ? (65535) : (((x) < 0) ? (0) : (x))
/* NOTE(review): despite the name, this clips to [-65536, 65535], not the
   signed 16-bit range [-32768, 32767] — confirm intended range before use. */
#define CLIP_S16(x) ((x) > 65535) ? (65535) : (((x) < -65536) ? (-65536) : (x))

#define INLINE
#define PLD(x) __pld(x)

#endif /* __IMPEG2_PLATFORM_MACROS_H__ */
diff --git a/common/impeg2_buf_mgr.c b/common/impeg2_buf_mgr.c
new file mode 100644
index 0000000..c4aca4a
--- /dev/null
+++ b/common/impeg2_buf_mgr.c
@@ -0,0 +1,411 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* impeg2_buf_mgr.c +* +* @brief +* Contains function definitions for buffer management +* +* @author +* Srinivas T +* +* @par List of Functions: +* - impeg2_buf_mgr_init() +* - impeg2_buf_mgr_add() +* - impeg2_buf_mgr_get_next_free() +* - impeg2_buf_mgr_check_free() +* - impeg2_buf_mgr_release() +* - impeg2_buf_mgr_set_status() +* - impeg2_buf_mgr_get_status() +* - impeg2_buf_mgr_get_buf() +* - impeg2_buf_mgr_get_num_active_buf() +* +* @remarks +* None +* +******************************************************************************* +*/ +#include <stdio.h> +#include <stdlib.h> +#include "iv_datatypedef.h" +#include "impeg2_defs.h" +#include "impeg2_buf_mgr.h" + + + +/** +******************************************************************************* +* +* @brief +* Buffer manager initialization function. +* +* @par Description: +* Initializes the buffer manager structure +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ + +void impeg2_buf_mgr_init( + buf_mgr_t *ps_buf_mgr) +{ + WORD32 id; + + ps_buf_mgr->u4_max_buf_cnt = BUF_MGR_MAX_CNT; + ps_buf_mgr->u4_active_buf_cnt = 0; + + for(id = 0; id < BUF_MGR_MAX_CNT; id++) + { + ps_buf_mgr->au4_status[id] = 0; + ps_buf_mgr->apv_ptr[id] = NULL; + } +} + + +/** +******************************************************************************* +* +* @brief +* Adds and increments the buffer and buffer count. 
+* +* @par Description: +* Adds a buffer to the buffer manager if it is not already present and +* increments the active buffer count +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] pv_ptr +* Pointer to the buffer to be added +* +* @returns Returns 0 on success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 impeg2_buf_mgr_add( + buf_mgr_t *ps_buf_mgr, + void *pv_ptr, + WORD32 i4_buf_id) +{ + + /* Check if buffer ID is within allowed range */ + if(i4_buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt) + { + return (-1); + } + + /* Check if the current ID is being used to hold some other buffer */ + if((ps_buf_mgr->apv_ptr[i4_buf_id] != NULL) && + (ps_buf_mgr->apv_ptr[i4_buf_id] != pv_ptr)) + { + return (-1); + } + ps_buf_mgr->apv_ptr[i4_buf_id] = pv_ptr; + + return 0; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the next free buffer. 
+* +* @par Description: +* Returns the next free buffer available and sets the corresponding status +* to DEC +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] pi4_buf_id +* Pointer to the id of the free buffer +* +* @returns Pointer to the free buffer +* +* @remarks +* None +* +******************************************************************************* +*/ +void* impeg2_buf_mgr_get_next_free( + buf_mgr_t *ps_buf_mgr, + WORD32 *pi4_buf_id) +{ + WORD32 id; + void *pv_ret_ptr; + + pv_ret_ptr = NULL; + for(id = 0; id < (WORD32)ps_buf_mgr->u4_max_buf_cnt; id++) + { + /* Check if the buffer is non-null and status is zero */ + if((ps_buf_mgr->au4_status[id] == 0) && (ps_buf_mgr->apv_ptr[id])) + { + *pi4_buf_id = id; + /* DEC is set to 1 */ + ps_buf_mgr->au4_status[id] = 1; + pv_ret_ptr = ps_buf_mgr->apv_ptr[id]; + break; + } + } + + return pv_ret_ptr; +} + + +/** +******************************************************************************* +* +* @brief +* Checks the buffer manager for free buffers available. +* +* @par Description: +* Checks if there are any free buffers available +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @returns Returns 0 if available, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 impeg2_buf_mgr_check_free( + buf_mgr_t *ps_buf_mgr) +{ + UWORD32 id; + + for(id = 0; id < ps_buf_mgr->u4_max_buf_cnt; id++) + { + if((ps_buf_mgr->au4_status[id] == 0) && + (ps_buf_mgr->apv_ptr[id])) + { + return 1; + } + } + + return 0; + +} + + +/** +******************************************************************************* +* +* @brief +* Resets the status bits. 
+* +* @par Description: +* resets the status bits that the mask contains (status corresponding to +* the id) +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer status to be released +* +* @param[in] mask +* Contains the bits that are to be reset +* +* @returns 0 if success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 impeg2_buf_mgr_release( + buf_mgr_t *ps_buf_mgr, + WORD32 i4_buf_id, + UWORD32 u4_mask) +{ + /* If the given id is pointing to an id which is not yet added */ + if(i4_buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt) + { + return (-1); + } + + if(0 == (ps_buf_mgr->au4_status[i4_buf_id] & u4_mask)) + { + return (-1); + } + + ps_buf_mgr->au4_status[i4_buf_id] &= ~u4_mask; + + /* If both the REF and DISP are zero, DEC is set to zero */ + if(ps_buf_mgr->au4_status[i4_buf_id] == 1) + { + ps_buf_mgr->au4_status[i4_buf_id] = 0; + } + + return 0; +} + + +/** +******************************************************************************* +* +* @brief +* Sets the status bit. 
+* +* @par Description: +* sets the status bits that the mask contains (status corresponding to the +* id) +* +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer whose status needs to be modified +* +* +* @param[in] mask +* Contains the bits that are to be set +* +* @returns 0 if success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 impeg2_buf_mgr_set_status( + buf_mgr_t *ps_buf_mgr, + WORD32 i4_buf_id, + UWORD32 u4_mask) +{ + if(i4_buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt) + { + return (-1); + } + + + if((ps_buf_mgr->au4_status[i4_buf_id] & u4_mask) != 0) + { + return (-1); + } + + ps_buf_mgr->au4_status[i4_buf_id] |= u4_mask; + return 0; +} + + +/** +******************************************************************************* +* +* @brief +* Returns the status of the buffer. +* +* @par Description: +* Returns the status of the buffer corresponding to the id +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer status required +* +* @returns Status of the buffer corresponding to the id +* +* @remarks +* None +* +******************************************************************************* +*/ +UWORD32 impeg2_buf_mgr_get_status( + buf_mgr_t *ps_buf_mgr, + WORD32 i4_buf_id) +{ + return ps_buf_mgr->au4_status[i4_buf_id]; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the buffer from the buffer manager +* +* @par Description: +* Returns the pointer to the buffer corresponding to the id +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer required +* +* @returns Pointer to the buffer required +* +* @remarks +* None +* +******************************************************************************* +*/ +void* impeg2_buf_mgr_get_buf( + buf_mgr_t *ps_buf_mgr, 
+ WORD32 i4_buf_id) +{ + return ps_buf_mgr->apv_ptr[i4_buf_id]; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the no.of active buffer +* +* @par Description: +* Return the number of active buffers in the buffer manager +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @returns number of active buffers +* +* @remarks +* None +* +******************************************************************************* +*/ +UWORD32 impeg2_buf_mgr_get_num_active_buf( + buf_mgr_t *ps_buf_mgr) +{ + return ps_buf_mgr->u4_max_buf_cnt; +} diff --git a/common/impeg2_buf_mgr.h b/common/impeg2_buf_mgr.h new file mode 100644 index 0000000..6b1cbef --- /dev/null +++ b/common/impeg2_buf_mgr.h @@ -0,0 +1,115 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  impeg2_buf_mgr.h
*
* @brief
*  Function declarations used for buffer management
*
* @author
*  Srinivas T
*
* @remarks
*  None
*
*******************************************************************************
*/
#ifndef _IMPEG2_BUF_MGR_H_
#define _IMPEG2_BUF_MGR_H_

#define BUF_MGR_MAX_CNT 64

/* Status bits kept per buffer slot */
#define BUF_MGR_DEC 1
#define BUF_MGR_REF (1 << 1)
#define BUF_MGR_DISP (1 << 2)

typedef struct
{
    /**
     * max_buf_cnt - number of slots managed (set to BUF_MGR_MAX_CNT)
     */
    UWORD32 u4_max_buf_cnt;

    /**
     * active_buf_cnt
     * NOTE(review): initialised to 0 but not updated by the .c file's
     * add/release paths — do not rely on it.
     */
    UWORD32 u4_active_buf_cnt;
    /**
     * au4_status[BUF_MGR_MAX_CNT] - per-slot status word
     */
    UWORD32 au4_status[BUF_MGR_MAX_CNT];
    /* The last three bits of status are:  */
    /* Bit 0 - DEC  */
    /* Bit 1 - REF  */
    /* Bit 2 - DISP */

    /* Per-slot buffer pointers; NULL means the slot is unused */
    void *apv_ptr[BUF_MGR_MAX_CNT];
}buf_mgr_t;

// initializes the buffer API structure
void impeg2_buf_mgr_init(
                buf_mgr_t *ps_buf_mgr);

// Add buffer to buffer manager. 0: success, -1: fail (buf_id out of range,
// or the slot already holds a different buffer)
WORD32 impeg2_buf_mgr_add(
                buf_mgr_t *ps_buf_mgr,
                void *pv_ptr,
                WORD32 buf_id);

// returns the first registered free buffer and sets its status to DEC
void* impeg2_buf_mgr_get_next_free(
                buf_mgr_t *ps_buf_mgr,
                WORD32 *pi4_id);

// checks if there are any free buffers: 1 if available, 0 otherwise
WORD32 impeg2_buf_mgr_check_free(
                buf_mgr_t *ps_buf_mgr);

// mask holds who released it: DISP:REF:DEC
WORD32 impeg2_buf_mgr_release(
                buf_mgr_t *ps_buf_mgr,
                WORD32 id,
                UWORD32 mask);

// sets the status to one or all of DISP:REF:DEC
WORD32 impeg2_buf_mgr_set_status(
                buf_mgr_t *ps_buf_mgr,
                WORD32 id,
                UWORD32 mask);

// Gets status of the buffer
UWORD32 impeg2_buf_mgr_get_status(
                buf_mgr_t *ps_buf_mgr,
                WORD32 id);

// pass the ID - buffer will be returned
void* impeg2_buf_mgr_get_buf(
                buf_mgr_t *ps_buf_mgr,
                WORD32 id);

// returns u4_max_buf_cnt (see the .c implementation)
UWORD32 impeg2_buf_mgr_get_num_active_buf(
                buf_mgr_t *ps_buf_mgr);



#endif //_IMPEG2_BUF_MGR_H_
diff --git a/common/impeg2_defs.h b/common/impeg2_defs.h
new file mode 100644
index 0000000..f1523f2
--- /dev/null
+++ b/common/impeg2_defs.h
@@ -0,0 +1,331 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +#ifndef __IMPEG2_DEFS_H__ +#define __IMPEG2_DEFS_H__ + +#include <assert.h> + +/* Decoder needs at least 4 reference buffers in order to support format conversion in a thread and +to support B pictures. Because of format conversion in a thread, codec delay is now 2 frames instead of 1. +To reduce this delay, format conversion has to wait for MB status before converting for B pictures. +To avoid this check the delay is increased to 2 and hence number of reference frames minimum is 4 */ +#define NUM_INT_FRAME_BUFFERS 4 + + +#define MAX_WIDTH 4096 +#define MAX_HEIGHT 2160 + +#define MIN_WIDTH 16 +#define MIN_HEIGHT 16 + + +#define MAX_FRM_SIZE (MAX_WIDTH * MAX_HEIGHT * 2) /* Supports only 420P and 422ILE */ + +#define DEC_ORDER 0 + +#define MAX_BITSTREAM_BUFFER_SIZE 2000 * 1024 + + +/****************************************************************************** +* MPEG2 Start code and other code definitions +*******************************************************************************/ +#define START_CODE_PREFIX 0x000001 +#define SEQUENCE_HEADER_CODE 0x000001B3 +#define EXTENSION_START_CODE 0x000001B5 +#define USER_DATA_START_CODE 0x000001B2 +#define GOP_START_CODE 0x000001B8 +#define PICTURE_START_CODE 0x00000100 +#define SEQUENCE_END_CODE 0x000001B7 +#define RESERVED_START_CODE 0x000001B0 +#define MB_ESCAPE_CODE 0x008 + +/****************************************************************************** +* MPEG2 Length of various codes definitions +*******************************************************************************/ +#define START_CODE_LEN 32 +#define START_CODE_PREFIX_LEN 24 +#define MB_ESCAPE_CODE_LEN 11 +#define EXT_ID_LEN 4 +#define MB_QUANT_SCALE_CODE_LEN 5 +#define MB_DCT_TYPE_LEN 1 +#define MB_MOTION_TYPE_LEN 2 +#define BYTE_LEN 8 + 
/******************************************************************************
* MPEG1 code definitions
*******************************************************************************/
#define MB_STUFFING_CODE          0x00F

/******************************************************************************
* MPEG1 Length of various codes definitions (in bits)
*******************************************************************************/
#define MB_STUFFING_CODE_LEN      11

/******************************************************************************
* MPEG2 MB definitions
*******************************************************************************/
#define MPEG2_INTRA_MB            0x04
#define MPEG2_INTRAQ_MB           0x44
#define MPEG2_INTER_MB            0x28
#define MB_MOTION_BIDIRECT        0x30
#define MB_INTRA_OR_PATTERN       0x0C

/******************************************************************************
* Tools definitions
*******************************************************************************/
#define SPATIAL_SCALABILITY       0x01
#define TEMPORAL_SCALABILITY      0x03

/******************************************************************************
* Extension IDs definitions
*******************************************************************************/
#define SEQ_DISPLAY_EXT_ID             0x02
#define SEQ_SCALABLE_EXT_ID            0x05
#define QUANT_MATRIX_EXT_ID            0x03
#define COPYRIGHT_EXT_ID               0x04
#define PIC_DISPLAY_EXT_ID             0x07
#define PIC_SPATIAL_SCALABLE_EXT_ID    0x09
#define PIC_TEMPORAL_SCALABLE_EXT_ID   0x0A
#define CAMERA_PARAM_EXT_ID            0x0B
#define ITU_T_EXT_ID                   0x0C
/******************************************************************************
* Extension IDs Length definitions (in bits)
*******************************************************************************/
#define CAMERA_PARAMETER_EXTENSION_LEN    377
#define COPYRIGHT_EXTENSION_LEN           88
#define GROUP_OF_PICTURE_LEN              59


/******************************************************************************
* MPEG2 Picture structure definitions
*******************************************************************************/
#define TOP_FIELD                 1
#define BOTTOM_FIELD              2
#define FRAME_PICTURE             3

/******************************************************************************
* MPEG2 Profile definitions
*******************************************************************************/
#define MPEG2_SIMPLE_PROFILE      0x05
#define MPEG2_MAIN_PROFILE        0x04

/******************************************************************************
* MPEG2 Level definitions
*******************************************************************************/
#define MPEG2_LOW_LEVEL           0x0a
#define MPEG2_MAIN_LEVEL          0x08

/******************************************************************************
* MPEG2 Prediction types
*******************************************************************************/
#define FIELD_PRED                0
#define FRAME_PRED                1
#define DUAL_PRED                 2
#define RESERVED                  -1
#define MC_16X8_PRED              3

/*****************************************************************************
* MPEG2 Motion vector format
******************************************************************************/
#define FIELD_MV                  0
#define FRAME_MV                  1

/******************************************************************************/
/* General Video related definitions                                          */
/******************************************************************************/

#define BLK_SIZE            8
#define NUM_COEFFS          ((BLK_SIZE)*(BLK_SIZE))
#define LUMA_BLK_SIZE       (2 * (BLK_SIZE))
#define CHROMA_BLK_SIZE     (BLK_SIZE)
#define BLOCKS_IN_MB        6
#define MB_SIZE             16
#define MB_CHROMA_SIZE      8
#define NUM_PELS_IN_BLOCK   64
#define NUM_LUMA_BLKS       4
#define NUM_CHROMA_BLKS     2
#define MAX_COLR_COMPS      3
#define Y_LUMA              0
#define U_CHROMA            1
#define V_CHROMA            2
#define MB_LUMA_MEM_SIZE    ((MB_SIZE) * (MB_SIZE))
#define MB_CHROMA_MEM_SIZE  ((MB_SIZE/2) * (MB_SIZE/2))

#define BITS_IN_INT         32
/******************************************************************************/
/* MPEG2 Motion compensation related definitions (MB + guard, in pixels)      */
/******************************************************************************/
#define REF_FRM_MB_WIDTH     18
#define REF_FRM_MB_HEIGHT    18
#define REF_FLD_MB_HEIGHT    10
#define REF_FLD_MB_WIDTH     18

/******************************************************************************/
/* Maximum number of bits per MB                                              */
/******************************************************************************/
#define I_MB_BIT_SIZE  90
#define P_MB_BIT_SIZE  90
#define B_MB_BIT_SIZE  150

/******************************************************************************/
/* Aspect ratio related definitions                                           */
/******************************************************************************/
#define MPG1_NTSC_4_3    0x8
#define MPG1_PAL_4_3     0xc
#define MPG1_NTSC_16_9   0x6
#define MPG1_PAL_16_9    0x3
#define MPG1_1_1         0x1

#define MPG2_4_3         0x2
#define MPG2_16_9        0x3
#define MPG2_1_1         0x1

/******************************************************************************/
/* Inverse Quantizer Output range                                             */
/******************************************************************************/
#define IQ_OUTPUT_MAX    2047
#define IQ_OUTPUT_MIN    -2048

/******************************************************************************/
/* IDCT Output range                                                          */
/******************************************************************************/
#define IDCT_OUTPUT_MAX  255
#define IDCT_OUTPUT_MIN  -256

/******************************************************************************/
/* Output pixel range                                                         */
/******************************************************************************/
#define PEL_VALUE_MAX    255
#define PEL_VALUE_MIN    0

/******************************************************************************/
/* inv scan types                                                             */
/******************************************************************************/
#define ZIG_ZAG_SCAN     0
#define
VERTICAL_SCAN 1 + +/******************************************************************************/ +/* Related VLD codes */ +/******************************************************************************/ +#define ESC_CODE_VALUE 0x0058 +#define EOB_CODE_VALUE 0x07d0 + +#define END_OF_BLOCK 0x01 +#define ESCAPE_CODE 0x06 + +#define END_OF_BLOCK_ZERO 0x01ff +#define END_OF_BLOCK_ONE 0x01ff + +/******************** Idct Specific ***************/ +#define TRANS_SIZE_8 8 +#define IDCT_STG1_SHIFT 12 +#define IDCT_STG2_SHIFT 16 + +#define IDCT_STG1_ROUND ((1 << IDCT_STG1_SHIFT) >> 1) +#define IDCT_STG2_ROUND ((1 << IDCT_STG2_SHIFT) >> 1) + + +/****************************************************************************** +* Sample Version Definitions +*******************************************************************************/ +#define SAMPLE_VERS_MAX_FRAMES_DECODE 999 + +#define MAX_FRAME_BUFFER 7 + +/* vop coding type */ +typedef enum +{ + I_PIC = 1, + P_PIC, + B_PIC, + D_PIC +} e_pic_type_t; + +typedef enum +{ + MPEG_2_VIDEO, + MPEG_1_VIDEO +} e_video_type_t; + +typedef enum +{ + FORW, + BACK, + BIDIRECT +} e_pred_direction_t; + +typedef enum +{ + TOP, + BOTTOM +} e_field_t; + +/* Motion vectors (first/second) */ +enum +{ + FIRST, + SECOND, + THIRD, + FOURTH +}; + +enum +{ + MV_X, + MV_Y +}; + +/* Enumeration defining the various kinds of interpolation possible in +motion compensation */ +typedef enum +{ + FULL_XFULL_Y, + FULL_XHALF_Y, + HALF_XFULL_Y, + HALF_XHALF_Y +} e_sample_type_t; +typedef enum +{ + /* Params of the reference buffer used as input to MC */ + /* frame prediction in P frame picture */ + MC_FRM_FW_OR_BK_1MV, + /* field prediction in P frame picture */ + MC_FRM_FW_OR_BK_2MV, + /* frame prediction in B frame picture */ + MC_FRM_FW_AND_BK_2MV, + /* field prediction in B frame picture */ + MC_FRM_FW_AND_BK_4MV, + /* dual prime prediction in P frame picture */ + MC_FRM_FW_DUAL_PRIME_1MV, + /* frame prediction in P field picture */ + 
MC_FLD_FW_OR_BK_1MV, + /* 16x8 prediction in P field picture */ + MC_FLD_FW_OR_BK_2MV, + /* field prediction in B field picture */ + MC_FLD_FW_AND_BK_2MV, + /* 16x8 prediction in B field picture */ + MC_FLD_FW_AND_BK_4MV, + /* dual prime prediction in P field picture */ + MC_FLD_FW_DUAL_PRIME_1MV, +} e_mb_type_t; + +#endif /* __IMPEG2_DEFS_H__ */ + diff --git a/common/impeg2_disp_mgr.c b/common/impeg2_disp_mgr.c new file mode 100644 index 0000000..f5ede84 --- /dev/null +++ b/common/impeg2_disp_mgr.c @@ -0,0 +1,172 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* impeg2_disp_mgr.c +* +* @brief +* Contains function definitions for display management +* +* @author +* Srinivas T +* +* @par List of Functions: +* - impeg2_disp_mgr_init() +* - impeg2_disp_mgr_add() +* - impeg2_disp_mgr_get() +* +* @remarks +* None +* +******************************************************************************* +*/ +#include <stdio.h> +#include <stdlib.h> +#include "iv_datatypedef.h" +#include "impeg2_defs.h" +#include "impeg2_disp_mgr.h" + +/** +******************************************************************************* +* +* @brief +* Initialization function for display buffer manager +* +* @par Description: +* Initializes the display buffer management structure +* +* @param[in] ps_disp_mgr +* Pointer to the display buffer management structure +* +* @returns none +* +* @remarks +* None +* +******************************************************************************* +*/ +void impeg2_disp_mgr_init( + disp_mgr_t *ps_disp_mgr) +{ + WORD32 id; + + + for(id = 0; id < DISP_MGR_MAX_CNT; id++) + { + ps_disp_mgr->apv_ptr[id] = NULL; + } + + ps_disp_mgr->i4_wr_idx = 0; + ps_disp_mgr->i4_rd_idx = 0; +} + + +/** +******************************************************************************* +* +* @brief +* Adds a buffer to the display manager +* +* @par Description: +* Adds a buffer to the display buffer manager +* +* @param[in] ps_disp_mgr +* Pointer to the diaplay buffer management structure +* +* @param[in] buf_id +* ID of the display buffer +* +* @param[in] abs_poc +* Absolute POC of the display buffer +* +* @param[in] pv_ptr +* Pointer to the display buffer +* +* @returns 0 if success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 impeg2_disp_mgr_add(disp_mgr_t *ps_disp_mgr, + void *pv_ptr, + WORD32 i4_buf_id) +{ + + + WORD32 id; + id = 
ps_disp_mgr->i4_wr_idx % DISP_MGR_MAX_CNT; + + ps_disp_mgr->apv_ptr[id] = pv_ptr; + ps_disp_mgr->ai4_buf_id[id] = i4_buf_id; + ps_disp_mgr->i4_wr_idx++; + + return 0; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the next buffer +* +* @par Description: +* Gets the next display buffer +* +* @param[in] ps_disp_mgr +* Pointer to the display buffer structure +* +* @param[out] pi4_buf_id +* Pointer to hold buffer id of the display buffer being returned +* +* @returns Pointer to the next display buffer +* +* @remarks +* None +* +******************************************************************************* +*/ +void* impeg2_disp_mgr_get(disp_mgr_t *ps_disp_mgr, WORD32 *pi4_buf_id) +{ + WORD32 id; + + *pi4_buf_id = -1; + + if(ps_disp_mgr->i4_rd_idx < ps_disp_mgr->i4_wr_idx) + { + id = ps_disp_mgr->i4_rd_idx % DISP_MGR_MAX_CNT; + if(NULL == ps_disp_mgr->apv_ptr[id]) + { + return NULL; + } + + *pi4_buf_id = ps_disp_mgr->ai4_buf_id[id]; + + ps_disp_mgr->i4_rd_idx++; + + return ps_disp_mgr->apv_ptr[id]; + } + else + return NULL; + +} diff --git a/common/impeg2_disp_mgr.h b/common/impeg2_disp_mgr.h new file mode 100644 index 0000000..96b01b0 --- /dev/null +++ b/common/impeg2_disp_mgr.h @@ -0,0 +1,67 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* impeg2_disp_mgr.h +* +* @brief +* Function declarations used for display management +* +* @author +* Srinivas T +* +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef _IMPEG2_DISP_MGR_H_ +#define _IMPEG2_DISP_MGR_H_ + +#define DISP_MGR_MAX_CNT 64 +#define DEFAULT_POC 0x7FFFFFFF + +typedef struct +{ + /** + * apv_ptr[DISP_MGR_MAX_CNT] + */ + void *apv_ptr[DISP_MGR_MAX_CNT]; + + WORD32 ai4_buf_id[DISP_MGR_MAX_CNT]; + + WORD32 i4_wr_idx; + + WORD32 i4_rd_idx; +}disp_mgr_t; + +void impeg2_disp_mgr_init( + disp_mgr_t *ps_disp_mgr); + +WORD32 impeg2_disp_mgr_add( + disp_mgr_t *ps_disp_mgr, + void *pv_ptr, + WORD32 i4_buf_id); + +void* impeg2_disp_mgr_get(disp_mgr_t *ps_disp_mgr, WORD32 *pi4_buf_id); + +#endif //_IMPEG2_DISP_MGR_H_ diff --git a/common/impeg2_format_conv.c b/common/impeg2_format_conv.c new file mode 100644 index 0000000..ec0bcfb --- /dev/null +++ b/common/impeg2_format_conv.c @@ -0,0 +1,401 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/* File Name    : impeg2_format_conv.c                                       */
/*                                                                           */
/* Description  : Output format conversion routines: plane-wise YUV420P      */
/*                frame copy, YUV420P -> YUV422 interleaved, and             */
/*                YUV420P -> YUV420SP with VU or UV chroma interleaving.     */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <string.h>

/* User include files */
#include "iv_datatypedef.h"
#include "iv.h"
#include "ithread.h"

#include "iv_datatypedef.h"   /* NOTE(review): duplicate include; harmless, guarded */
#include "impeg2_macros.h"
#include "impeg2_buf_mgr.h"
#include "impeg2_disp_mgr.h"
#include "impeg2_defs.h"
#include "impeg2_platform_macros.h"

#include "impeg2_job_queue.h"
#include "impeg2_format_conv.h"


/*****************************************************************************/
/* impeg2_copy_frm_yuv420p                                                   */
/*                                                                           */
/* Copies a YUV420P frame plane by plane: the Y plane at full width/height,  */
/* the U and V planes at half width and half height, one memcpy per row.     */
/* All strides are in bytes; source and destination may have different       */
/* strides.                                                                  */
/*****************************************************************************/
void impeg2_copy_frm_yuv420p(UWORD8 *pu1_src_y,
                             UWORD8 *pu1_src_u,
                             UWORD8 *pu1_src_v,
                             UWORD8 *pu1_dst_y,
                             UWORD8 *pu1_dst_u,
                             UWORD8 *pu1_dst_v,
                             UWORD32 u4_width,
                             UWORD32 u4_height,
                             UWORD32 u4_src_stride_y,
                             UWORD32 u4_src_stride_u,
                             UWORD32 u4_src_stride_v,
                             UWORD32 u4_dst_stride_y,
                             UWORD32 u4_dst_stride_u,
                             UWORD32 u4_dst_stride_v)
{
    WORD32 i4_cnt;
    WORD32 i4_y_height = (WORD32) u4_height;
    /* Chroma planes are subsampled 2x in both directions (420P) */
    WORD32 i4_uv_height = u4_height >> 1;
    WORD32 i4_uv_width = u4_width >> 1;

    /* Y plane: one full-width row per iteration */
    for(i4_cnt = 0; i4_cnt < i4_y_height; i4_cnt++)
    {
        memcpy(pu1_dst_y, pu1_src_y, u4_width);
        pu1_dst_y += (u4_dst_stride_y);
        pu1_src_y += (u4_src_stride_y);
    }

    /* U plane */
    for(i4_cnt = 0; i4_cnt < i4_uv_height; i4_cnt++)
    {
        memcpy(pu1_dst_u, pu1_src_u, i4_uv_width);
        pu1_dst_u += (u4_dst_stride_u);
        pu1_src_u += (u4_src_stride_u);

    }

    /* V plane */
    for(i4_cnt = 0; i4_cnt < i4_uv_height; i4_cnt++)
    {
        memcpy(pu1_dst_v, pu1_src_v, i4_uv_width);
        pu1_dst_v += (u4_dst_stride_v);
        pu1_src_v += (u4_src_stride_v);

    }

}

/*****************************************************************************/
/* impeg2_fmt_conv_yuv420p_to_yuv422ile                                      */
/*                                                                           */
/* Converts planar YUV420 to YUV422 interleaved. Each pair of luma samples   */
/* is packed with one U and one V sample into a 32-bit word built as         */
/* (((Y1 << 8 | V) << 8 | Y0) << 8) | U, i.e. bytes U0 Y0 V0 Y1 in memory    */
/* on a little-endian target (NOTE(review): byte order in memory assumes    */
/* little-endian — confirm for big-endian builds).                           */
/* Because the source is 420, each chroma row is reused for two consecutive  */
/* luma rows: u1_flag toggles per row, and on even rows the chroma pointers  */
/* are rewound to reuse the same row.                                        */
/*                                                                           */
/* NOTE(review): row offsets are held in UWORD16, which assumes strides and  */
/* width fit (and their differences fit) in 16 bits — confirm for large      */
/* strides.                                                                  */
/*****************************************************************************/

void impeg2_fmt_conv_yuv420p_to_yuv422ile(register UWORD8 *pu1_y,
                                          register UWORD8 *pu1_u,
                                          register UWORD8 *pu1_v,
                                          void *pv_yuv422i,
                                          UWORD32 u4_width,
                                          UWORD32 u4_height,
                                          UWORD32 u4_stride_y,
                                          UWORD32 u4_stride_u,
                                          UWORD32 u4_stride_v,
                                          UWORD32 u4_stride_yuv422i)
{
    /* Declare local variables */
    register WORD16 i,j;
    register UWORD16 u2_offset1,u2_offset2,u2_offset3,u2_offset_yuv422i;
    register UWORD8 u1_y1,u1_uv;
    register UWORD32 u4_pixel;
    register UWORD16 u2_width_cnt;
    register UWORD32 *pu4_yuv422i;

    UWORD8 u1_flag; /* This flag is used to indicate wether the row is even or odd */

    u1_flag=0x0; /* Intialize it with 0 indicating odd row */

    /* Calculate the offsets necessary to make input and output buffers to point next row */
    u2_offset1 = u4_stride_y - u4_width;
    u2_offset2 = u4_stride_u - ((u4_width + 1) >> 1);
    u2_offset3 = u4_stride_v - ((u4_width + 1) >> 1);
    u2_offset_yuv422i = (u4_stride_yuv422i >> 1) -((u4_width + 1) >> 1);

    /* Type cast the output pointer to UWORD32 */
    pu4_yuv422i = (UWORD32 *)pv_yuv422i;

    /* Calculate the loop counter for inner loop */
    u2_width_cnt = u4_width >> 1;

    /* Run the loop for height of input buffer */
    for(i = u4_height; i > 0; i--)
    {
        /* Run the loop for width/2 */
        for(j = u2_width_cnt; j > 0; j--)
        {
            /* Store the value in output buffer in the order U0Y0V0Y1U2Y2V2Y3.... */
            /* Load Y0 */
            u1_y1 = *pu1_y++;
            /* Load Y1 */
            u4_pixel = *pu1_y++;
            /* Load V0 */
            u1_uv = *pu1_v++;
            u4_pixel = (u4_pixel << 8) + u1_uv;
            /* Load U0 */
            u1_uv = *pu1_u++;
            u4_pixel = (u4_pixel << 8) + u1_y1;
            u4_pixel = (u4_pixel << 8) + u1_uv;
            *pu4_yuv422i++ = u4_pixel;
        }
        /* Incase of width is odd number take care of last pixel */
        if(u4_width & 0x1)
        {
            /* Load Y0 */
            u1_y1 = *pu1_y++;
            /* Load V0 */
            u1_uv = *pu1_v++;
            /* Take Y0 as Y1 (duplicate the last luma sample) */
            u4_pixel = u1_y1;
            u4_pixel = (u4_pixel << 8) + u1_uv;
            /* Load U0 */
            u1_uv = *pu1_u++;
            u4_pixel = (u4_pixel << 8) + u1_y1;
            u4_pixel = (u4_pixel << 8) + u1_uv;
            *pu4_yuv422i++ = u4_pixel;
        }
        /* Make the pointers to buffer to point to next row */
        pu1_y = pu1_y + u2_offset1;
        if(!u1_flag)
        {
            /* Even luma row: rewind chroma pointers so the same chroma row   */
            /* is reused for the next luma row (420 vertical subsampling)     */
            pu1_u = pu1_u - ((u4_width + 1) >> 1);
            pu1_v = pu1_v - ((u4_width + 1) >> 1);
        }
        else
        {
            /* Odd luma row: advance the u and v pointers to the next row */
            pu1_u = pu1_u + u2_offset2;
            pu1_v = pu1_v + u2_offset3;
        }

        /* Adjust the output buffer pointer for next row */
        pu4_yuv422i = pu4_yuv422i + u2_offset_yuv422i;
        /* Toggle the flag to convert between odd and even row */
        u1_flag= u1_flag ^ 0x1;
    }
}


/*****************************************************************************/
/* impeg2_fmt_conv_yuv420p_to_yuv420sp_vu                                    */
/*                                                                           */
/* Converts planar YUV420 to semi-planar NV21: copies the Y plane (skipped   */
/* when u4_convert_uv_only is nonzero) and interleaves chroma as V,U pairs   */
/* into pu1_dest_uv. Strides are in bytes.                                   */
/*                                                                           */
/* NOTE(review): when u4_width is odd the inner UV loop writes               */
/* 2*((u4_width+1)>>1) == u4_width+1 bytes per row, but the destination      */
/* pointer advances by u4_dest_stride_uv - u4_width — verify callers always  */
/* pass even widths.                                                         */
/*****************************************************************************/
void impeg2_fmt_conv_yuv420p_to_yuv420sp_vu(UWORD8 *pu1_y, UWORD8 *pu1_u, UWORD8 *pu1_v,
                                            UWORD8 *pu1_dest_y, UWORD8 *pu1_dest_uv,
                                            UWORD32 u4_height, UWORD32 u4_width,UWORD32 u4_stridey,
                                            UWORD32 u4_strideu, UWORD32 u4_stridev,
                                            UWORD32 u4_dest_stride_y, UWORD32 u4_dest_stride_uv,
                                            UWORD32 u4_convert_uv_only
                                            )

{


    UWORD8 *pu1_src,*pu1_dst;
    UWORD8 *pu1_src_u, *pu1_src_v;
    UWORD16 i;
    UWORD32 u2_width_uv;

    UWORD32 u4_dest_inc_y=0, u4_dest_inc_uv=0;


    /* Copy Y buffer row by row */
    pu1_dst = (UWORD8 *)pu1_dest_y;
    pu1_src = (UWORD8 *)pu1_y;

    u4_dest_inc_y =  u4_dest_stride_y;
    u4_dest_inc_uv = u4_dest_stride_uv;

    if(0 == u4_convert_uv_only)
    {
        for(i = 0; i < u4_height; i++)
        {
            memcpy((void *)pu1_dst,(void *)pu1_src, u4_width);
            pu1_dst += u4_dest_inc_y;
            pu1_src += u4_stridey;
        }
    }

    /* Interleave Cb and Cr buffers (V first: NV21 ordering) */
    pu1_src_u = pu1_u;
    pu1_src_v = pu1_v;
    pu1_dst = pu1_dest_uv ;

    /* Chroma plane is half height / half width, rounded up */
    u4_height = (u4_height + 1) >> 1;
    u2_width_uv = (u4_width + 1) >> 1;
    for(i = 0; i < u4_height ; i++)
    {
        UWORD32 j;
        for(j = 0; j < u2_width_uv; j++)
        {
            *pu1_dst++ = *pu1_src_v++;
            *pu1_dst++ = *pu1_src_u++;

        }

        pu1_dst += u4_dest_inc_uv - u4_width;
        pu1_src_u += u4_strideu - u2_width_uv;
        pu1_src_v += u4_stridev - u2_width_uv;
    }
}

/*****************************************************************************/
/* impeg2_fmt_conv_yuv420p_to_yuv420sp_uv                                    */
/*                                                                           */
/* Same as impeg2_fmt_conv_yuv420p_to_yuv420sp_vu, but interleaves chroma    */
/* as U,V pairs (NV12 ordering). See the NOTE(review) on the _vu variant     */
/* about odd widths; it applies here as well.                                */
/*****************************************************************************/
void impeg2_fmt_conv_yuv420p_to_yuv420sp_uv(UWORD8 *pu1_y, UWORD8 *pu1_u, UWORD8 *pu1_v,
                                            UWORD8 *pu1_dest_y, UWORD8 *pu1_dest_uv,
                                            UWORD32 u4_height, UWORD32 u4_width,UWORD32 u4_stridey,
                                            UWORD32 u4_strideu, UWORD32 u4_stridev,
                                            UWORD32 u4_dest_stride_y, UWORD32 u4_dest_stride_uv,
                                            UWORD32 u4_convert_uv_only)

{


    UWORD8 *pu1_src,*pu1_dst;
    UWORD8 *pu1_src_u, *pu1_src_v;
    UWORD16 i;
    UWORD32 u2_width_uv;

    UWORD32 u4_dest_inc_y=0, u4_dest_inc_uv=0;


    /* Copy Y buffer row by row */
    pu1_dst = (UWORD8 *)pu1_dest_y;
    pu1_src = (UWORD8 *)pu1_y;

    u4_dest_inc_y =  u4_dest_stride_y;
    u4_dest_inc_uv = u4_dest_stride_uv;

    if(0 == u4_convert_uv_only)
    {
        for(i = 0; i < u4_height; i++)
        {
            memcpy((void *)pu1_dst,(void *)pu1_src, u4_width);
            pu1_dst += u4_dest_inc_y;
            pu1_src += u4_stridey;
        }
    }

    /* Interleave Cb and Cr buffers (U first: NV12 ordering) */
    pu1_src_u = pu1_u;
    pu1_src_v = pu1_v;
    pu1_dst = pu1_dest_uv ;

    /* Chroma plane is half height / half width, rounded up */
    u4_height = (u4_height + 1) >> 1;
    u2_width_uv = (u4_width + 1) >> 1;
    for(i = 0; i < u4_height ; i++)
    {
        UWORD32 j;
        for(j = 0; j < u2_width_uv; j++)
        {
            *pu1_dst++ = *pu1_src_u++;
            *pu1_dst++ = *pu1_src_v++;
        }

        pu1_dst += u4_dest_inc_uv - u4_width;
        pu1_src_u += u4_strideu - u2_width_uv;
        pu1_src_v += u4_stridev - u2_width_uv;
    }

}

/* ==== common/impeg2_format_conv.h ====
 * Copyright (C) 2015 The Android Open Source Project (Apache-2.0)
 * Licensed under the Apache License, Version 2.0; you may not use
 * this file except in compliance with the License.
 * You may obtain a copy of the License at:
 * http://www.apache.org/licenses/LICENSE-2.0
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/* File Name    : impeg2_format_conv.h                                       */
/*                                                                           */
/* Description  : Constants and function-pointer typedefs for the output     */
/*                format conversion routines (C and ARM NEON variants).      */
/*****************************************************************************/

#ifndef __IMPEG2_FORMAT_CONV_H__
#define __IMPEG2_FORMAT_CONV_H__

/*****************************************************************************/
/* Typedefs                                                                  */
/*****************************************************************************/

/* RGB/gray -> YUV conversion coefficients and rounding constants.
 * NOTE(review): the conversion routines that consume these are not in this
 * translation unit — confirm fixed-point format against their users. */
#define COEFF_0_Y   66
#define COEFF_1_Y   129
#define COEFF_2_Y   25
#define COEFF_0_U   -38
#define COEFF_1_U   -75
#define COEFF_2_U   112
#define COEFF_0_V   112
#define COEFF_1_V   -94
#define COEFF_2_V   -18
#define CONST_RGB_YUV1  4096
#define CONST_RGB_YUV2  32768
#define CONST_GRAY_YUV  128
#define COEF_2_V2_U     0xFFEE0070

/* Packed coefficient pairs (two 16-bit values per 32-bit constant) */
#define COF_2Y_0Y       0X00190042
#define COF_1U_0U       0XFFB5FFDA
#define COF_1V_0V       0XFFA20070

/*****************************************************************************/
/* Enums                                                                     */
/*****************************************************************************/
/* Input picture formats */
typedef enum {
GRAY_SCALE = 0,
YUV444 = 1,
YUV420 = 2,
YUV422H = 3,
YUV422V = 4,
YUV411 = 5,
RGB24 = 6,
RGB24i = 7
}input_format_t;

/*****************************************************************************/
/* Function Declarations                                                     */
/*****************************************************************************/
/* Function types below let the decoder select a C or NEON implementation at
 * runtime through a single function pointer. */
typedef void pf_copy_yuv420p_buf_t(UWORD8 *pu1_src_y,
                                   UWORD8 *pu1_src_u,
                                   UWORD8 *pu1_src_v,
                                   UWORD8 *pu1_dst_y,
                                   UWORD8 *pu1_dst_u,
                                   UWORD8 *pu1_dst_v,
                                   UWORD32 u4_width,
                                   UWORD32 u4_height,
                                   UWORD32 u4_src_stride_y,
                                   UWORD32 u4_src_stride_u,
                                   UWORD32 u4_src_stride_v,
                                   UWORD32 u4_dst_stride_y,
                                   UWORD32 u4_dst_stride_u,
                                   UWORD32 u4_dst_stride_v);

typedef void pf_fmt_conv_yuv420p_to_yuv422ile_t(UWORD8 *pu1_y,
                                                UWORD8 *pu1_u,
                                                UWORD8 *pu1_v,
                                                void *pv_yuv422i,
                                                UWORD32 u4_width,
                                                UWORD32 u4_height,
                                                UWORD32 u4_stride_y,
                                                UWORD32 u4_stride_u,
                                                UWORD32 u4_stride_v,
                                                UWORD32 u4_stride_yuv422i);

typedef void pf_fmt_conv_yuv420p_to_yuv420sp_t(UWORD8 *pu1_y,
                                               UWORD8 *pu1_u,
                                               UWORD8 *pu1_v,
                                               UWORD8 *pu1_dest_y,
                                               UWORD8 *pu1_dest_uv,
                                               UWORD32 u2_height,
                                               UWORD32 u2_width,
                                               UWORD32 u2_stridey,
                                               UWORD32 u2_strideu,
                                               UWORD32 u2_stridev,
                                               UWORD32 u2_dest_stride_y,
                                               UWORD32 u2_dest_stride_uv,
                                               UWORD32 convert_uv_only);

/* C reference implementations (impeg2_format_conv.c) */
pf_copy_yuv420p_buf_t impeg2_copy_frm_yuv420p;
pf_fmt_conv_yuv420p_to_yuv422ile_t impeg2_fmt_conv_yuv420p_to_yuv422ile;
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_vu;
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_uv;

/* ARMv7 NEON (a9q) implementations */
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q;
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q;

/* ARMv8 (av8) implementations */
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8;
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8;


#endif /* __IMPEG2_FORMAT_CONV_H__ */

/* ==== common/impeg2_globals.c ====
 * Copyright (C) 2015 The Android Open Source Project (Apache-2.0)
 * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore +*/ +#include <stdio.h> +#include "iv_datatypedef.h" +#include "iv.h" +#include "impeg2_buf_mgr.h" +#include "impeg2_disp_mgr.h" +#include "impeg2_defs.h" +#include "impeg2_platform_macros.h" +#include "impeg2_globals.h" + +/* Table for converting the quantizer_scale_code to quantizer_scale */ +const UWORD8 gau1_impeg2_non_linear_quant_scale[] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, + 8,10,12,14,16,18,20,22, + 24,28,32,36,40,44,48,52, + 56,64,72,80,88,96,104,112 +}; + + +/* Default quantizer matrix to be used for intra blocks */ +const UWORD8 gau1_impeg2_intra_quant_matrix_default[] = +{ + 8, 16, 19, 22, 26, 27, 29, 34, + 16, 16, 22, 24, 27, 29, 34, 37, + 19, 22, 26, 27, 29, 34, 34, 38, + 22, 22, 26, 27, 29, 34, 37, 40, + 22, 26, 27, 29, 32, 35, 40, 48, + 26, 27, 29, 32, 35, 40, 48, 58, + 26, 27, 29, 34, 38, 46, 56, 69, + 27, 29, 35, 38, 46, 56, 69, 83 +}; + +/* Default quantizer matrix to be used for inter blocks */ +const UWORD8 gau1_impeg2_inter_quant_matrix_default[] = +{ + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16 +}; + +/* Table to perform inverse scan when the scan direction is zigzag */ +const UWORD8 gau1_impeg2_inv_scan_zig_zag[] = +{ + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63 +}; + +/* Table to perform inverse scan when the direction of scanning is vertical */ +const UWORD8 gau1_impeg2_inv_scan_vertical[] = +{ + 0, 8, 16, 24, 1, 9, 2, 10, + 17, 25, 32, 40, 48, 56, 57, 49, + 41, 33, 26, 18, 3, 11, 4, 12, + 19, 27, 34, 42, 50, 58, 35, 43, + 51, 59, 20, 28, 5, 13, 6, 14, + 21, 29, 36, 44, 52, 60, 37, 45, + 53, 61, 22, 30, 7, 15, 23, 31, + 38, 46, 54, 62, 39, 47, 55, 63 
+}; + +/*****************************************************************************/ +/* Table that indicate which interpolation type is to used */ +/*****************************************************************************/ +/* Chroma when motion vector is positive */ +const UWORD16 gau2_impeg2_chroma_interp_mv[][16] = +{ + /* Pos X Pos Y */ + { + 0, 0, 1, 1, + 0, 0, 1, 1, + 2, 2, 3, 3, + 2, 2, 3, 3 + }, + /* Neg X Pos Y */ + { + 0, 1, 1, 0, + 0, 1, 1, 0, + 2, 3, 3, 2, + 2, 3, 3, 2 + }, + /* Pos X Neg Y */ + { + 0, 0, 1, 1, + 2, 2, 3, 3, + 2, 2, 3, 3, + 0, 0, 1, 1 + }, + /* Neg X Neg Y */ + { + 0, 1, 1, 0, + 2, 3, 3, 2, + 2, 3, 3, 2, + 0, 1, 1, 0 + } +}; +/*****************************************************************************/ +/* Input #1 Offset in bytes */ +/*****************************************************************************/ +/* Chroma */ +const UWORD16 gau2_impeg2_chroma_interp_inp1[][16] = +{ + /* Pos X Pos Y */ + { + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 + }, + /* Neg X Pos Y */ + { + 0, 0, 0, 4, + 0, 0, 0, 4, + 0, 0, 0, 4, + 0, 0, 0, 4 + }, + /* Pos X Neg Y */ + { + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 72, 72, 72, 72 + }, + /* Neg X Neg Y */ + { + 0, 0, 0, 4, + 0, 0, 0, 4, + 0, 0, 0, 4, + 72, 72, 72, 76 + } +}; +/* Luma */ +const UWORD16 gau2_impeg2_luma_interp_inp1[] = +{ + 1, 1, 3, 3, + 1, 1, 3, 3, + 37, 37, 39, 39, + 37, 37, 39, 39 +}; +/*****************************************************************************/ +/* Input #2 Offset from Input #1 in bytes */ +/*****************************************************************************/ +/* + FXFY 0, + HXFY 2, + FXHY 36, + HXHY 36 +*/ +const UWORD16 gau2_impeg2_luma_interp_inp2[] = +{ + 0, 2, 0, 2, + 36, 36, 36, 36, + 0, 2, 0, 2, + 36, 36, 36, 36 +}; +const UWORD16 gau2_impeg2_chroma_interp_inp2[] = +{ + /* FXFY */ + 0, + /* HXFY */ + 4, + /* FXHY */ + 72, + /* HXHY */ + 72 +}; + +/*****************************************************************************/ 
/* Corresponds to Table 6-4 frame_rate_value of the standard                 */
/* Each entry is {numerator, denominator}: fps = entry[0] / entry[1].        */
/*****************************************************************************/
/*
    frame_rate_code     frame_rate_value

    0000                Forbidden
    0001                24 000 / 1001
    0010                24
    0011                25
    0100                30 000 / 1001
    0101                30
    0110                50
    0111                60 000 / 1001
    1000                60
    1001                Reserved
    ....
    1111                Reserved
*/
const UWORD16 gau2_impeg2_frm_rate_code[][2] =
{
    {1    , 1},     /* Forbidden */
    {24000, 1001},
    {24000, 1000},
    {25000, 1000},
    {30000, 1001},
    {30000, 1000},
    {50000, 1000},
    {60000, 1001},
    {60000, 1000}
    /* Rest reserved */
};

/* 8x8 IDCT basis matrix in Q15 (e.g. 23170 ~= 0.7071 * 2^15); row i holds   */
/* the i-th frequency's contribution to each of the 8 output samples.        */
const WORD16 gai2_impeg2_idct_q15[] =
{
    23170,  23170,  23170,  23170,  23170,  23170,  23170,  23170,
    32138,  27246,  18205,   6393,  -6393, -18205, -27246, -32138,
    30274,  12540, -12540, -30274, -30274, -12540,  12540,  30274,
    27246,  -6393, -32138, -18205,  18205,  32138,   6393, -27246,
    23170, -23170, -23170,  23170,  23170, -23170, -23170,  23170,
    18205, -32138,   6393,  27246, -27246,  -6393,  32138, -18205,
    12540, -30274,  30274, -12540, -12540,  30274, -30274,  12540,
     6393, -18205,  27246, -32138,  32138, -27246,  18205,  -6393,
};

/* Same basis matrix in Q11 (e.g. 1448 ~= 0.7071 * 2^11), used for stage 2.  */
const WORD16 gai2_impeg2_idct_q11[] =
{
    1448,  1448,  1448,  1448,  1448,  1448,  1448,  1448,
    2009,  1703,  1138,   400,  -400, -1138, -1703, -2009,
    1892,   784,  -784, -1892, -1892,  -784,   784,  1892,
    1703,  -400, -2009, -1138,  1138,  2009,   400, -1703,
    1448, -1448, -1448,  1448,  1448, -1448, -1448,  1448,
    1138, -2009,   400,  1703, -1703,  -400,  2009, -1138,
     784, -1892,  1892,  -784,  -784,  1892, -1892,   784,
     400, -1138,  1703, -2009,  2009, -1703,  1138,  -400,
};

/* Even/odd coefficient tables below replicate coefficient pairs across a    */
/* full 8-lane vector — a layout presumably consumed by the SIMD/assembly    */
/* IDCT variants (TODO confirm against the a9q/av8/sse42 modules).           */
const WORD16 gai2_impeg2_idct_even_8_q15[][8] =
{
    { 23170,  23170, 23170,  23170, 23170,  23170, 23170,  23170 },
    { 12540, -30274, 12540, -30274, 12540, -30274, 12540, -30274 },
    { 30274,  12540, 30274,  12540, 30274,  12540, 30274,  12540 },
    { 23170, -23170, 23170, -23170, 23170, -23170, 23170, -23170 }
};
const WORD16 gai2_impeg2_idct_odd_8_q15[][8] =
{
    { 32138,  27246, 32138,  27246, 32138,  27246, 32138,  27246 },
    { 18205,   6393, 18205,   6393, 18205,   6393, 18205,   6393 },
    { 27246,  -6393, 27246,  -6393, 27246,  -6393, 27246,  -6393 },
    { 32138,  18205, 32138,  18205, 32138,  18205, 32138,  18205 },
    { 18205, -32138, 18205, -32138, 18205, -32138, 18205, -32138 },
    {  6393,  27246,  6393,  27246,  6393,  27246,  6393,  27246 },
    {  6393, -18205,  6393, -18205,  6393, -18205,  6393, -18205 },
    { 27246, -32138, 27246, -32138, 27246, -32138, 27246, -32138 },
};

const WORD16 gai2_impeg2_idct_even_8_q11[][8] =
{
    { 1448,  1448, 1448,  1448, 1448,  1448, 1448,  1448 },
    {  784, -1892,  784, -1892,  784, -1892,  784, -1892 },
    { 1892,   784, 1892,   784, 1892,   784, 1892,   784 },
    { 1448, -1448, 1448, -1448, 1448, -1448, 1448, -1448 }
};
const WORD16 gai2_impeg2_idct_odd_8_q11[][8] =
{
    { 2009,  1703, 2009,  1703, 2009,  1703, 2009,  1703 },
    { 1138,   400, 1138,   400, 1138,   400, 1138,   400 },
    { 1703,  -400, 1703,  -400, 1703,  -400, 1703,  -400 },
    { 2009,  1138, 2009,  1138, 2009,  1138, 2009,  1138 },
    { 1138, -2009, 1138, -2009, 1138, -2009, 1138, -2009 },
    {  400,  1703,  400,  1703,  400,  1703,  400,  1703 },
    {  400, -1138,  400, -1138,  400, -1138,  400, -1138 },
    { 1703, -2009, 1703, -2009, 1703, -2009, 1703, -2009 },
};



/*****************************************************************************/
/* Last row IDCT Coefficients in Q11 format                                  */
/*****************************************************************************/
const WORD16 gai2_impeg2_idct_last_row_q11[] =
{
    400, -1138, 1703, -2009, 2009, -1703, 1138, -400,
};

/* First column of the Q15 basis (the DC-path coefficients of each row).     */
const WORD16 gai2_impeg2_idct_first_col_q15[] =
{
    23170, 32138, 30274, 27246, 23170, 18205, 12540, 6393,
};

const WORD16 gai2_impeg2_idct_first_col_q11[] =
{
    1448, 2009, 1892, 1703, 1448, 1138, 784, 400,
};

/*****************************************************************************/
/* Output of first stage dct (using gai2_impeg2_idct_q15 as coeffs)          */
/* for a 1D data (0, 0, 0, 0, 0, 0, 0, 1)                                    */
/*****************************************************************************/

/* Used for MPEG-2 IDCT mismatch control (oddification of coefficient 63).   */
const WORD16 gai2_impeg2_mismatch_stg1_outp[] =
{
    2, -4, 7, -8, 8, -7, 4, -2
};

/* Per-pixel additive applied before the stage-2 rounding shift in the       */
/* DC + mismatch reconstruction path (see impeg2_idct_recon_dc_mismatch).    */
const WORD16 gai2_impeg2_mismatch_stg2_additive[] =
{
      800, -2276,   3406,  -4018,   4018,  -3406,  2276,  -800,
    -1600,  4552,  -6812,   8036,  -8036,   6812, -4552,  1600,
     2800, -7966,  11921, -14063,  14063, -11921,  7966, -2800,
    -3200,  9104, -13624,  16072, -16072,  13624, -9104,  3200,
     3200, -9104,  13624, -16072,  16072, -13624,  9104, -3200,
    -2800,  7966, -11921,  14063, -14063,  11921, -7966,  2800,
     1600, -4552,   6812,  -8036,   8036,  -6812,  4552, -1600,
     -800,  2276,  -3406,   4018,  -4018,   3406, -2276,   800,
};


/* 8x8 block of zero samples, usable as an all-zero prediction source.       */
const UWORD8 gau1_impeg2_zerobuf[] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
};
/*****************************************************************************/
/* Tables of offset needed to address block in an MB                         */
/* Indexed by the 8x8 block number within a macroblock.                      */
/*****************************************************************************/
const WORD16 gai2_impeg2_blk_y_off_fld[] = {0,0,1,1};
const WORD16 gai2_impeg2_blk_y_off_frm[] = {0,0,8,8};
const WORD16 gai2_impeg2_blk_x_off[]     = {0,8,0,8};
diff --git a/common/impeg2_globals.h b/common/impeg2_globals.h
new file mode 100755
index 0000000..e8c6865
--- /dev/null
+++ b/common/impeg2_globals.h
@@ -0,0 +1,57 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/* Declarations of the shared constant tables defined in impeg2_globals.c.   */
/* NOTE(review): the __IMPEG2_GLOBALS_H__ guard uses a leading double        */
/* underscore, which is reserved to the implementation in C.                 */
#ifndef __IMPEG2_GLOBALS_H__
#define __IMPEG2_GLOBALS_H__

extern const UWORD8 gau1_impeg2_non_linear_quant_scale[];
extern const UWORD8 gau1_impeg2_intra_quant_matrix_default[];
extern const UWORD8 gau1_impeg2_inter_quant_matrix_default[];
extern const UWORD8 gau1_impeg2_inv_scan_vertical[];
extern const UWORD8 gau1_impeg2_inv_scan_zig_zag[];
extern const UWORD16 gau2_impeg2_frm_rate_code[][2];

extern const UWORD16 gau2_impeg2_chroma_interp_mv[][16];
extern const UWORD16 gau2_impeg2_chroma_interp_inp1[][16];
extern const UWORD16 gau2_impeg2_luma_interp_inp1[];
extern const UWORD16 gau2_impeg2_luma_interp_inp2[];
extern const UWORD16 gau2_impeg2_chroma_interp_inp2[];

extern const WORD16 gai2_impeg2_idct_q15[];
extern const WORD16 gai2_impeg2_idct_q11[];

extern const WORD16 gai2_impeg2_mismatch_stg1_outp[];
extern const WORD16 gai2_impeg2_idct_last_row_q11[];
extern const WORD16 gai2_impeg2_idct_first_col_q15[];
extern const WORD16 gai2_impeg2_idct_first_col_q11[];
extern const WORD16 gai2_impeg2_mismatch_stg2_additive[];

extern const WORD16 gai2_impeg2_blk_y_off_fld[];
extern const WORD16 gai2_impeg2_blk_y_off_frm[];
extern const WORD16 gai2_impeg2_blk_x_off[];

extern const UWORD8 gau1_impeg2_zerobuf[];

extern const WORD16 gai2_impeg2_idct_odd_8_q15[8][8];
extern const WORD16 gai2_impeg2_idct_odd_8_q11[8][8];

extern const WORD16 gai2_impeg2_idct_even_8_q11[4][8];
extern const WORD16 gai2_impeg2_idct_even_8_q15[4][8];

#endif /* __IMPEG2_GLOBALS_H__ */
diff --git a/common/impeg2_idct.c b/common/impeg2_idct.c
new file mode 100644
index 0000000..6834260
--- /dev/null
+++ b/common/impeg2_idct.c
@@ -0,0 +1,500 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore
*/
/*****************************************************************************/
/*                                                                           */
/*  File Name         : impeg2_idct.c                                        */
/*                                                                           */
/*  Description       : Contains 2d idct and inverse quantization functions  */
/*                                                                           */
/*  List of Functions : impeg2_idct_recon_dc()                               */
/*                      impeg2_idct_recon_dc_mismatch()                      */
/*                      impeg2_idct_recon()                                  */
/*                                                                           */
/*  Issues / Problems : None                                                 */
/*                                                                           */
/*  Revision History  :                                                      */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         10 09 2005   Harish M        First Version                        */
/*                                                                           */
/*****************************************************************************/
/*
 IEEE - 1180 results for this IDCT
 L                            256        256       5     5    300    300   384   384    Thresholds
 H                            255        255       5     5    300    300   383   383
 sign                           1         -1       1    -1      1     -1     1    -1
 Peak Error                     1          1       1     1      1      1     1     1    1
 Peak Mean Square Error    0.0191     0.0188  0.0108 0.0111 0.0176 0.0188 0.0165 0.0177 0.06
 Overall Mean Square Error 0.01566406 0.01597656 0.0091875 0.00908906 0.01499063 0.01533281 0.01432344 0.01412344 0.02
 Peak Mean Error            0.0027     0.0026  0.0028  0.002 0.0017 0.0033 0.0031 0.0025 0.015
 Overall Mean Error      0.00002656 -0.00031406 0.00016875 0.00005469 -0.00003125 0.00011406 0.00009219 0.00004219 0.0015
 */
#include <stdio.h>
#include <string.h>

#include "iv_datatypedef.h"
#include "iv.h"
#include "impeg2_defs.h"
#include "impeg2_platform_macros.h"

#include "impeg2_macros.h"
#include "impeg2_globals.h"
#include "impeg2_idct.h"


/* Reconstructs an 8x8 block when only the DC coefficient is non-zero.       */
/* The two-stage scaled IDCT of a DC-only block is one constant value, so it */
/* is computed once and added to every prediction sample.  pi2_tmp,          */
/* i4_src_strd, i4_zero_cols and i4_zero_rows exist only to match the        */
/* pf_idct_recon_t signature and are unused here.                            */
void impeg2_idct_recon_dc(WORD16 *pi2_src,
                          WORD16 *pi2_tmp,
                          UWORD8 *pu1_pred,
                          UWORD8 *pu1_dst,
                          WORD32 i4_src_strd,
                          WORD32 i4_pred_strd,
                          WORD32 i4_dst_strd,
                          WORD32 i4_zero_cols,
                          WORD32 i4_zero_rows)
{
    WORD32 i4_val, i, j;

    UNUSED(pi2_tmp);
    UNUSED(i4_src_strd);
    UNUSED(i4_zero_cols);
    UNUSED(i4_zero_rows);

    /* Stage 1: scale the DC term by the Q15 basis coefficient and round.    */
    i4_val = pi2_src[0] * gai2_impeg2_idct_q15[0];
    i4_val = ((i4_val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);
    /* Stage 2: same scaling in Q11, then round down to pixel precision.     */
    i4_val = i4_val * gai2_impeg2_idct_q11[0];
    i4_val = ((i4_val + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT);

    /* Add the constant residual to the prediction and clip to [0, 255].     */
    for(i = 0; i < TRANS_SIZE_8; i++)
    {
        for(j = 0; j < TRANS_SIZE_8; j++)
        {
            pu1_dst[j] = CLIP_U8(i4_val + pu1_pred[j]);
        }
        pu1_dst += i4_dst_strd;
        pu1_pred += i4_pred_strd;
    }
}

/* DC-only reconstruction combined with MPEG-2 mismatch control: before the  */
/* stage-2 rounding shift, a per-pixel additive from                         */
/* gai2_impeg2_mismatch_stg2_additive is folded into the DC value.           */
/* pi2_tmp, i4_src_strd, i4_zero_cols and i4_zero_rows are unused; they only */
/* match the pf_idct_recon_t signature.                                      */
void impeg2_idct_recon_dc_mismatch(WORD16 *pi2_src,
                                   WORD16 *pi2_tmp,
                                   UWORD8 *pu1_pred,
                                   UWORD8 *pu1_dst,
                                   WORD32 i4_src_strd,
                                   WORD32 i4_pred_strd,
                                   WORD32 i4_dst_strd,
                                   WORD32 i4_zero_cols,
                                   WORD32 i4_zero_rows)

{
    WORD32 i4_val, i, j;
    WORD32 i4_count = 0;        /* linear index into the 8x8 additive table */
    WORD32 i4_sum;

    UNUSED(pi2_tmp);
    UNUSED(i4_src_strd);
    UNUSED(i4_zero_cols);
    UNUSED(i4_zero_rows);

    /* Stage 1 as in impeg2_idct_recon_dc; stage-2 rounding happens per      */
    /* pixel below so the mismatch additive can be included first.           */
    i4_val = pi2_src[0] * gai2_impeg2_idct_q15[0];
    i4_val = ((i4_val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);

    i4_val *= gai2_impeg2_idct_q11[0];
    for(i = 0; i < TRANS_SIZE_8; i++)
    {
        for (j = 0; j < TRANS_SIZE_8; j++)
        {
            i4_sum = i4_val;
            i4_sum += gai2_impeg2_mismatch_stg2_additive[i4_count];
            i4_sum = ((i4_sum + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT);
            i4_sum += pu1_pred[j];
            pu1_dst[j] = CLIP_U8(i4_sum);
            i4_count++;
        }

        pu1_dst += i4_dst_strd;
        pu1_pred += i4_pred_strd;
    }

}
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs Inverse transform and reconstruction for 8x8
 *  input block
 *
 * @par Description:
 *  Performs inverse transform and adds the prediction data and clips output
 *  to 8 bit
 *
 * @param[in] pi2_src
 *  Input 8x8 coefficients
 *
 * @param[in] pi2_tmp
 *  Temporary 8x8 buffer for storing inverse transform 1st stage output
 *
 * @param[in] pu1_pred
 *  Prediction 8x8 block
 *
 * @param[out] pu1_dst
 *  Output 8x8 block
 *
 * @param[in] src_strd
 *  Input stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output Stride
 *
 * @param[in] zero_cols
 *  Zero columns in pi2_src (bit n set => column n is all zero)
 *
 * @param[in] zero_rows
 *  Zero rows in pi2_src (bit n set => row n is all zero)
 *
 * @returns  Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void impeg2_idct_recon(WORD16 *pi2_src,
                       WORD16 *pi2_tmp,
                       UWORD8 *pu1_pred,
                       UWORD8 *pu1_dst,
                       WORD32 i4_src_strd,
                       WORD32 i4_pred_strd,
                       WORD32 i4_dst_strd,
                       WORD32 i4_zero_cols,
                       WORD32 i4_zero_rows)
{
    WORD32 j, k;
    WORD32 ai4_e[4], ai4_o[4];      /* even/odd butterfly partial sums */
    WORD32 ai4_ee[2], ai4_eo[2];    /* even-even / even-odd partial sums */
    WORD32 i4_add;                  /* rounding constant for current stage */
    WORD32 i4_shift;                /* down-shift for current stage */
    WORD16 *pi2_tmp_orig;
    WORD32 i4_trans_size;
    /* Columns of pi2_src become rows of pi2_tmp after stage 1, so the       */
    /* zero-column mask doubles as the zero-row mask of the 2nd stage.       */
    WORD32 i4_zero_rows_2nd_stage = i4_zero_cols;
    /* NOTE(review): despite its name this limits the STAGE-1 column loop;   */
    /* columns known to be zero need no stage-1 transform.                   */
    WORD32 i4_row_limit_2nd_stage;

    i4_trans_size = TRANS_SIZE_8;

    pi2_tmp_orig = pi2_tmp;

    /* If columns 4..7 are all zero, only the first 4 columns need stage 1.  */
    if((i4_zero_cols & 0xF0) == 0xF0)
        i4_row_limit_2nd_stage = 4;
    else
        i4_row_limit_2nd_stage = TRANS_SIZE_8;


    if((i4_zero_rows & 0xF0) == 0xF0) /* First 4 rows of input are non-zero */
    {
        /************************************************************************************************/
        /**********************************START - IT_RECON_8x8******************************************/
        /* Rows 4..7 of the input are zero, so stage 1 only accumulates       */
        /* contributions from rows 0..3.                                      */
        /************************************************************************************************/

        /* Inverse Transform 1st stage */
        i4_shift = IDCT_STG1_SHIFT;
        i4_add = 1 << (i4_shift - 1);

        for(j = 0; j < i4_row_limit_2nd_stage; j++)
        {
            /* Checking for Zero Cols */
            if((i4_zero_cols & 1) == 1)
            {
                memset(pi2_tmp, 0, i4_trans_size * sizeof(WORD16));
            }
            else
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q15[1 * 8 + k] * pi2_src[i4_src_strd]
                                    + gai2_impeg2_idct_q15[3 * 8 + k]
                                                    * pi2_src[3 * i4_src_strd];
                }
                ai4_eo[0] = gai2_impeg2_idct_q15[2 * 8 + 0] * pi2_src[2 * i4_src_strd];
                ai4_eo[1] = gai2_impeg2_idct_q15[2 * 8 + 1] * pi2_src[2 * i4_src_strd];
                ai4_ee[0] = gai2_impeg2_idct_q15[0 * 8 + 0] * pi2_src[0];
                ai4_ee[1] = gai2_impeg2_idct_q15[0 * 8 + 1] * pi2_src[0];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    pi2_tmp[k] =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pi2_tmp[k + 4] =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                }
            }
            pi2_src++;
            pi2_tmp += i4_trans_size;
            i4_zero_cols = i4_zero_cols >> 1;   /* next bit = next column */
        }

        pi2_tmp = pi2_tmp_orig;

        /* Inverse Transform 2nd stage */
        i4_shift = IDCT_STG2_SHIFT;
        i4_add = 1 << (i4_shift - 1);
        if((i4_zero_rows_2nd_stage & 0xF0) == 0xF0) /* First 4 rows of output of 1st stage are non-zero */
        {
            for(j = 0; j < i4_trans_size; j++)
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q11[1 * 8 + k] * pi2_tmp[i4_trans_size]
                                    + gai2_impeg2_idct_q11[3 * 8 + k] * pi2_tmp[3 * i4_trans_size];
                }
                ai4_eo[0] = gai2_impeg2_idct_q11[2 * 8 + 0] * pi2_tmp[2 * i4_trans_size];
                ai4_eo[1] = gai2_impeg2_idct_q11[2 * 8 + 1] * pi2_tmp[2 * i4_trans_size];
                ai4_ee[0] = gai2_impeg2_idct_q11[0 * 8 + 0] * pi2_tmp[0];
                ai4_ee[1] = gai2_impeg2_idct_q11[0 * 8 + 1] * pi2_tmp[0];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    WORD32 itrans_out;
                    itrans_out =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
                    itrans_out =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
                }
                pi2_tmp++;
                pu1_pred += i4_pred_strd;
                pu1_dst += i4_dst_strd;
            }
        }
        else /* All rows of output of 1st stage are non-zero */
        {
            for(j = 0; j < i4_trans_size; j++)
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q11[1 * 8 + k] * pi2_tmp[i4_trans_size]
                                    + gai2_impeg2_idct_q11[3 * 8 + k]
                                                    * pi2_tmp[3 * i4_trans_size]
                                    + gai2_impeg2_idct_q11[5 * 8 + k]
                                                    * pi2_tmp[5 * i4_trans_size]
                                    + gai2_impeg2_idct_q11[7 * 8 + k]
                                                    * pi2_tmp[7 * i4_trans_size];
                }

                ai4_eo[0] = gai2_impeg2_idct_q11[2 * 8 + 0] * pi2_tmp[2 * i4_trans_size]
                                + gai2_impeg2_idct_q11[6 * 8 + 0] * pi2_tmp[6 * i4_trans_size];
                ai4_eo[1] = gai2_impeg2_idct_q11[2 * 8 + 1] * pi2_tmp[2 * i4_trans_size]
                                + gai2_impeg2_idct_q11[6 * 8 + 1] * pi2_tmp[6 * i4_trans_size];
                ai4_ee[0] = gai2_impeg2_idct_q11[0 * 8 + 0] * pi2_tmp[0]
                                + gai2_impeg2_idct_q11[4 * 8 + 0] * pi2_tmp[4 * i4_trans_size];
                ai4_ee[1] = gai2_impeg2_idct_q11[0 * 8 + 1] * pi2_tmp[0]
                                + gai2_impeg2_idct_q11[4 * 8 + 1] * pi2_tmp[4 * i4_trans_size];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    WORD32 itrans_out;
                    itrans_out =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
                    itrans_out =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
                }
                pi2_tmp++;
                pu1_pred += i4_pred_strd;
                pu1_dst += i4_dst_strd;
            }
        }
        /************************************************************************************************/
        /************************************END - IT_RECON_8x8******************************************/
        /************************************************************************************************/
    }
    else /* All rows of input are non-zero */
    {
        /************************************************************************************************/
        /**********************************START - IT_RECON_8x8******************************************/
        /* Full 8-row stage 1: odd terms from rows 1,3,5,7; even terms from   */
        /* rows 0,2,4,6.                                                      */
        /************************************************************************************************/

        /* Inverse Transform 1st stage */
        i4_shift = IDCT_STG1_SHIFT;
        i4_add = 1 << (i4_shift - 1);

        for(j = 0; j < i4_row_limit_2nd_stage; j++)
        {
            /* Checking for Zero Cols */
            if((i4_zero_cols & 1) == 1)
            {
                memset(pi2_tmp, 0, i4_trans_size * sizeof(WORD16));
            }
            else
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q15[1 * 8 + k] * pi2_src[i4_src_strd]
                                    + gai2_impeg2_idct_q15[3 * 8 + k]
                                                    * pi2_src[3 * i4_src_strd]
                                    + gai2_impeg2_idct_q15[5 * 8 + k]
                                                    * pi2_src[5 * i4_src_strd]
                                    + gai2_impeg2_idct_q15[7 * 8 + k]
                                                    * pi2_src[7 * i4_src_strd];
                }

                ai4_eo[0] = gai2_impeg2_idct_q15[2 * 8 + 0] * pi2_src[2 * i4_src_strd]
                                + gai2_impeg2_idct_q15[6 * 8 + 0] * pi2_src[6 * i4_src_strd];
                ai4_eo[1] = gai2_impeg2_idct_q15[2 * 8 + 1] * pi2_src[2 * i4_src_strd]
                                + gai2_impeg2_idct_q15[6 * 8 + 1] * pi2_src[6 * i4_src_strd];
                ai4_ee[0] = gai2_impeg2_idct_q15[0 * 8 + 0] * pi2_src[0]
                                + gai2_impeg2_idct_q15[4 * 8 + 0] * pi2_src[4 * i4_src_strd];
                ai4_ee[1] = gai2_impeg2_idct_q15[0 * 8 + 1] * pi2_src[0]
                                + gai2_impeg2_idct_q15[4 * 8 + 1] * pi2_src[4 * i4_src_strd];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    pi2_tmp[k] =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pi2_tmp[k + 4] =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                }
            }
            pi2_src++;
            pi2_tmp += i4_trans_size;
            i4_zero_cols = i4_zero_cols >> 1;   /* next bit = next column */
        }

        pi2_tmp = pi2_tmp_orig;

        /* Inverse Transform 2nd stage */
        i4_shift = IDCT_STG2_SHIFT;
        i4_add = 1 << (i4_shift - 1);
        if((i4_zero_rows_2nd_stage & 0xF0) == 0xF0) /* First 4 rows of output of 1st stage are non-zero */
        {
            for(j = 0; j < i4_trans_size; j++)
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q11[1 * 8 + k] * pi2_tmp[i4_trans_size]
                                    + gai2_impeg2_idct_q11[3 * 8 + k] * pi2_tmp[3 * i4_trans_size];
                }
                ai4_eo[0] = gai2_impeg2_idct_q11[2 * 8 + 0] * pi2_tmp[2 * i4_trans_size];
                ai4_eo[1] = gai2_impeg2_idct_q11[2 * 8 + 1] * pi2_tmp[2 * i4_trans_size];
                ai4_ee[0] = gai2_impeg2_idct_q11[0 * 8 + 0] * pi2_tmp[0];
                ai4_ee[1] = gai2_impeg2_idct_q11[0 * 8 + 1] * pi2_tmp[0];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    WORD32 itrans_out;
                    itrans_out =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
                    itrans_out =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
                }
                pi2_tmp++;
                pu1_pred += i4_pred_strd;
                pu1_dst += i4_dst_strd;
            }
        }
        else /* All rows of output of 1st stage are non-zero */
        {
            for(j = 0; j < i4_trans_size; j++)
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q11[1 * 8 + k] * pi2_tmp[i4_trans_size]
                                    + gai2_impeg2_idct_q11[3 * 8 + k]
                                                    * pi2_tmp[3 * i4_trans_size]
                                    + gai2_impeg2_idct_q11[5 * 8 + k]
                                                    * pi2_tmp[5 * i4_trans_size]
                                    + gai2_impeg2_idct_q11[7 * 8 + k]
                                                    * pi2_tmp[7 * i4_trans_size];
                }

                ai4_eo[0] = gai2_impeg2_idct_q11[2 * 8 + 0] * pi2_tmp[2 * i4_trans_size]
                                + gai2_impeg2_idct_q11[6 * 8 + 0] * pi2_tmp[6 * i4_trans_size];
                ai4_eo[1] = gai2_impeg2_idct_q11[2 * 8 + 1] * pi2_tmp[2 * i4_trans_size]
                                + gai2_impeg2_idct_q11[6 * 8 + 1] * pi2_tmp[6 * i4_trans_size];
                ai4_ee[0] = gai2_impeg2_idct_q11[0 * 8 + 0] * pi2_tmp[0]
                                + gai2_impeg2_idct_q11[4 * 8 + 0] * pi2_tmp[4 * i4_trans_size];
                ai4_ee[1] = gai2_impeg2_idct_q11[0 * 8 + 1] * pi2_tmp[0]
                                + gai2_impeg2_idct_q11[4 * 8 + 1] * pi2_tmp[4 * i4_trans_size];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    WORD32 itrans_out;
                    itrans_out =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
                    itrans_out =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
                }
                pi2_tmp++;
                pu1_pred += i4_pred_strd;
                pu1_dst += i4_dst_strd;
            }
        }
        /************************************************************************************************/
        /************************************END - IT_RECON_8x8******************************************/
        /************************************************************************************************/
    }
}

diff --git a/common/impeg2_idct.h b/common/impeg2_idct.h
new file mode 100644
index 0000000..80defde
--- /dev/null
+++ b/common/impeg2_idct.h
@@ -0,0 +1,66 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
#ifndef __IMPEG2_IDCT_H__
#define __IMPEG2_IDCT_H__


/*****************************************************************************/
/* Function Declarations                                                     */
/*****************************************************************************/

/* Common signature shared by the C, ARM (a9q/av8) and x86 (sse42) IDCT +    */
/* reconstruction variants, so implementations can be selected at runtime    */
/* through a function pointer.                                               */
typedef void pf_idct_recon_t(WORD16 *pi2_src,
                             WORD16 *pi2_tmp,
                             UWORD8 *pu1_pred,
                             UWORD8 *pu1_dst,
                             WORD32 src_strd,
                             WORD32 pred_strd,
                             WORD32 dst_strd,
                             WORD32 zero_cols,
                             WORD32 zero_rows);

/* ARM assembly modules currently ignore non_zero_cols argument */
pf_idct_recon_t impeg2_idct_recon_dc;

pf_idct_recon_t impeg2_idct_recon_dc_mismatch;

pf_idct_recon_t impeg2_idct_recon;


pf_idct_recon_t impeg2_idct_recon_dc_a9q;

pf_idct_recon_t impeg2_idct_recon_dc_mismatch_a9q;

pf_idct_recon_t impeg2_idct_recon_a9q;


pf_idct_recon_t impeg2_idct_recon_dc_av8;

pf_idct_recon_t impeg2_idct_recon_dc_mismatch_av8;

pf_idct_recon_t impeg2_idct_recon_av8;

pf_idct_recon_t impeg2_idct_recon_sse42;

pf_idct_recon_t impeg2_idct_recon_dc_mismatch_sse42;

pf_idct_recon_t impeg2_idct_recon_dc_sse42;

#endif /* #ifndef __IMPEG2_IDCT_H__ */

diff --git a/common/impeg2_inter_pred.c b/common/impeg2_inter_pred.c
new file mode 100644
index 0000000..019fa5c
--- /dev/null
+++ b/common/impeg2_inter_pred.c
@@ -0,0 +1,467 @@
/******************************************************************************
 *
 * Copyright
(C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* impeg2d_mcu.c
*
* NOTE(review): @file name does not match this file (impeg2_inter_pred.c).
*
* @brief
* Contains MC function definitions for MPEG2 decoder
*
* @author
* Harish
*
* @par List of Functions:
* - impeg2_copy_mb()
* - impeg2_interpolate()
* - impeg2_mc_halfx_halfy_8x8()
* - impeg2_mc_halfx_fully_8x8()
* - impeg2_mc_fullx_halfy_8x8()
* - impeg2_mc_fullx_fully_8x8()
*
* @remarks
* None
*
*******************************************************************************
*/

#include <stdio.h>
#include <string.h>
#include "iv_datatypedef.h"
#include "iv.h"
#include "impeg2_buf_mgr.h"
#include "impeg2_disp_mgr.h"
#include "impeg2_defs.h"
#include "impeg2_platform_macros.h"

#include "impeg2_inter_pred.h"
#include "impeg2_globals.h"
#include "impeg2_macros.h"
#include "impeg2_idct.h"

/*******************************************************************************
*  Function Name   : impeg2_copy_mb
*
*  Description     : copies 3 components (one MB_SIZE x MB_SIZE luma block and
*                    two half-sized chroma blocks) from mc_buf to the frame.
*
*  NOTE(review)    : the argument list below is stale — the function actually
*                    takes (ps_src_buf, ps_dst_buf, u4_src_wd, u4_dst_wd).
*
*  Arguments       :
*  src_buf         : Source Buffer
*  dst_buf         : Destination Buffer
*  src_offset_x    : X offset for source
*  src_offset_y    : Y offset for source
*  dst_offset_x    : X offset for destination
*  dst_offset_y    : Y offset for destination
*  src_wd          : Source Width
*  dst_wd          : destination Width
*  rows            : Number of rows
*  cols            : Number of columns
*
*  Values Returned : None
*******************************************************************************/
void impeg2_copy_mb(yuv_buf_t *ps_src_buf,
                    yuv_buf_t *ps_dst_buf,
                    UWORD32 u4_src_wd,
                    UWORD32 u4_dst_wd)
{
    UWORD8 *pu1_src;
    UWORD8 *pu1_dst;
    UWORD32 i;
    UWORD32 u4_rows = MB_SIZE;
    UWORD32 u4_cols = MB_SIZE;

    /*******************************************************/
    /* copy Y                                              */
    /*******************************************************/
    pu1_src = ps_src_buf->pu1_y;
    pu1_dst = ps_dst_buf->pu1_y;
    for(i = 0; i < u4_rows; i++)
    {
        memcpy(pu1_dst, pu1_src, u4_cols);
        pu1_src += u4_src_wd;
        pu1_dst += u4_dst_wd;
    }

    /* Chroma planes are half the luma dimensions (4:2:0). */
    u4_src_wd >>= 1;
    u4_dst_wd >>= 1;
    u4_rows >>= 1;
    u4_cols >>= 1;

    /*******************************************************/
    /* copy U                                              */
    /*******************************************************/
    pu1_src = ps_src_buf->pu1_u;
    pu1_dst = ps_dst_buf->pu1_u;
    for(i = 0; i < u4_rows; i++)
    {
        memcpy(pu1_dst, pu1_src, u4_cols);

        pu1_src += u4_src_wd;
        pu1_dst += u4_dst_wd;
    }
    /*******************************************************/
    /* copy V                                              */
    /*******************************************************/
    pu1_src = ps_src_buf->pu1_v;
    pu1_dst = ps_dst_buf->pu1_v;
    for(i = 0; i < u4_rows; i++)
    {
        memcpy(pu1_dst, pu1_src, u4_cols);

        pu1_src += u4_src_wd;
        pu1_dst += u4_dst_wd;
    }

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name     : impeg2_interpolate                                   */
/*                                                                           */
/*  Description       : averages the contents of buf_src1 and buf_src2 and   */
/*                      stores result in buf_dst                             */
/*                                                                           */
/*  Inputs            : buf_src1 - First Source                              */
/*                      buf_src2 - Second Source                             */
/*                                                                           */
/*  Globals           : None                                                 */
/*                                                                           */
/*  Processing        : Avg the values from two sources and store the result */
/*                      in destination
buffer */ +/* */ +/* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */ +/* */ +/* Returns : None */ +/* */ +/* Issues : Assumes that all 3 buffers are of same size */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 14 09 2005 Harish M First Version */ +/* 15 09 2010 Venkat Added stride */ +/* */ +/*****************************************************************************/ +void impeg2_interpolate(yuv_buf_t *ps_buf_src1, + yuv_buf_t *ps_buf_src2, + yuv_buf_t *ps_buf_dst, + UWORD32 u4_stride) +{ + + UWORD32 i,j; + UWORD8 *pu1_src1,*pu1_src2,*pu1_dst; + pu1_src1 = ps_buf_src1->pu1_y; + pu1_src2 = ps_buf_src2->pu1_y; + pu1_dst = ps_buf_dst->pu1_y; + for(i = MB_SIZE; i > 0; i--) + { + for(j = MB_SIZE; j > 0; j--) + { + *pu1_dst++ = ((*pu1_src1++) + (*pu1_src2++) + 1) >> 1; + } + + pu1_dst += u4_stride - MB_SIZE; + + } + + u4_stride >>= 1; + + pu1_src1 = ps_buf_src1->pu1_u; + pu1_src2 = ps_buf_src2->pu1_u; + pu1_dst = ps_buf_dst->pu1_u; + for(i = MB_CHROMA_SIZE; i > 0 ; i--) + { + for(j = MB_CHROMA_SIZE; j > 0; j--) + { + *pu1_dst++ = ((*pu1_src1++) + (*pu1_src2++) + 1) >> 1; + } + + pu1_dst += u4_stride - MB_CHROMA_SIZE; + } + + pu1_src1 = ps_buf_src1->pu1_v; + pu1_src2 = ps_buf_src2->pu1_v; + pu1_dst = ps_buf_dst->pu1_v; + for(i = MB_CHROMA_SIZE; i > 0 ; i--) + { + for(j = MB_CHROMA_SIZE; j > 0; j--) + { + *pu1_dst++ = ((*pu1_src1++) + (*pu1_src2++) + 1) >> 1; + } + + pu1_dst += u4_stride - MB_CHROMA_SIZE; + } + +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_halfx_halfy_8x8() */ +/* */ +/* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. 
*/ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */ +/* the ref frame.Interpolate these four values to get the */ +/* value at(0.5,0.5).Repeat this to get an 8 x 8 block */ +/* using 9 x 9 block from reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 05 09 2005 Harish M First Version */ +/* */ +/*****************************************************************************/ +void impeg2_mc_halfx_halfy_8x8(UWORD8 *pu1_out, + UWORD8 *pu1_ref, + UWORD32 u4_ref_wid, + UWORD32 u4_out_wid) +{ + UWORD8 *pu1_ref_p0,*pu1_ref_p1,*pu1_ref_p2,*pu1_ref_p3; + UWORD32 i,j; + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 P1 + Q + P2 P3 + */ + + pu1_ref_p0 = pu1_ref; + pu1_ref_p1 = pu1_ref + 1; + pu1_ref_p2 = pu1_ref + u4_ref_wid; + pu1_ref_p3 = pu1_ref + u4_ref_wid + 1; + + for(i = 0; i < BLK_SIZE; i++) + { + for(j = 0; j < BLK_SIZE; j++) + { + *pu1_out++ = (( (*pu1_ref_p0++ ) + + (*pu1_ref_p1++ ) + + (*pu1_ref_p2++ ) + + (*pu1_ref_p3++ ) + 2 ) >> 2); + } + pu1_ref_p0 += u4_ref_wid - BLK_SIZE; + pu1_ref_p1 += u4_ref_wid - BLK_SIZE; + pu1_ref_p2 += u4_ref_wid - BLK_SIZE; + pu1_ref_p3 += u4_ref_wid - BLK_SIZE; + + pu1_out += u4_out_wid - BLK_SIZE; + } + return; +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_halfx_fully_8x8() */ +/* */ +/* Description : Gets the buffer from (0.5,0) to (8.5,8) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will 
be */ +/* block will be extracted. */ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) and (1,0) position in the ref frame */ +/* Interpolate these two values to get the value at(0.5,0) */ +/* Repeat this to get an 8 x 8 block using 9 x 8 block from */ +/* reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 05 09 2005 Harish M First Version */ +/* */ +/*****************************************************************************/ +void impeg2_mc_halfx_fully_8x8(UWORD8 *pu1_out, + UWORD8 *pu1_ref, + UWORD32 u4_ref_wid, + UWORD32 u4_out_wid) +{ + UWORD8 *pu1_ref_p0, *pu1_ref_p1; + UWORD32 i,j; + + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 Q P1 + */ + + pu1_ref_p0 = pu1_ref; + pu1_ref_p1 = pu1_ref + 1; + + for(i = 0; i < BLK_SIZE; i++) + { + for(j = 0; j < BLK_SIZE; j++) + { + *pu1_out++ = ((( *pu1_ref_p0++ ) + + (*pu1_ref_p1++) + 1 ) >> 1); + } + pu1_ref_p0 += u4_ref_wid - BLK_SIZE; + pu1_ref_p1 += u4_ref_wid - BLK_SIZE; + + pu1_out += u4_out_wid - BLK_SIZE; + } + return; +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_fullx_halfy_8x8() */ +/* */ +/* Description : Gets the buffer from (0,0.5) to (8,8.5) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. 
*/ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) and (0,1) position in the ref frame */ +/* Interpolate these two values to get the value at(0,0.5) */ +/* Repeat this to get an 8 x 8 block using 8 x 9 block from */ +/* reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 05 09 2005 Harish M First Version */ +/* */ +/*****************************************************************************/ +void impeg2_mc_fullx_halfy_8x8(UWORD8 *pu1_out, + UWORD8 *pu1_ref, + UWORD32 u4_ref_wid, + UWORD32 u4_out_wid) +{ + + UWORD8 *pu1_ref_p0, *pu1_ref_p1; + UWORD32 i,j; + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 + x + P1 + */ + pu1_ref_p0 = pu1_ref; + pu1_ref_p1 = pu1_ref + u4_ref_wid; + + for(i = 0; i < BLK_SIZE; i++) + { + for(j = 0; j < BLK_SIZE; j++) + { + *pu1_out++ = ((( *pu1_ref_p0++) + + (*pu1_ref_p1++) + 1 ) >> 1); + } + pu1_ref_p0 += u4_ref_wid - BLK_SIZE; + pu1_ref_p1 += u4_ref_wid - BLK_SIZE; + + pu1_out += u4_out_wid - BLK_SIZE; + } + + return; +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_fullx_fully_8x8() */ +/* */ +/* Description : Gets the buffer from (x,y) to (x+8,y+8) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. 
*/ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) position in the ref frame */ +/* Get an 8 x 8 block from reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 05 09 2005 Harish M First Version */ +/* */ +/*****************************************************************************/ +void impeg2_mc_fullx_fully_8x8(UWORD8 *pu1_out, + UWORD8 *pu1_ref, + UWORD32 u4_ref_wid, + UWORD32 u4_out_wid) +{ + + UWORD32 i; + + for(i = 0; i < BLK_SIZE; i++) + { + memcpy(pu1_out, pu1_ref, BLK_SIZE); + pu1_ref += u4_ref_wid; + pu1_out += u4_out_wid; + } + return; +} diff --git a/common/impeg2_inter_pred.h b/common/impeg2_inter_pred.h new file mode 100644 index 0000000..be3b0e5 --- /dev/null +++ b/common/impeg2_inter_pred.h @@ -0,0 +1,103 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +#ifndef __IMPEG2_INTER_PRED_H__ +#define __IMPEG2_INTER_PRED_H__ + + +typedef struct +{ + UWORD8 *pu1_y; + UWORD8 *pu1_u; + UWORD8 *pu1_v; +}yuv_buf_t; + +typedef struct +{ + WORD16 *pi2_y; + WORD16 *pi2_u; + WORD16 *pi2_v; +}yuv_buf16_t; + +/** + * Picture buffer + */ +typedef struct +{ + UWORD8 *pu1_y; + UWORD8 *pu1_u; + UWORD8 *pu1_v; + + /** Used to store display Timestamp for current buffer */ + WORD32 u4_ts; + UWORD8 u1_used_as_ref; + + /** + * buffer ID from buffer manager + */ + WORD32 i4_buf_id; + +}pic_buf_t; + +typedef void pf_copy_mb_t (yuv_buf_t *src_buf, + yuv_buf_t *dst_buf, + UWORD32 src_wd, + UWORD32 dst_wd); + +typedef void pf_interpred_t(UWORD8 *out,UWORD8 *ref, UWORD32 ref_wid, UWORD32 out_wid); + +typedef void pf_interpolate_t(yuv_buf_t *buf_src1, + yuv_buf_t *buf_src2, + yuv_buf_t *buf_dst, + UWORD32 stride); + +pf_interpolate_t impeg2_interpolate; +pf_interpolate_t impeg2_interpolate_a9q; +pf_interpolate_t impeg2_interpolate_av8; + +pf_copy_mb_t impeg2_copy_mb; +pf_copy_mb_t impeg2_copy_mb_a9q; +pf_copy_mb_t impeg2_copy_mb_av8; + +pf_interpred_t impeg2_mc_halfx_halfy_8x8; +pf_interpred_t impeg2_mc_halfx_fully_8x8; +pf_interpred_t impeg2_mc_fullx_halfy_8x8; +pf_interpred_t impeg2_mc_fullx_fully_8x8; + +pf_interpred_t impeg2_mc_halfx_halfy_8x8_a9q; +pf_interpred_t impeg2_mc_halfx_fully_8x8_a9q; +pf_interpred_t impeg2_mc_fullx_halfy_8x8_a9q; +pf_interpred_t impeg2_mc_fullx_fully_8x8_a9q; + +/* AV8 Declarations */ +pf_interpred_t impeg2_mc_halfx_halfy_8x8_av8; +pf_interpred_t impeg2_mc_halfx_fully_8x8_av8; +pf_interpred_t impeg2_mc_fullx_halfy_8x8_av8; +pf_interpred_t impeg2_mc_fullx_fully_8x8_av8; + + +/* SSE4.2 Declarations*/ +pf_copy_mb_t impeg2_copy_mb_sse42; +pf_interpolate_t impeg2_interpolate_sse42; +pf_interpred_t impeg2_mc_halfx_halfy_8x8_sse42; +pf_interpred_t impeg2_mc_halfx_fully_8x8_sse42; +pf_interpred_t impeg2_mc_fullx_halfy_8x8_sse42; +pf_interpred_t impeg2_mc_fullx_fully_8x8_sse42; + +#endif /* #ifndef 
__IMPEG2_INTER_PRED_H__ */ diff --git a/common/impeg2_job_queue.c b/common/impeg2_job_queue.c new file mode 100644 index 0000000..d36ce7c --- /dev/null +++ b/common/impeg2_job_queue.c @@ -0,0 +1,530 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* impeg2d_job_queue.c +* +* @brief +* Contains functions for job queue +* +* @author +* Harish +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include "iv_datatypedef.h" +#include "iv.h" +#include "ithread.h" +#include "impeg2_macros.h" +#include "impeg2_job_queue.h" + +/** +******************************************************************************* +* +* @brief Returns size for job queue context. 
Does not include job queue buffer +* requirements +* +* @par Description +* Returns size for job queue context. Does not include job queue buffer +* requirements. Buffer size required to store the jobs should be allocated in +* addition to the value returned here. +* +* @returns Size of the job queue context +* +* @remarks +* +******************************************************************************* +*/ +WORD32 impeg2_jobq_ctxt_size() +{ + WORD32 i4_size; + i4_size = sizeof(jobq_t); + i4_size += ithread_get_mutex_lock_size(); + return i4_size; +} + +/** +******************************************************************************* +* +* @brief +* Locks the jobq conext +* +* @par Description +* Locks the jobq conext by calling ithread_mutex_lock() +* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if mutex lock fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_lock(jobq_t *ps_jobq) +{ + WORD32 i4_ret_val; + i4_ret_val = ithread_mutex_lock(ps_jobq->pv_mutex); + if(i4_ret_val) + { + return IV_FAIL; + } + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Unlocks the jobq conext +* +* @par Description +* Unlocks the jobq conext by calling ithread_mutex_unlock() +* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if mutex unlock fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ + +IV_API_CALL_STATUS_T impeg2_jobq_unlock(jobq_t *ps_jobq) +{ + WORD32 i4_ret_val; + i4_ret_val = ithread_mutex_unlock(ps_jobq->pv_mutex); + if(i4_ret_val) + { + return IV_FAIL; + } + return IV_SUCCESS; + +} +/** +******************************************************************************* +* +* @brief +* Yeilds the thread +* +* @par Description +* Unlocks the jobq conext by calling +* 
impeg2_jobq_unlock(), ithread_yield() and then impeg2_jobq_lock() +* jobq is unlocked before to ensure the jobq can be accessed by other threads +* If unlock is not done before calling yield then no other thread can access +* the jobq functions and update jobq. +* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if mutex lock unlock or yield fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_yield(jobq_t *ps_jobq) +{ + + IV_API_CALL_STATUS_T e_ret = IV_SUCCESS; + + IV_API_CALL_STATUS_T e_ret_tmp; + e_ret_tmp = impeg2_jobq_unlock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + + //NOP(1024 * 8); + ithread_yield(); + + e_ret_tmp = impeg2_jobq_lock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + return e_ret; +} + + +/** +******************************************************************************* +* +* @brief free the job queue pointers +* +* @par Description +* Frees the jobq context +* +* @param[in] pv_buf +* Memoy for job queue buffer and job queue context +* +* @returns Pointer to job queue context +* +* @remarks +* Since it will be called only once by master thread this is not thread safe. 
+* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_free(jobq_t *ps_jobq) +{ + WORD32 i4_ret; + i4_ret = ithread_mutex_destroy(ps_jobq->pv_mutex); + + if(0 == i4_ret) + return IV_SUCCESS; + else + return IV_FAIL; +} + +/** +******************************************************************************* +* +* @brief Initialize the job queue +* +* @par Description +* Initializes the jobq context and sets write and read pointers to start of +* job queue buffer +* +* @param[in] pv_buf +* Memoy for job queue buffer and job queue context +* +* @param[in] buf_size +* Size of the total memory allocated +* +* @returns Pointer to job queue context +* +* @remarks +* Since it will be called only once by master thread this is not thread safe. +* +******************************************************************************* +*/ +void* impeg2_jobq_init(void *pv_buf, WORD32 i4_buf_size) +{ + jobq_t *ps_jobq; + UWORD8 *pu1_buf; + pu1_buf = (UWORD8 *)pv_buf; + + ps_jobq = (jobq_t *)pu1_buf; + pu1_buf += sizeof(jobq_t); + i4_buf_size -= sizeof(jobq_t); + + ps_jobq->pv_mutex = pu1_buf; + pu1_buf += ithread_get_mutex_lock_size(); + i4_buf_size -= ithread_get_mutex_lock_size(); + + if(i4_buf_size <= 0) + return NULL; + + ithread_mutex_init(ps_jobq->pv_mutex); + + ps_jobq->pv_buf_base = pu1_buf; + ps_jobq->pv_buf_wr = pu1_buf; + ps_jobq->pv_buf_rd = pu1_buf; + ps_jobq->pv_buf_end = pu1_buf + i4_buf_size; + ps_jobq->i4_terminate = 0; + + + return ps_jobq; +} +/** +******************************************************************************* +* +* @brief +* Resets the jobq conext +* +* @par Description +* Resets the jobq conext by initilizing job queue context elements +* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if lock unlock fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T 
impeg2_jobq_reset(jobq_t *ps_jobq) +{ + IV_API_CALL_STATUS_T e_ret = IV_SUCCESS; + e_ret = impeg2_jobq_lock(ps_jobq); + RETURN_IF((e_ret != IV_SUCCESS), e_ret); + + ps_jobq->pv_buf_wr = ps_jobq->pv_buf_base; + ps_jobq->pv_buf_rd = ps_jobq->pv_buf_base; + ps_jobq->i4_terminate = 0; + e_ret = impeg2_jobq_unlock(ps_jobq); + RETURN_IF((e_ret != IV_SUCCESS), e_ret); + + return e_ret; +} + +/** +******************************************************************************* +* +* @brief +* Deinitializes the jobq conext +* +* @par Description +* Deinitializes the jobq conext by calling impeg2_jobq_reset() +* and then destrying the mutex created +* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if lock unlock fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_deinit(jobq_t *ps_jobq) +{ + WORD32 i4_ret_val; + IV_API_CALL_STATUS_T e_ret = IV_SUCCESS; + + e_ret = impeg2_jobq_reset(ps_jobq); + RETURN_IF((e_ret != IV_SUCCESS), e_ret); + + i4_ret_val = ithread_mutex_destroy(ps_jobq->pv_mutex); + if(i4_ret_val) + { + return IV_FAIL; + } + + return IV_SUCCESS; +} + + +/** +******************************************************************************* +* +* @brief +* Terminates the jobq +* +* @par Description +* Terminates the jobq by setting a flag in context. 
+* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if lock unlock fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ + +IV_API_CALL_STATUS_T impeg2_jobq_terminate(jobq_t *ps_jobq) +{ + IV_API_CALL_STATUS_T e_ret = IV_SUCCESS; + e_ret = impeg2_jobq_lock(ps_jobq); + RETURN_IF((e_ret != IV_SUCCESS), e_ret); + + ps_jobq->i4_terminate = 1; + + e_ret = impeg2_jobq_unlock(ps_jobq); + RETURN_IF((e_ret != IV_SUCCESS), e_ret); + return e_ret; +} + + +/** +******************************************************************************* +* +* @brief Adds a job to the queue +* +* @par Description +* Adds a job to the queue and updates wr address to next location. +* Format/content of the job structure is abstracted and hence size of the job +* buffer is being passed. +* +* @param[in] ps_jobq +* Job Queue context +* +* @param[in] pv_job +* Pointer to the location that contains details of the job to be added +* +* @param[in] job_size +* Size of the job buffer +* +* @param[in] blocking +* To signal if the write is blocking or non-blocking. 
+* +* @returns +* +* @remarks +* Job Queue buffer is assumed to be allocated to handle worst case number of jobs +* Wrap around is not supported +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_queue(jobq_t *ps_jobq, + void *pv_job, + WORD32 i4_job_size, + WORD32 i4_blocking, + WORD32 i4_lock) +{ + IV_API_CALL_STATUS_T e_ret = IV_SUCCESS; + IV_API_CALL_STATUS_T e_ret_tmp; + UWORD8 *pu1_buf; + UNUSED(i4_blocking); + + if(i4_lock) + { + e_ret_tmp = impeg2_jobq_lock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + } + pu1_buf = (UWORD8 *)ps_jobq->pv_buf_wr; + if((UWORD8 *)ps_jobq->pv_buf_end >= (pu1_buf + i4_job_size)) + { + memcpy(ps_jobq->pv_buf_wr, pv_job, i4_job_size); + ps_jobq->pv_buf_wr = (UWORD8 *)ps_jobq->pv_buf_wr + i4_job_size; + e_ret = IV_SUCCESS; + } + else + { + /* Handle wrap around case */ + /* Wait for pv_buf_rd to consume first job_size number of bytes + * from the beginning of job queue + */ + e_ret = IV_FAIL; + } + + ps_jobq->i4_terminate = 0; + + if(i4_lock) + { + e_ret_tmp = impeg2_jobq_unlock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + } + + return e_ret; +} +/** +******************************************************************************* +* +* @brief Gets next from the Job queue +* +* @par Description +* Gets next job from the job queue and updates rd address to next location. +* Format/content of the job structure is abstracted and hence size of the job +* buffer is being passed. If it is a blocking call and if there is no new job +* then this functions unlocks the mutext and calls yield and then locks it back. 
+* and continues till a job is available or terminate is set +* +* @param[in] ps_jobq +* Job Queue context +* +* @param[out] pv_job +* Pointer to the location that contains details of the job to be written +* +* @param[in] job_size +* Size of the job buffer +* +* @param[in] blocking +* To signal if the read is blocking or non-blocking. +* +* @returns +* +* @remarks +* Job Queue buffer is assumed to be allocated to handle worst case number of jobs +* Wrap around is not supported +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_dequeue(jobq_t *ps_jobq, + void *pv_job, + WORD32 i4_job_size, + WORD32 i4_blocking, + WORD32 i4_lock) +{ + IV_API_CALL_STATUS_T e_ret; + IV_API_CALL_STATUS_T e_ret_tmp; + volatile UWORD8 *pu1_buf; + if(i4_lock) + { + e_ret_tmp = impeg2_jobq_lock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + } + pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd; + + + if((UWORD8 *)ps_jobq->pv_buf_end >= (pu1_buf + i4_job_size)) + { + while(1) + { + pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd; + if((UWORD8 *)ps_jobq->pv_buf_wr >= (pu1_buf + i4_job_size)) + { + memcpy(pv_job, ps_jobq->pv_buf_rd, i4_job_size); + ps_jobq->pv_buf_rd = (UWORD8 *)ps_jobq->pv_buf_rd + i4_job_size; + e_ret = IV_SUCCESS; + break; + } + else + { + /* If all the entries have been dequeued, then break and return */ + if(1 == ps_jobq->i4_terminate) + { + e_ret = IV_FAIL; + break; + } + + if((1 == i4_blocking) && (1 == i4_lock)) + { + impeg2_jobq_yield(ps_jobq); + + } + else + { + /* If there is no job available, + * and this is non blocking call then return fail */ + e_ret = IV_FAIL; + } + } + } + } + else + { + /* Handle wrap around case */ + /* Wait for pv_buf_rd to consume first i4_job_size number of bytes + * from the beginning of job queue + */ + e_ret = IV_FAIL; + } + if(i4_lock) + { + e_ret_tmp = impeg2_jobq_unlock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + } + + return e_ret; +} diff --git 
a/common/impeg2_job_queue.h b/common/impeg2_job_queue.h new file mode 100644 index 0000000..46d8bb9 --- /dev/null +++ b/common/impeg2_job_queue.h @@ -0,0 +1,72 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* impeg2_job_queue.h +* +* @brief +* Contains functions for job queue +* +* @author +* Harish +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef _IMPEG2_JOB_QUEUE_H_ +#define _IMPEG2_JOB_QUEUE_H_ + +typedef struct +{ + /** Pointer to buffer base which contains the jobs */ + void *pv_buf_base; + + /** Pointer to current address where new job can be added */ + void *pv_buf_wr; + + /** Pointer to current address from where next job can be obtained */ + void *pv_buf_rd; + + /** Pointer to end of job buffer */ + void *pv_buf_end; + + /** Mutex used to keep the functions thread-safe */ + void *pv_mutex; + + /** Flag to indicate jobq has to be terminated */ + WORD32 i4_terminate; +}jobq_t; + +WORD32 impeg2_jobq_ctxt_size(void); +void* impeg2_jobq_init(void *pv_buf, WORD32 buf_size); +IV_API_CALL_STATUS_T impeg2_jobq_free(jobq_t *ps_jobq); +IV_API_CALL_STATUS_T impeg2_jobq_reset(jobq_t *ps_jobq); +IV_API_CALL_STATUS_T impeg2_jobq_deinit(jobq_t *ps_jobq); +IV_API_CALL_STATUS_T impeg2_jobq_terminate(jobq_t *ps_jobq); +IV_API_CALL_STATUS_T impeg2_jobq_queue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking, WORD32 lock); +IV_API_CALL_STATUS_T impeg2_jobq_dequeue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking, WORD32 lock); + +#endif /* _IMPEG2_JOB_QUEUE_H_ */ diff --git a/common/impeg2_macros.h b/common/impeg2_macros.h new file mode 100644 index 0000000..366510f --- /dev/null +++ b/common/impeg2_macros.h @@ -0,0 +1,60 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +#ifndef __IMPEG2_MACROS_H__ +#define __IMPEG2_MACROS_H__ + +#define ABS(x) ((x) < 0 ? (-1 * (x)) : (x)) + +#define MAX(x,y) ((x) > (y) ? (x) : (y)) + +#define MIN(x,y) ((x) < (y) ? (x) : (y)) + +#define CLIP(Number,Max,Min) if((Number) > (Max)) (Number) = (Max); \ +else if((Number) < (Min)) (Number) = (Min) + +#define SIGN(Number) (((Number) < 0) ? -1 : 1) + + +#define BITS(val,msb,lsb) (UWORD16)((((val) >> (lsb)) & ((1 << ((msb) - (lsb) + 1)) - 1))) + +#define BIT(val,bit) (UWORD16)(((val) >> (bit)) & 0x1) + +#define IS_VAL_IN_RANGE(val,upperLimit,lowerLimit) ((val) >= (lowerLimit) && (val) <= (upperLimit)) + +#define MSW(dword) (dword >> 16) +#define LSW(dword) (dword & 0xFFFF) +#define DIV_2_RND(mv) (((mv) + ((mv) > 0)) >> 1) +#define IS_NEG(Number) (((Number) < 0) ? 
1 : 0) + +#define ALIGN128(x) ((((x) + 127) >> 7) << 7) +#define ALIGN64(x) ((((x) + 63) >> 6) << 6) +#define ALIGN32(x) ((((x) + 31) >> 5) << 5) +#define ALIGN16(x) ((((x) + 15) >> 4) << 4) +#define ALIGN8(x) ((((x) + 7) >> 3) << 3) + + +#define RETURN_IF(cond, retval) if(cond) {return (retval);} +#define UNUSED(x) ((void)(x)) + + +#define ASSERT(x) assert(x) + + +#endif /* __IMPEG2_IT_MACROS_H__ */ diff --git a/common/impeg2_mem_func.c b/common/impeg2_mem_func.c new file mode 100644 index 0000000..9268c01 --- /dev/null +++ b/common/impeg2_mem_func.c @@ -0,0 +1,87 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* impeg2_utils.c +* +* @brief +* Contains utility function definitions for MPEG2 codec +* +* @author +* Harish +* +* @par List of Functions: +* - impeg2_memset0_16bit_8x8_linear_block() +* - impeg2_memset_8bit_8x8_block() +* +* @remarks +* None +* +******************************************************************************* +*/ + +#include <stdio.h> +#include <string.h> +#include "iv_datatypedef.h" +#include "impeg2_defs.h" + +/******************************************************************************* +* Function Name : impeg2_memset0_16bit_8x8_linear_block +* +* Description : memsets resudial buf to 0 +* +* Arguments : destination buffer +* +* Values Returned : None +*******************************************************************************/ + + +void impeg2_memset0_16bit_8x8_linear_block (WORD16 *pi2_buf) +{ + memset(pi2_buf,0,64 * sizeof(WORD16)); +} + + + +/******************************************************************************* +* Function Name : impeg2_memset_8bit_8x8_block +* +* Description : memsets residual buf to value +* +* Arguments : destination buffer, value and stride +* +* Values Returned : None +*******************************************************************************/ + + +void impeg2_memset_8bit_8x8_block(UWORD8 *pu1_dst, WORD32 u4_dc_val, WORD32 u4_dst_wd) +{ + WORD32 j; + + for(j = BLK_SIZE; j > 0; j--) + { + memset(pu1_dst, u4_dc_val, BLK_SIZE); + pu1_dst += u4_dst_wd; + } +} + + + diff --git a/common/impeg2_mem_func.h b/common/impeg2_mem_func.h new file mode 100644 index 0000000..f73702c --- /dev/null +++ b/common/impeg2_mem_func.h @@ -0,0 +1,41 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +#ifndef IMPEG2_MEM_FUNC_H_ +#define IMPEG2_MEM_FUNC_H_ + +typedef void pf_memset0_one_16bit_buf_t (WORD16 *buf); +typedef void pf_memset_8bit_t (UWORD8 *dst, WORD32 dc_val, WORD32 dst_wd); + +pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block; +pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block_a9q; + +pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block_sse42; + +pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block_av8; + +pf_memset_8bit_t impeg2_memset_8bit_8x8_block; +pf_memset_8bit_t impeg2_memset_8bit_8x8_block_a9q; + +pf_memset_8bit_t impeg2_memset_8bit_8x8_block_sse42; + +pf_memset_8bit_t impeg2_memset_8bit_8x8_block_av8; + +#endif /* IMPEG2_MEM_FUNC_H_ */ diff --git a/common/ithread.c b/common/ithread.c new file mode 100644 index 0000000..76fdad3 --- /dev/null +++ b/common/ithread.c @@ -0,0 +1,453 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ithread.c */ +/* */ +/* Description : Contains abstraction for threads, mutex and semaphores*/ +/* */ +/* List of Functions : */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 07 09 2012 Harish Initial Version */ +/*****************************************************************************/ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ +#include <string.h> +#include "iv_datatypedef.h" +#include "ithread.h" +#include <sys/types.h> + +#ifndef X86_MSVC +//#define PTHREAD_AFFINITY +//#define SYSCALL_AFFINITY + +#ifdef PTHREAD_AFFINITY +#define _GNU_SOURCE +#define __USE_GNU +#endif + +#include <pthread.h> +#include <sched.h> +#include <semaphore.h> +#include <unistd.h> + + +#endif +#if 0 +#include <sys/syscall.h> +#endif + +#ifdef X86_MSVC + +#include <windows.h> +#define SEM_MAX_COUNT 100 +#define SEM_INCREMENT_COUNT 1 + +UWORD32 ithread_get_handle_size(void) +{ + return (sizeof(HANDLE)); +} + +UWORD32 ithread_get_mutex_lock_size(void) +{ + return (sizeof(HANDLE)); +} + +WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument) +{ + HANDLE 
*ppv_thread_handle; + HANDLE thread_handle_value; + + if(0 == thread_handle) + return -1; + + ppv_thread_handle = (HANDLE *)thread_handle; + thread_handle_value = (void *)CreateThread + (NULL, /* Attributes */ + 1024*128, /* Stack size */ + (LPTHREAD_START_ROUTINE)strt, /* Thread function */ + argument, /* Parameters */ + 0, /* Creation flags */ + NULL); /* Thread ID */ + *ppv_thread_handle = (HANDLE)thread_handle_value; + + return 0; +} + +WORD32 ithread_join(void *thread_handle, void ** val_ptr) +{ + HANDLE *ppv_thread_handle; + HANDLE thread_handle_value; + + if(0 == thread_handle) + return -1; + + ppv_thread_handle = (HANDLE *)thread_handle; + thread_handle_value = *ppv_thread_handle; + + if(WAIT_OBJECT_0 == WaitForSingleObject(thread_handle_value, INFINITE)) + { + CloseHandle(thread_handle_value); + } + + return 0; +} + +void ithread_exit(void *thread_handle) +{ + HANDLE *ppv_thread_handle; + HANDLE thread_handle_value; + DWORD thread_exit_code; + + if(0 == thread_handle) + return; + + ppv_thread_handle = (HANDLE *)thread_handle; + thread_handle_value = *ppv_thread_handle; + /* Get exit code for thread. 
If the return value is 0, means thread is busy */ + if( 0 != GetExitCodeThread(thread_handle_value, &thread_exit_code)) + { + TerminateThread(thread_handle_value, thread_exit_code); + } + + return; +} + +WORD32 ithread_get_mutex_struct_size(void) +{ + return (sizeof(HANDLE)); +} + +WORD32 ithread_mutex_init(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = CreateSemaphore(NULL, 1, 1, NULL); + *ppv_mutex_handle = mutex_handle_value; + return 0; +} + +WORD32 ithread_mutex_destroy(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = *ppv_mutex_handle; + CloseHandle(mutex_handle_value); + return 0; +} + +WORD32 ithread_mutex_lock(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + DWORD result = 0; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = *ppv_mutex_handle; + result = WaitForSingleObject(mutex_handle_value, INFINITE); + + if(WAIT_OBJECT_0 == result) + return 0; + + return 1; + +} + +WORD32 ithread_mutex_unlock(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + DWORD result = 0; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = *ppv_mutex_handle; + result = ReleaseSemaphore(mutex_handle_value, 1, NULL); + + if(0 == result) + return -1; + + return 0; +} + +void ithread_yield(void) { } + +void ithread_usleep(UWORD32 u4_time_us) +{ + UWORD32 u4_time_ms = u4_time_us / 1000; + Sleep(u4_time_ms); +} + +void ithread_msleep(UWORD32 u4_time_ms) +{ + Sleep(u4_time_ms); +} + +void ithread_sleep(UWORD32 u4_time) +{ + UWORD32 u4_time_ms = u4_time * 1000; + Sleep(u4_time_ms); +} + +UWORD32 ithread_get_sem_struct_size(void) +{ + return (sizeof(HANDLE)); +} + +WORD32 ithread_sem_init(void *sem,WORD32 
pshared,UWORD32 value) +{ + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = CreateSemaphore(NULL, /* Security Attribute*/ + value, /* Initial count */ + SEM_MAX_COUNT,/* Max value */ + NULL); /* Name, not used */ + *sem_handle = sem_handle_value; + return 0; +} + +WORD32 ithread_sem_post(void *sem) +{ + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = *sem_handle; + + /* Post on Semaphore by releasing the lock on mutex */ + if(ReleaseSemaphore(sem_handle_value, SEM_INCREMENT_COUNT, NULL)) + return 0; + + return -1; +} + +WORD32 ithread_sem_wait(void *sem) +{ + DWORD result = 0; + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = *sem_handle; + + /* Wait on Semaphore object infinitly */ + result = WaitForSingleObject(sem_handle_value, INFINITE); + + /* If lock on semaphore is acquired, return SUCCESS */ + if(WAIT_OBJECT_0 == result) + return 0; + + /* If call timeouts, return FAILURE */ + if(WAIT_TIMEOUT == result) + return -1; + + return 0; +} + +WORD32 ithread_sem_destroy(void *sem) +{ + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = *sem_handle; + + if(FALSE == CloseHandle(sem_handle_value) ) + { + return -1; + } + return 0; +} + +WORD32 ithread_set_affinity(WORD32 core_id) +{ + return 1; +} + +#else +UWORD32 ithread_get_handle_size(void) +{ + return sizeof(pthread_t); +} + +UWORD32 ithread_get_mutex_lock_size(void) +{ + return sizeof(pthread_mutex_t); +} + + +WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument) +{ + ((void)(attribute)); + return pthread_create((pthread_t *)thread_handle, NULL,(void *(*)(void *)) strt, argument); +} + +WORD32 ithread_join(void *thread_handle, void ** val_ptr) +{ + pthread_t *pthread_handle = (pthread_t *)thread_handle; + 
((void)(val_ptr)); + return pthread_join(*pthread_handle, NULL); +} + +void ithread_exit(void *val_ptr) +{ +return pthread_exit(val_ptr); +} + +WORD32 ithread_get_mutex_struct_size(void) +{ + return(sizeof(pthread_mutex_t)); +} +WORD32 ithread_mutex_init(void *mutex) +{ + return pthread_mutex_init((pthread_mutex_t *) mutex, NULL); +} + +WORD32 ithread_mutex_destroy(void *mutex) +{ + return pthread_mutex_destroy((pthread_mutex_t *) mutex); +} + +WORD32 ithread_mutex_lock(void *mutex) +{ + return pthread_mutex_lock((pthread_mutex_t *)mutex); +} + +WORD32 ithread_mutex_unlock(void *mutex) +{ + return pthread_mutex_unlock((pthread_mutex_t *)mutex); +} + +void ithread_yield(void) +{ + sched_yield(); +} + +void ithread_sleep(UWORD32 u4_time) +{ + usleep(u4_time * 1000 * 1000); +} + +void ithread_msleep(UWORD32 u4_time_ms) +{ + usleep(u4_time_ms * 1000); +} + +void ithread_usleep(UWORD32 u4_time_us) +{ + usleep(u4_time_us); +} + +UWORD32 ithread_get_sem_struct_size(void) +{ + return(sizeof(sem_t)); +} + + +WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value) +{ + return sem_init((sem_t *)sem,pshared,value); +} + +WORD32 ithread_sem_post(void *sem) +{ + return sem_post((sem_t *)sem); +} + + +WORD32 ithread_sem_wait(void *sem) +{ + return sem_wait((sem_t *)sem); +} + + +WORD32 ithread_sem_destroy(void *sem) +{ +return sem_destroy((sem_t *)sem); +} + + +WORD32 ithread_set_affinity(WORD32 core_id) +{ +#ifdef PTHREAD_AFFINITY + cpu_set_t cpuset; + int num_cores = sysconf(_SC_NPROCESSORS_ONLN); + pthread_t cur_thread = pthread_self(); + + if (core_id >= num_cores) + return -1; + + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + + return pthread_setaffinity_np(cur_thread, sizeof(cpu_set_t), &cpuset); + +#elif SYSCALL_AFFINITY + WORD32 i4_sys_res; + + pid_t pid = gettid(); + + + i4_sys_res = syscall(__NR_sched_setaffinity, pid, sizeof(i4_mask), &i4_mask); + if (i4_sys_res) + { + //WORD32 err; + //err = errno; + //perror("Error in setaffinity syscall PERROR : "); + 
//LOG_ERROR("Error in the syscall setaffinity: mask=0x%x err=0x%x", i4_mask, i4_sys_res); + return -1; + } +#endif + ((void)(core_id)); + return 1; + +} +#endif diff --git a/common/ithread.h b/common/ithread.h new file mode 100644 index 0000000..eb75d20 --- /dev/null +++ b/common/ithread.h @@ -0,0 +1,80 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ithread.h +* +* @brief +* This file contains all the necessary structure and enumeration +* definitions needed for the Application Program Interface(API) of the +* Thread Abstraction Layer +* +* @author +* Harish +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef __ITHREAD_H__ +#define __ITHREAD_H__ + +UWORD32 ithread_get_handle_size(void); + +UWORD32 ithread_get_mutex_lock_size(void); + +WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument); + +void ithread_exit(void *val_ptr); + +WORD32 ithread_join(void *thread_id, void ** val_ptr); + +WORD32 ithread_get_mutex_struct_size(void); + +WORD32 ithread_mutex_init(void *mutex); + +WORD32 ithread_mutex_destroy(void *mutex); + +WORD32 ithread_mutex_lock(void *mutex); + +WORD32 ithread_mutex_unlock(void *mutex); + +void ithread_yield(void); + +void ithread_sleep(UWORD32 u4_time); + +void ithread_msleep(UWORD32 u4_time_ms); + +void ithread_usleep(UWORD32 u4_time_us); + +UWORD32 ithread_get_sem_struct_size(void); + +WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value); + +WORD32 ithread_sem_post(void *sem); + +WORD32 ithread_sem_wait(void *sem); + +WORD32 ithread_sem_destroy(void *sem); + +WORD32 ithread_set_affinity(WORD32 core_id); +#endif /* __ITHREAD_H__ */ diff --git a/common/iv.h b/common/iv.h new file mode 100644 index 0000000..3941497 --- /dev/null +++ b/common/iv.h @@ -0,0 +1,420 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* iv.h +* +* @brief +* This file contains all the necessary structure and enumeration +* definitions needed for the Application Program Interface(API) of the +* Ittiam Video and Image codecs +* +* @author +* 100239(RCY) +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + + +#ifndef _IV_H +#define _IV_H + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + + +/*****************************************************************************/ +/* Typedefs */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* Enums */ +/*****************************************************************************/ + + +/* IV_API_CALL_STATUS_T:This is only to return the FAIL/PASS status to the */ +/* application for the current API call */ + +typedef enum{ + IV_STATUS_NA = 0x7FFFFFFF, + IV_SUCCESS = 0x0, + IV_FAIL = 0x1, +}IV_API_CALL_STATUS_T; + +/* IV_MEM_TYPE_T: This Enumeration defines the type of memory (Internal/Ext */ +/* -ernal) along with the cacheable/non-cacheable 
attributes */ + +typedef enum { + IV_NA_MEM_TYPE = 0x7FFFFFFF, + IV_INTERNAL_CACHEABLE_PERSISTENT_MEM = 0x1, + IV_INTERNAL_CACHEABLE_SCRATCH_MEM = 0x2, + IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM = 0x3, + IV_EXTERNAL_CACHEABLE_SCRATCH_MEM = 0x4, + IV_INTERNAL_NONCACHEABLE_PERSISTENT_MEM = 0x5, + IV_INTERNAL_NONCACHEABLE_SCRATCH_MEM = 0x6, + IV_EXTERNAL_NONCACHEABLE_PERSISTENT_MEM = 0x7, + IV_EXTERNAL_NONCACHEABLE_SCRATCH_MEM = 0x8 +}IV_MEM_TYPE_T; + +/* IV_COLOR_FORMAT_T: This enumeration lists all the color formats which */ +/* finds usage in video/image codecs */ + +typedef enum { + IV_CHROMA_NA = 0x7FFFFFFF, + IV_YUV_420P = 0x1, + IV_YUV_422P = 0x2, + IV_420_UV_INTL = 0x3, + IV_YUV_422IBE = 0x4, + IV_YUV_422ILE = 0x5, + IV_YUV_444P = 0x6, + IV_YUV_411P = 0x7, + IV_GRAY = 0x8, + IV_RGB_565 = 0x9, + IV_RGB_24 = 0xa, + IV_YUV_420SP_UV = 0xb, + IV_YUV_420SP_VU = 0xc, + IV_RGBA_8888 = 0xd +}IV_COLOR_FORMAT_T; + +/* IV_PICTURE_CODING_TYPE_T: VOP/Frame coding type Enumeration */ + +typedef enum { + IV_NA_FRAME = 0x7FFFFFFF, + IV_I_FRAME = 0x0, + IV_P_FRAME = 0x1, + IV_B_FRAME = 0x2, + IV_IDR_FRAME = 0x3, + IV_II_FRAME = 0x4, + IV_IP_FRAME = 0x5, + IV_IB_FRAME = 0x6, + IV_PI_FRAME = 0x7, + IV_PP_FRAME = 0x8, + IV_PB_FRAME = 0x9, + IV_BI_FRAME = 0xa, + IV_BP_FRAME = 0xb, + IV_BB_FRAME = 0xc, + IV_MBAFF_I_FRAME = 0xd, + IV_MBAFF_P_FRAME = 0xe, + IV_MBAFF_B_FRAME = 0xf, + IV_MBAFF_IDR_FRAME = 0x10, + IV_NOT_CODED_FRAME = 0x11, + IV_FRAMETYPE_DEFAULT = IV_I_FRAME +}IV_PICTURE_CODING_TYPE_T; + +/* IV_FLD_TYPE_T: field type Enumeration */ + +typedef enum { + IV_NA_FLD = 0x7FFFFFFF, + IV_TOP_FLD = 0x0, + IV_BOT_FLD = 0x1, + IV_FLD_TYPE_DEFAULT = IV_TOP_FLD +}IV_FLD_TYPE_T; + +/* IV_CONTENT_TYPE_T: Video content type */ + +typedef enum { + IV_CONTENTTYPE_NA = 0x7FFFFFFF, + IV_PROGRESSIVE = 0x0, + IV_INTERLACED = 0x1, + IV_PROGRESSIVE_FRAME = 0x2, + IV_INTERLACED_FRAME = 0x3, + IV_INTERLACED_TOPFIELD = 0x4, + IV_INTERLACED_BOTTOMFIELD = 0x5, + IV_CONTENTTYPE_DEFAULT = IV_PROGRESSIVE, 
+}IV_CONTENT_TYPE_T; + +/* IV_API_COMMAND_TYPE_T:API command type */ +typedef enum { + IV_CMD_NA = 0x7FFFFFFF, + IV_CMD_GET_NUM_MEM_REC = 0x0, + IV_CMD_FILL_NUM_MEM_REC = 0x1, + IV_CMD_RETRIEVE_MEMREC = 0x2, + IV_CMD_INIT = 0x3, + IV_CMD_DUMMY_ELEMENT = 0x4, +}IV_API_COMMAND_TYPE_T; + +/*****************************************************************************/ +/* Structure */ +/*****************************************************************************/ + +/* IV_OBJ_T: This structure defines the handle for the codec instance */ + +typedef struct{ + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * Pointer to the API function pointer table of the codec + */ + void *pv_fxns; + + /** + * Pointer to the handle of the codec + */ + void *pv_codec_handle; +}iv_obj_t; + +/* iv_mem_rec_t: This structure defines the memory record holder which will */ +/* be used by the codec to communicate its memory requirements to the */ +/* application through appropriate API functions */ + +typedef struct { + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * Pointer to the memory allocated by the application + */ + void *pv_base; + + /** + * u4_size of the memory to be allocated + */ + UWORD32 u4_mem_size; + + /** + * Alignment of the memory pointer + */ + UWORD32 u4_mem_alignment; + /** + * Nature of the memory to be allocated + */ + IV_MEM_TYPE_T e_mem_type; +}iv_mem_rec_t; + +/* IV_YUV_BUF_T: This structure defines attributes for the yuv buffer */ + +typedef struct { + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * Pointer to Luma (Y) Buffer + */ + + void *pv_y_buf; + /** + * Pointer to Chroma (Cb) Buffer + */ + void *pv_u_buf; + + /** + * Pointer to Chroma (Cr) Buffer + */ + void *pv_v_buf; + + /** + * Width of the Luma (Y) Buffer + */ + UWORD32 u4_y_wd; + + /** + * Height of the Luma (Y) Buffer + */ + UWORD32 u4_y_ht; + + /** + * Stride/Pitch of the Luma (Y) Buffer + */ + UWORD32 u4_y_strd; + + /** + * Width of the 
Chroma (Cb) Buffer + */ + UWORD32 u4_u_wd; + + /** + * Height of the Chroma (Cb) Buffer + */ + UWORD32 u4_u_ht; + + /** + * Stride/Pitch of the Chroma (Cb) Buffer + */ + UWORD32 u4_u_strd; + + /** + * Width of the Chroma (Cr) Buffer + */ + UWORD32 u4_v_wd; + + /** + * Height of the Chroma (Cr) Buffer + */ + UWORD32 u4_v_ht; + + /** + * Stride/Pitch of the Chroma (Cr) Buffer + */ + UWORD32 u4_v_strd; +}iv_yuv_buf_t; + +/*****************************************************************************/ +/* Get Number of Memory Records */ +/*****************************************************************************/ + +/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_GET_NUM_MEM_REC */ + + +typedef struct { + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * cmd + */ + IV_API_COMMAND_TYPE_T e_cmd; +}iv_num_mem_rec_ip_t; + + +typedef struct{ + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * error code + */ + UWORD32 u4_error_code; + + /** + * num_mem_rec + */ + UWORD32 u4_num_mem_rec; +}iv_num_mem_rec_op_t; + + +/*****************************************************************************/ +/* Fill Memory Records */ +/*****************************************************************************/ + +/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_FILL_NUM_MEM_REC */ + + +typedef struct { + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * cmd + */ + IV_API_COMMAND_TYPE_T e_cmd; + + /** + * pointer to array of memrecords structures should be filled by codec + with details of memory resource requirements + */ + iv_mem_rec_t *pv_mem_rec_location; + + /** + * maximum width for which codec should request memory requirements + */ + UWORD32 u4_max_frm_wd; + + /** + * maximum height for which codec should request memory requirements + */ + UWORD32 u4_max_frm_ht; +}iv_fill_mem_rec_ip_t; + + +typedef struct{ + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * error_code + */ + UWORD32 u4_error_code; + + /** + * no of 
memory record structures which are filled by codec + */ + UWORD32 u4_num_mem_rec_filled; +}iv_fill_mem_rec_op_t; + + +/*****************************************************************************/ +/* Retrieve Memory Records */ +/*****************************************************************************/ + +/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_RETRIEVE_MEMREC */ + + + +typedef struct { + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * cmd + */ + IV_API_COMMAND_TYPE_T e_cmd; + + /** + * array of structures where codec should fill with all resources(memory) with it + */ + iv_mem_rec_t *pv_mem_rec_location; +}iv_retrieve_mem_rec_ip_t; + + +typedef struct{ + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * error_code + */ + UWORD32 u4_error_code; + + /** + * no of memory records filled by codec + */ + UWORD32 u4_num_mem_rec_filled; +}iv_retrieve_mem_rec_op_t; + + + +#endif /* _IV_H */ + diff --git a/common/iv_datatypedef.h b/common/iv_datatypedef.h new file mode 100644 index 0000000..3c45942 --- /dev/null +++ b/common/iv_datatypedef.h @@ -0,0 +1,81 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : datatypedef.h */ +/* */ +/* Description : This file contains all the necessary data type */ +/* definitions. */ +/* */ +/* List of Functions : None */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 29 12 2006 Rajendra C Y Draft */ +/* */ +/*****************************************************************************/ + +#ifndef __IV_DATATYPEDEF_H__ +#define __IV_DATATYPEDEF_H__ + +/*****************************************************************************/ +/* Typedefs */ +/*****************************************************************************/ + +typedef int WORD32; +typedef unsigned int UWORD32; + +typedef short WORD16; +typedef unsigned short UWORD16; + +typedef char WORD8; +typedef unsigned char UWORD8; + +typedef char CHAR; +#ifndef NULL +#define NULL ((void *)0) + +#endif + +typedef enum +{ + IT_FALSE, + IT_TRUE +} IT_BOOL; + + +typedef enum +{ + IT_OK, + IT_ERROR = -1 +} IT_STATUS; + +/*****************************************************************************/ +/* Input and Output Parameter identifiers */ +/*****************************************************************************/ +#define IT_IN +#define IT_OUT + + +#endif /* __IV_DATATYPEDEF_H__ */ + diff --git a/common/mips/impeg2_platform_macros.h b/common/mips/impeg2_platform_macros.h new file mode 100644 index 0000000..05ff6da --- /dev/null +++ b/common/mips/impeg2_platform_macros.h @@ -0,0 +1,49 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +#ifndef __IMPEG2_PLATFORM_MACROS_H__ +#define __IMPEG2_PLATFORM_MACROS_H__ + + +#define CONV_LE_TO_BE(u4_temp2,u4_temp1) u4_temp2 = (u4_temp1 << 24) | \ + ((u4_temp1 & 0xff00) << 8) | \ + ((u4_temp1 & 0xff0000) >> 8) | \ + (u4_temp1 >> 24); +static __inline UWORD32 CLZ(UWORD32 u4_word) +{ + if(u4_word) + return (__builtin_clz(u4_word)); + else + return 32; +} + + +#define CLIP_U8(x) ((x) > 255) ? (255) : (((x) < 0) ? (0) : (x)) +#define CLIP_S8(x) ((x) > 127) ? (127) : (((x) < -128) ? (-128) : (x)) + +#define CLIP_U12(x) ((x) > 4095) ? (4095) : (((x) < 0) ? (0) : (x)) +#define CLIP_S12(x) ((x) > 2047) ? (2047) : (((x) < -2048) ? (-2048) : (x)) + +#define CLIP_U16(x) ((x) > 65535) ? (65535) : (((x) < 0) ? (0) : (x)) +#define CLIP_S16(x) ((x) > 65535) ? (65535) : (((x) < -65536) ? (-65536) : (x)) +#define PLD(x) + +#define INLINE + +#endif /* __IMPEG2_PLATFORM_MACROS_H__ */ diff --git a/common/x86/impeg2_idct_recon_sse42_intr.c b/common/x86/impeg2_idct_recon_sse42_intr.c new file mode 100755 index 0000000..4142032 --- /dev/null +++ b/common/x86/impeg2_idct_recon_sse42_intr.c @@ -0,0 +1,2205 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * impeg2_itrans_recon_x86_intr.c + * + * @brief + * Contains function definitions for inverse quantization, inverse + * transform and reconstruction + * + * @author + * 100470 + * 100592 (edited by) + * + * @par List of Functions: + * - impeg2_itrans_recon_8x8_sse42() + * + * @remarks + * None + * + ******************************************************************************* + */ +#include <stdio.h> +#include <string.h> +#include "iv_datatypedef.h" +#include "impeg2_macros.h" +#include "impeg2_defs.h" +#include "impeg2_globals.h" + +#include <immintrin.h> +#include <emmintrin.h> +#include <smmintrin.h> +#include <tmmintrin.h> + + +/** + ******************************************************************************* + * + * @brief + * This function performs inverse quantization, inverse transform and + * reconstruction for 8c8 input block + * + * @par Description: + * Performs inverse quantization , inverse transform and adds the + * prediction data and clips output to 8 bit + * + * @param[in] pi2_src + * Input 8x8 coefficients + * + * @param[in] pi2_tmp + * Temporary 8x8 buffer for storing inverse + * transform 1st stage output + * + * @param[in] pu1_pred + * Prediction 8x8 block + * + * @param[in] pi2_dequant_coeff + * Dequant Coeffs + * + * @param[out] pu1_dst + * Output 8x8 
block + * + * @param[in] src_strd + * Input stride + * + * @param[in] qp_div + * Quantization parameter / 6 + * + * @param[in] qp_rem + * Quantization parameter % 6 + * + * @param[in] pred_strd + * Prediction stride + * + * @param[in] dst_strd + * Output Stride + * + * @param[in] zero_cols + * Zero columns in pi2_src + * + * @returns Void + * + * @remarks + * None + * + ******************************************************************************* + */ + + +void impeg2_idct_recon_sse42(WORD16 *pi2_src, + WORD16 *pi2_tmp, + UWORD8 *pu1_pred, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 dst_strd, + WORD32 zero_cols, + WORD32 zero_rows) +{ + __m128i m_temp_reg_0; + __m128i m_temp_reg_1; + __m128i m_temp_reg_2; + __m128i m_temp_reg_3; + __m128i m_temp_reg_5; + __m128i m_temp_reg_6; + __m128i m_temp_reg_7; + __m128i m_temp_reg_4; + __m128i m_temp_reg_10; + __m128i m_temp_reg_11; + __m128i m_temp_reg_12; + __m128i m_temp_reg_13; + __m128i m_temp_reg_14; + __m128i m_temp_reg_15; + __m128i m_temp_reg_16; + __m128i m_temp_reg_17; + __m128i m_temp_reg_20; + __m128i m_temp_reg_21; + __m128i m_temp_reg_22; + __m128i m_temp_reg_23; + __m128i m_temp_reg_24; + __m128i m_temp_reg_25; + __m128i m_temp_reg_26; + __m128i m_temp_reg_27; + __m128i m_temp_reg_30; + __m128i m_temp_reg_31; + __m128i m_temp_reg_32; + __m128i m_temp_reg_33; + __m128i m_temp_reg_34; + __m128i m_temp_reg_35; + __m128i m_temp_reg_36; + __m128i m_temp_reg_37; + __m128i m_temp_reg_40; + __m128i m_temp_reg_41; + __m128i m_temp_reg_42; + __m128i m_temp_reg_43; + __m128i m_temp_reg_44; + __m128i m_temp_reg_45; + __m128i m_temp_reg_46; + __m128i m_temp_reg_47; + __m128i m_temp_reg_50; + __m128i m_temp_reg_51; + __m128i m_temp_reg_52; + __m128i m_temp_reg_53; + __m128i m_temp_reg_54; + __m128i m_temp_reg_55; + __m128i m_temp_reg_56; + __m128i m_temp_reg_57; + __m128i m_temp_reg_60; + __m128i m_temp_reg_61; + __m128i m_temp_reg_62; + __m128i m_temp_reg_63; + __m128i m_temp_reg_64; + __m128i 
m_temp_reg_65; + __m128i m_temp_reg_66; + __m128i m_temp_reg_67; + __m128i m_temp_reg_70; + __m128i m_temp_reg_71; + __m128i m_temp_reg_72; + __m128i m_temp_reg_73; + __m128i m_temp_reg_74; + __m128i m_temp_reg_75; + __m128i m_temp_reg_76; + __m128i m_temp_reg_77; + __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; + + WORD32 check_row_stage_1; /* Lokesh */ + WORD32 check_row_stage_2; /* Lokesh */ + + __m128i m_rdng_factor; + WORD32 i4_shift = IDCT_STG1_SHIFT; + UNUSED(pi2_tmp); + check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0; + check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0; + + m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + + m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src); + + if(!check_row_stage_2) + { + if(!check_row_stage_1) + { + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + //Interleaving 0,4 row in 0 , 1 Rishab + /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]); + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the 
registers m_temp_reg_16 and m_temp_reg_17 */ + /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ + { + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 + + /* Combining instructions to eliminate them based on zero_rows : Lokesh */ + //Interleaving 2,6 row in 4, 5 Rishab + m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + + + /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]); + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]); + + + + /* e */ + + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + } + + /* o */ + { + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + + m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); + //o0:1B*89+3B*75,5B*50+7B*18 + m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + + + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of 
destination computed here */ + /* It is stored in m_temp_reg_57 */ + /* Upper 8 bytes of both registers are zero due to zero_cols*/ + + + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_setzero_si128(); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o1:1B*75-3B*18,5B*89+7B*50 + m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + /* Loading coeff for computing o2 in the next block */ + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]); + + /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ + + + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o2:1B*50-3B*89,5B*18+7B*75 + m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i 
*)&gai2_impeg2_idct_odd_8_q15[6][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]); + + + + /* Column 2 of destination computed here */ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o3:1B*18-3B*50,5B*75-7B*89 + m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + } + } + + /* Transpose of the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, 
m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + + m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + + m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_54 = _mm_setzero_si128(); + m_temp_reg_55 = _mm_setzero_si128(); + m_temp_reg_56 = _mm_setzero_si128(); + m_temp_reg_57 = _mm_setzero_si128(); + } + } + else + { + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + //Interleaving 0,4 row in 0 , 1 Rishab + /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]); + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ + /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ + { + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 + + /* Combining instructions to eliminate them based on zero_rows : Lokesh */ + //Interleaving 2,6 row in 4, 5 Rishab 
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + + + /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]); + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]); + + + + /* e */ + + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + } + + /* o */ + { + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + + m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); + m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); + //o0:1B*89+3B*75,5B*50+7B*18 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + + m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + + + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of destination computed here */ + /* It is stored in m_temp_reg_57 */ + /* Upper 8 bytes of both registers are zero due to zero_cols*/ + + + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_62 = 
_mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_setzero_si128(); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o1:1B*75-3B*18,5B*89+7B*50 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + /* Loading coeff for computing o2 in the next block */ + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]); + + /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ + m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); + + + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o2:1B*50-3B*89,5B*18+7B*75 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]); + + 
m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + + + /* Column 2 of destination computed here */ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o3:1B*18-3B*50,5B*75-7B*89 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); + + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + } + } + + /* Transpose of the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, 
m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + + m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + + m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_54 = _mm_setzero_si128(); + m_temp_reg_55 = _mm_setzero_si128(); + m_temp_reg_56 = _mm_setzero_si128(); + m_temp_reg_57 = _mm_setzero_si128(); + } + } + + /* Stage 2 */ + i4_shift = IDCT_STG2_SHIFT; + { + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]); + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ + { + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); + m_temp_reg_1 = 
_mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); + + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + + /* Loading coeff for computing o0 in the next block */ + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]); + + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); + + + + /* e */ + + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); + m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); + + m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); + m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); + + } + + /* o */ + { + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + //o0:1B*89+3B*75,1T*89+3T*75 + m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + /* Loading coeff for computing o1 in the next block */ + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]); + + + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of destination computed here */ + /* It is stored in m_temp_reg_57 */ + + m_temp_reg_2 = 
_mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + //o1:1B*75-3B*18,1T*75-3T*18 + m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); + m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ + + + /* Loading coeff for computing o2 in the next block */ + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]); + + + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + //o2:1B*50-3B*89,5T*18+7T*75. 
+ m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]); + + + /* Column 2 of destination computed here */ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + //o3:1B*18-3B*50,1T*18-3T*50 + m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); + m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* 
Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + + m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); + m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); + + m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); + m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); + m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); + m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor); + + m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift); + m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); + m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); + m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift); + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); + } + } + + /* Transpose of the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); + m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); + + m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, 
m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); + m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); + } + + /* Recon and store */ + { + m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); + + m_temp_reg_50 = _mm_setzero_si128(); + m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); + m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); + m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); + m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); + m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); + m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); + m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); + m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); + + m_temp_reg_50 = 
_mm_add_epi16(m_temp_reg_10, m_temp_reg_0); + m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); + m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); + m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); + m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); + m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); + m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6); + m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); + + m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); + m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); + m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); + m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); + m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); + m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); + m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); + m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); + + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); + pu1_dst += dst_strd; + } + } + } + else + + { + + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + if(!check_row_stage_1) + { + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + //Interleaving 
0,4 row in 0 , 1 Rishab + /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]); + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + + + m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ + { + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 + + /* Combining instructions to eliminate them based on zero_rows : Lokesh */ + //Interleaving 2,6 row in 4, 5 Rishab + m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); + m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + + m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); + m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); + + + + /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); + //m_coeff4 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[3][0]); + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); + //m_coeff2 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[1][0]); + + } + + /* e */ + { + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in 
m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); + m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); + + m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); + m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); + + } + + /* o */ + { + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + + m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); + m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); + //o0:1B*89+3B*75,1T*89+3T*75 + m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + + } + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of destination computed here */ + /* It is stored in m_temp_reg_57 */ + { + + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = 
_mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 + m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + /* Loading coeff for computing o2 in the next block */ + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); + + } + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o2:1B*50-3B*89,1T*50-3T*89 + m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]); + + } + + /* Column 2 of destination computed here 
*/ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o3:1B*18-3B*50,1T*18-3T*50 + m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + + } + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = 
_mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + } + } + + /* Transpose of the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + + + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); + m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); + + m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); + + m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, 
m_temp_reg_7); + m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); + } + } + else + { + + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + //Interleaving 0,4 row in 0 , 1 Rishab + /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]); + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + + + m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ + { + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 + + /* Combining instructions to eliminate them based on zero_rows : Lokesh */ + //Interleaving 2,6 row in 4, 5 Rishab + m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); + m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + + m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); + m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); + + + + /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); + m_coeff4 = _mm_loadu_si128((__m128i 
*)&gai2_impeg2_idct_odd_8_q15[3][0]); + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]); + + } + + /* e */ + { + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); + m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); + + m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); + m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); + + } + + /* o */ + { + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + + m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); + m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); + m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); + m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); + //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); + m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); + + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + + m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); + } + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of destination computed here */ + /* It is stored in 
m_temp_reg_57 */ + { + + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); + m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); + m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + /* Loading coeff for computing o2 in the next block */ + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]); + + /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ + m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); + m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); + } + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); + m_temp_reg_67 = 
_mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); + m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); + m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]); + + m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); + } + + /* Column 2 of destination computed here */ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + 
m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); + m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); + m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + + m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); + m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27); + } + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + } + } + + /* Transpose of 
the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + + + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); + m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); + + m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); + + m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); + m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); + } + } + /* Stage 2 */ + + i4_shift = IDCT_STG2_SHIFT; + + { + + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + m_coeff1 = 
_mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]); + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ + { + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); + + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + + /* Loading coeff for computing o0 in the next block */ + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[1][0]); + + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); + } + + /* e */ + { + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = 
_mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); + m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); + + m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); + m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); + + } + + /* o */ + { + m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57); + m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57); + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + /* Loading coeff for computing o1 in the next block */ + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[3][0]); + + m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); + } + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of destination computed here */ + /* It is stored in m_temp_reg_57 */ + { + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + 
m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); + m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); + m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ + + + /* Loading coeff for computing o2 in the next block */ + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[5][0]); + + m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); + m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); + } + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + { + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, 
m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[7][0]); + + m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); + } + + /* Column 2 of destination computed here */ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + { + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); + m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); + m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_55 = 
_mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + + m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); + m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27); + } + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + { + m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); + m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); + + m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); + m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); + m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); + m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor); + + m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift); + m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); + m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); + m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift); + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); + } + } + + /* Transpose of the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); + m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); + 
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); + m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); + } + + /* Recon and store */ + { + m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); + + + m_temp_reg_50 = _mm_setzero_si128(); + m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); + m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); + m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); + m_temp_reg_3 = 
_mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); + m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); + m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); + m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); + m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); + + m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0); + m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); + m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); + m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); + m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); + m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); + m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6); + m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); + + m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); + m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); + m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); + m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); + m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); + m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); + m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); + m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); + + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); + pu1_dst += dst_strd; + + } + + + } + + + } +} + +void 
impeg2_idct_recon_dc_mismatch_sse42(WORD16 *pi2_src, + WORD16 *pi2_tmp, + UWORD8 *pu1_pred, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 dst_strd, + WORD32 zero_cols, + WORD32 zero_rows) +{ + WORD32 val; + __m128i value_4x32b, mismatch_stg2_additive; + __m128i pred_r, pred_half0, pred_half1; + __m128i temp0, temp1; + __m128i round_stg2 = _mm_set1_epi32(IDCT_STG2_ROUND); + + UNUSED(pi2_tmp); + UNUSED(src_strd); + UNUSED(zero_cols); + UNUSED(zero_rows); + + val = pi2_src[0] * gai2_impeg2_idct_q15[0]; + val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT); + val *= gai2_impeg2_idct_q11[0]; + value_4x32b = _mm_set1_epi32(val); + + // Row 0 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) gai2_impeg2_mismatch_stg2_additive); + pred_r = _mm_loadl_epi64((__m128i *) pu1_pred); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)pu1_dst, temp0); + + // Row 1 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 8)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = 
_mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp0); + + // Row 2 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 16)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 2 * pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), temp0); + + // Row 3 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 24)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 3 * pred_strd)); + pred_r = 
_mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), temp0); + + // Row 4 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 32)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 4 * pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), temp0); + + // Row 5 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) 
(gai2_impeg2_mismatch_stg2_additive + 40)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 5 * pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), temp0); + + // Row 6 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 48)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 6 * pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i 
*)(pu1_dst + 6 * dst_strd), temp0); + + // Row 7 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 56)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 7 * pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), temp0); +} + +void impeg2_idct_recon_dc_sse42(WORD16 *pi2_src, + WORD16 *pi2_tmp, + UWORD8 *pu1_pred, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 dst_strd, + WORD32 zero_cols, + WORD32 zero_rows) +{ + WORD32 val; + __m128i value_4x32b, pred_r0, pred_r1, temp0, temp1, temp2, temp3; + + UNUSED(pi2_tmp); + UNUSED(src_strd); + UNUSED(zero_cols); + UNUSED(zero_rows); + + val = pi2_src[0] * gai2_impeg2_idct_q15[0]; + val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT); + val = val * gai2_impeg2_idct_q11[0]; + val = ((val + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT); + + value_4x32b = _mm_set1_epi32(val); + + //Row 0-1 processing + pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); + pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); + pred_r0 = _mm_cvtepu8_epi16(pred_r0); + pred_r1 = _mm_cvtepu8_epi16(pred_r1); + + temp0 = _mm_cvtepu16_epi32(pred_r0); + pred_r0 = 
_mm_srli_si128(pred_r0, 8); + temp2 = _mm_cvtepu16_epi32(pred_r1); + pred_r1 = _mm_srli_si128(pred_r1, 8); + temp1 = _mm_cvtepu16_epi32(pred_r0); + temp3 = _mm_cvtepu16_epi32(pred_r1); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp2 = _mm_add_epi32(temp2, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp3 = _mm_add_epi32(temp3, value_4x32b); + temp0 = _mm_packus_epi32(temp0, temp1); + temp2 = _mm_packus_epi32(temp2, temp3); + temp0 = _mm_packus_epi16(temp0, temp1); + temp2 = _mm_packus_epi16(temp2, temp3); + _mm_storel_epi64((__m128i *)(pu1_dst), temp0); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); + + //Row 2-3 processing + pu1_pred += 2 * pred_strd; + pu1_dst += 2 * dst_strd; + + pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); + pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); + pred_r0 = _mm_cvtepu8_epi16(pred_r0); + pred_r1 = _mm_cvtepu8_epi16(pred_r1); + + temp0 = _mm_cvtepu16_epi32(pred_r0); + pred_r0 = _mm_srli_si128(pred_r0, 8); + temp2 = _mm_cvtepu16_epi32(pred_r1); + pred_r1 = _mm_srli_si128(pred_r1, 8); + temp1 = _mm_cvtepu16_epi32(pred_r0); + temp3 = _mm_cvtepu16_epi32(pred_r1); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp2 = _mm_add_epi32(temp2, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp3 = _mm_add_epi32(temp3, value_4x32b); + temp0 = _mm_packus_epi32(temp0, temp1); + temp2 = _mm_packus_epi32(temp2, temp3); + temp0 = _mm_packus_epi16(temp0, temp1); + temp2 = _mm_packus_epi16(temp2, temp3); + _mm_storel_epi64((__m128i *)(pu1_dst), temp0); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); + + //Row 4-5 processing + pu1_pred += 2 * pred_strd; + pu1_dst += 2 * dst_strd; + + pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); + pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); + pred_r0 = _mm_cvtepu8_epi16(pred_r0); + pred_r1 = _mm_cvtepu8_epi16(pred_r1); + + temp0 = _mm_cvtepu16_epi32(pred_r0); + pred_r0 = _mm_srli_si128(pred_r0, 8); + temp2 = 
_mm_cvtepu16_epi32(pred_r1); + pred_r1 = _mm_srli_si128(pred_r1, 8); + temp1 = _mm_cvtepu16_epi32(pred_r0); + temp3 = _mm_cvtepu16_epi32(pred_r1); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp2 = _mm_add_epi32(temp2, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp3 = _mm_add_epi32(temp3, value_4x32b); + temp0 = _mm_packus_epi32(temp0, temp1); + temp2 = _mm_packus_epi32(temp2, temp3); + temp0 = _mm_packus_epi16(temp0, temp1); + temp2 = _mm_packus_epi16(temp2, temp3); + _mm_storel_epi64((__m128i *)(pu1_dst), temp0); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); + + //Row 6-7 processing + pu1_pred += 2 * pred_strd; + pu1_dst += 2 * dst_strd; + + pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); + pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); + pred_r0 = _mm_cvtepu8_epi16(pred_r0); + pred_r1 = _mm_cvtepu8_epi16(pred_r1); + + temp0 = _mm_cvtepu16_epi32(pred_r0); + pred_r0 = _mm_srli_si128(pred_r0, 8); + temp2 = _mm_cvtepu16_epi32(pred_r1); + pred_r1 = _mm_srli_si128(pred_r1, 8); + temp1 = _mm_cvtepu16_epi32(pred_r0); + temp3 = _mm_cvtepu16_epi32(pred_r1); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp2 = _mm_add_epi32(temp2, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp3 = _mm_add_epi32(temp3, value_4x32b); + temp0 = _mm_packus_epi32(temp0, temp1); + temp2 = _mm_packus_epi32(temp2, temp3); + temp0 = _mm_packus_epi16(temp0, temp1); + temp2 = _mm_packus_epi16(temp2, temp3); + _mm_storel_epi64((__m128i *)(pu1_dst), temp0); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); +} diff --git a/common/x86/impeg2_inter_pred_sse42_intr.c b/common/x86/impeg2_inter_pred_sse42_intr.c new file mode 100644 index 0000000..4599afa --- /dev/null +++ b/common/x86/impeg2_inter_pred_sse42_intr.c @@ -0,0 +1,899 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * impeg2_inter_pred_sse42_intr.c + * + * @brief + * Contains Motion compensation function definitions for MPEG2 decoder + * + * @author + * Mohit [100664] + * + * - impeg2_copy_mb_sse42() + * - impeg2_interpolate_sse42() + * - impeg2_mc_halfx_halfy_8x8_sse42() + * - impeg2_mc_halfx_fully_8x8_sse42() + * - impeg2_mc_fullx_halfy_8x8_sse42() + * - impeg2_mc_fullx_fully_8x8_sse42() + * + * @remarks + * None + * + ******************************************************************************* + */ +#include <stdio.h> +#include <string.h> +#include "iv_datatypedef.h" +#include "impeg2_macros.h" +#include "impeg2_defs.h" +#include "impeg2_inter_pred.h" + +#include <immintrin.h> +#include <emmintrin.h> +#include <smmintrin.h> +#include <tmmintrin.h> + +/******************************************************************************* +* Function Name : impeg2_copy_mb +* +* Description : copies 3 components to the frame from mc_buf +* +* Arguments : +* src_buf : Source Buffer +* dst_buf : Destination Buffer +* src_wd : Source Width +* dst_wd : destination Width +* +* Values Returned : None +*******************************************************************************/ +void impeg2_copy_mb_sse42(yuv_buf_t 
*src_buf, + yuv_buf_t *dst_buf, + UWORD32 src_wd, + UWORD32 dst_wd) +{ + UWORD8 *src; + UWORD8 *dst; + __m128i src_r0, src_r1, src_r2, src_r3; + + /*******************************************************/ + /* copy Y */ + /*******************************************************/ + src = src_buf->pu1_y; + dst = dst_buf->pu1_y; + // Row 0-3 + src_r0 = _mm_loadu_si128((__m128i *) (src)); + src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); + src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); + src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); + + _mm_storeu_si128((__m128i *) dst, src_r0); + _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); + + // Row 4-7 + src += 4 * src_wd; + dst += 4 * dst_wd; + src_r0 = _mm_loadu_si128((__m128i *) (src)); + src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); + src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); + src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); + + _mm_storeu_si128((__m128i *) dst, src_r0); + _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); + + // Row 8-11 + src += 4 * src_wd; + dst += 4 * dst_wd; + src_r0 = _mm_loadu_si128((__m128i *) (src)); + src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); + src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); + src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); + + _mm_storeu_si128((__m128i *) dst, src_r0); + _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); + + // Row 12-15 + src += 4 * src_wd; + dst += 4 * dst_wd; + src_r0 = _mm_loadu_si128((__m128i *) (src)); + src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); + src_r2 = _mm_loadu_si128((__m128i *) (src 
+ 2 * src_wd)); + src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); + + _mm_storeu_si128((__m128i *) dst, src_r0); + _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); + + src_wd >>= 1; + dst_wd >>= 1; + + /*******************************************************/ + /* copy U */ + /*******************************************************/ + src = src_buf->pu1_u; + dst = dst_buf->pu1_u; + + // Row 0-3 + src_r0 = _mm_loadl_epi64((__m128i *)src); + src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); + src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); + src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); + + _mm_storel_epi64((__m128i *)dst, src_r0); + _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); + _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); + + // Row 4-7 + src += 4 * src_wd; + dst += 4 * dst_wd; + + src_r0 = _mm_loadl_epi64((__m128i *)src); + src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); + src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); + src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); + + _mm_storel_epi64((__m128i *)dst, src_r0); + _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); + _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); + + /*******************************************************/ + /* copy V */ + /*******************************************************/ + src = src_buf->pu1_v; + dst = dst_buf->pu1_v; + // Row 0-3 + src_r0 = _mm_loadl_epi64((__m128i *)src); + src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); + src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); + src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); + + _mm_storel_epi64((__m128i *)dst, src_r0); + _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); + 
_mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); + _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); + + // Row 4-7 + src += 4 * src_wd; + dst += 4 * dst_wd; + + src_r0 = _mm_loadl_epi64((__m128i *)src); + src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); + src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); + src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); + + _mm_storel_epi64((__m128i *)dst, src_r0); + _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); + _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_interpolate */ +/* */ +/* Description : averages the contents of buf_src1 and buf_src2 and stores*/ +/* result in buf_dst */ +/* */ +/* Inputs : buf_src1 - First Source */ +/* buf_src2 - Second Source */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Avg the values from two sources and store the result in */ +/* destination buffer */ +/* */ +/* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */ +/* */ +/* Returns : None */ +/* */ +/* Issues : Assumes that all 3 buffers are of same size */ +/* */ +/*****************************************************************************/ +void impeg2_interpolate_sse42(yuv_buf_t *buf_src1, + yuv_buf_t *buf_src2, + yuv_buf_t *buf_dst, + UWORD32 stride) +{ + UWORD8 *src1, *src2; + UWORD8 *dst; + __m128i src1_r0, src1_r1, src1_r2, src1_r3; + __m128i src2_r0, src2_r1, src2_r2, src2_r3; + + /*******************************************************/ + /* interpolate Y */ + /*******************************************************/ + src1 = buf_src1->pu1_y; + src2 = buf_src2->pu1_y; + dst = buf_dst->pu1_y; + // Row 0-3 + src1_r0 = _mm_loadu_si128((__m128i *) (src1)); + src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); + src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); + src1_r3 = 
_mm_loadu_si128((__m128i *) (src1 + 3 * 16)); + + src2_r0 = _mm_loadu_si128((__m128i *) (src2)); + src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); + src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); + src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storeu_si128((__m128i *) dst, src1_r0); + _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); + + // Row 4-7 + src1 += 4 * 16; + src2 += 4 * 16; + dst += 4 * stride; + src1_r0 = _mm_loadu_si128((__m128i *) (src1)); + src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); + src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); + src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); + + src2_r0 = _mm_loadu_si128((__m128i *) (src2)); + src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); + src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); + src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storeu_si128((__m128i *) dst, src1_r0); + _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); + + // Row 8-11 + src1 += 4 * 16; + src2 += 4 * 16; + dst += 4 * stride; + src1_r0 = _mm_loadu_si128((__m128i *) (src1)); + src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); + src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); + src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); + + src2_r0 = _mm_loadu_si128((__m128i *) (src2)); + src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); + 
src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); + src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storeu_si128((__m128i *) dst, src1_r0); + _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); + + // Row 12-15 + src1 += 4 * 16; + src2 += 4 * 16; + dst += 4 * stride; + src1_r0 = _mm_loadu_si128((__m128i *) (src1)); + src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); + src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); + src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); + + src2_r0 = _mm_loadu_si128((__m128i *) (src2)); + src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); + src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); + src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storeu_si128((__m128i *) dst, src1_r0); + _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); + + stride >>= 1; + + /*******************************************************/ + /* interpolate U */ + /*******************************************************/ + src1 = buf_src1->pu1_u; + src2 = buf_src2->pu1_u; + dst = buf_dst->pu1_u; + // Row 0-3 + src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); + src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); + src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); + src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); + + src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); + src2_r1 = 
_mm_loadl_epi64((__m128i *) (src2 + 8)); + src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); + src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storel_epi64((__m128i *) dst, src1_r0); + _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); + _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); + + // Row 4-7 + src1 += 4 * 8; + src2 += 4 * 8; + dst += 4 * stride; + + src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); + src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); + src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); + src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); + + src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); + src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); + src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); + src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storel_epi64((__m128i *) dst, src1_r0); + _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); + _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); + + /*******************************************************/ + /* interpolate V */ + /*******************************************************/ + src1 = buf_src1->pu1_v; + src2 = buf_src2->pu1_v; + dst = buf_dst->pu1_v; + + // Row 0-3 + src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); + src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); + src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); + src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); + + src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); + src2_r1 = 
_mm_loadl_epi64((__m128i *) (src2 + 8)); + src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); + src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storel_epi64((__m128i *) dst, src1_r0); + _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); + _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); + + // Row 4-7 + src1 += 4 * 8; + src2 += 4 * 8; + dst += 4 * stride; + + src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); + src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); + src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); + src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); + + src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); + src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); + src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); + src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storel_epi64((__m128i *) dst, src1_r0); + _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); + _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_halfx_halfy_8x8_sse42() */ +/* */ +/* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. 
*/ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */ +/* the ref frame.Interpolate these four values to get the */ +/* value at(0.5,0.5).Repeat this to get an 8 x 8 block */ +/* using 9 x 9 block from reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/*****************************************************************************/ +void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out, + UWORD8 *ref, + UWORD32 ref_wid, + UWORD32 out_wid) +{ + UWORD8 *ref_p0,*ref_p1,*ref_p2,*ref_p3; + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 P1 + Q + P2 P3 + */ + __m128i src_r0, src_r0_1, src_r1, src_r1_1; + __m128i tmp0, tmp1; + __m128i value_2 = _mm_set1_epi16(2); + + ref_p0 = ref; + ref_p1 = ref + 1; + ref_p2 = ref + ref_wid; + ref_p3 = ref + ref_wid + 1; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); + src_r1 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 1 + src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + src_r1 = _mm_cvtepu8_epi16(src_r1); + src_r1_1 = _mm_cvtepu8_epi16(src_r1_1); + + tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 0 horizontal interpolation + tmp1 = _mm_add_epi16(src_r1, src_r1_1); //Row 1 horizontal interpolation + tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 0 vertical interpolation + tmp0 = _mm_add_epi16(tmp0, value_2); + tmp0 = _mm_srli_epi16(tmp0, 2); + tmp0 = _mm_packus_epi16(tmp0, value_2); + + _mm_storel_epi64((__m128i *)out, tmp0); + + //Row 1 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) 
(ref_p2)); //Row 2 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 2 horizontal interpolation + tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 1 vertical interpolation + tmp1 = _mm_add_epi16(tmp1, value_2); + tmp1 = _mm_srli_epi16(tmp1, 2); + tmp1 = _mm_packus_epi16(tmp1, value_2); + + _mm_storel_epi64((__m128i *)out, tmp1); + + //Row 2 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 3 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 3 horizontal interpolation + + tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 2 vertical interpolation + tmp0 = _mm_add_epi16(tmp0, value_2); + tmp0 = _mm_srli_epi16(tmp0, 2); + tmp0 = _mm_packus_epi16(tmp0, value_2); + + _mm_storel_epi64((__m128i *)out, tmp0); + + //Row 3 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 4 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 4 horizontal interpolation + + tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 3 vertical interpolation + tmp1 = _mm_add_epi16(tmp1, value_2); + tmp1 = _mm_srli_epi16(tmp1, 2); + tmp1 = _mm_packus_epi16(tmp1, value_2); + + _mm_storel_epi64((__m128i *)out, tmp1); + + //Row 4 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 5 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 5 horizontal interpolation + + tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 4 vertical 
interpolation + tmp0 = _mm_add_epi16(tmp0, value_2); + tmp0 = _mm_srli_epi16(tmp0, 2); + tmp0 = _mm_packus_epi16(tmp0, value_2); + + _mm_storel_epi64((__m128i *)out, tmp0); + + //Row 5 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 6 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 6 horizontal interpolation + + tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 5 vertical interpolation + tmp1 = _mm_add_epi16(tmp1, value_2); + tmp1 = _mm_srli_epi16(tmp1, 2); + tmp1 = _mm_packus_epi16(tmp1, value_2); + + _mm_storel_epi64((__m128i *)out, tmp1); + + //Row 6 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 7 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 7 horizontal interpolation + + tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 6 vertical interpolation + tmp0 = _mm_add_epi16(tmp0, value_2); + tmp0 = _mm_srli_epi16(tmp0, 2); + tmp0 = _mm_packus_epi16(tmp0, value_2); + + _mm_storel_epi64((__m128i *)out, tmp0); + + //Row 7 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 8 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 8 horizontal interpolation + + tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 7 vertical interpolation + tmp1 = _mm_add_epi16(tmp1, value_2); + tmp1 = _mm_srli_epi16(tmp1, 2); + tmp1 = _mm_packus_epi16(tmp1, value_2); + + _mm_storel_epi64((__m128i *)out, tmp1); + + return; +} + +/*****************************************************************************/ +/* */ +/* 
Function Name : impeg2_mc_halfx_fully_8x8_sse42() */ +/* */ +/* Description : Gets the buffer from (0.5,0) to (8.5,8) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. */ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) and (1,0) position in the ref frame */ +/* Interpolate these two values to get the value at(0.5,0) */ +/* Repeat this to get an 8 x 8 block using 9 x 8 block from */ +/* reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/*****************************************************************************/ +void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out, + UWORD8 *ref, + UWORD32 ref_wid, + UWORD32 out_wid) +{ + UWORD8 *ref_p0,*ref_p1; + __m128i src_r0, src_r0_1, src_r1, src_r1_1; + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 Q P1 + */ + + ref_p0 = ref; + ref_p1 = ref + 1; + + // Row 0 and 1 + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); + src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 1 + src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); + + src_r0 = _mm_avg_epu8(src_r0, src_r0_1); + src_r1 = _mm_avg_epu8(src_r1, src_r1_1); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + + // Row 2 and 3 + ref_p0 += 2*ref_wid; + ref_p1 += 2*ref_wid; + out += 2*out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 2 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); + src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 
+ ref_wid)); //Row 3 + src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); + + src_r0 = _mm_avg_epu8(src_r0, src_r0_1); + src_r1 = _mm_avg_epu8(src_r1, src_r1_1); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + + // Row 4 and 5 + ref_p0 += 2*ref_wid; + ref_p1 += 2*ref_wid; + out += 2*out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 4 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); + src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 5 + src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); + + src_r0 = _mm_avg_epu8(src_r0, src_r0_1); + src_r1 = _mm_avg_epu8(src_r1, src_r1_1); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + + // Row 6 and 7 + ref_p0 += 2*ref_wid; + ref_p1 += 2*ref_wid; + out += 2*out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 6 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); + src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 7 + src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); + + src_r0 = _mm_avg_epu8(src_r0, src_r0_1); + src_r1 = _mm_avg_epu8(src_r1, src_r1_1); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + + return; +} + + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_fullx_halfy_8x8_sse42() */ +/* */ +/* Description : Gets the buffer from (0,0.5) to (8,8.5) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. 
*/ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) and (0,1) position in the ref frame */ +/* Interpolate these two values to get the value at(0,0.5) */ +/* Repeat this to get an 8 x 8 block using 8 x 9 block from */ +/* reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/*****************************************************************************/ +void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out, + UWORD8 *ref, + UWORD32 ref_wid, + UWORD32 out_wid) +{ + __m128i src_r0, src_r1, src_r2, temp0, temp1; + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 + x + P1 + */ + src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 0 + src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 1 + src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); //Row 2 + temp0 = _mm_avg_epu8(src_r0, src_r1); + temp1 = _mm_avg_epu8(src_r1, src_r2); + _mm_storel_epi64((__m128i *)out, temp0); //Row 0 + _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 1 + + ref+= 3*ref_wid; + out+= 2*out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 3 + src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 4 + temp0 = _mm_avg_epu8(src_r2, src_r0); + temp1 = _mm_avg_epu8(src_r0, src_r1); + _mm_storel_epi64((__m128i *)out, temp0); //Row 2 + _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 3 + + ref += 2*ref_wid; + out+= 2*out_wid; + + src_r2 = _mm_loadl_epi64((__m128i *)ref); //Row 5 + src_r0 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 6 + temp0 = _mm_avg_epu8(src_r1, src_r2); + temp1 = _mm_avg_epu8(src_r2, src_r0); + _mm_storel_epi64((__m128i *)out, temp0); //Row 4 + _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 5 + + 
ref += 2*ref_wid; + out+= 2*out_wid; + + src_r1 = _mm_loadl_epi64((__m128i *)ref); //Row 7 + src_r2 = _mm_loadl_epi64((__m128i *) (ref + ref_wid)); //Row 8 + temp0 = _mm_avg_epu8(src_r0, src_r1); + temp1 = _mm_avg_epu8(src_r1, src_r2); + _mm_storel_epi64((__m128i *)out, temp0); //Row 6 + _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 7 + + return; +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_fullx_fully_8x8_sse42() */ +/* */ +/* Description : Gets the buffer from (x,y) to (x+8,y+8) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. */ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) position in the ref frame */ +/* Get an 8 x 8 block from reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/*****************************************************************************/ +void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out, + UWORD8 *ref, + UWORD32 ref_wid, + UWORD32 out_wid) +{ + __m128i src_r0, src_r1, src_r2, src_r3; + // Row 0-3 + src_r0 = _mm_loadl_epi64((__m128i *)ref); + src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); + src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); + src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); + _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); + + // Row 4-7 + ref += 4 * ref_wid; + out += 4 * out_wid; + + src_r0 = 
_mm_loadl_epi64((__m128i *)ref); + src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); + src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); + src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); + _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); + return; +} diff --git a/common/x86/impeg2_mem_func_sse42_intr.c b/common/x86/impeg2_mem_func_sse42_intr.c new file mode 100644 index 0000000..de7de8f --- /dev/null +++ b/common/x86/impeg2_mem_func_sse42_intr.c @@ -0,0 +1,100 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * impeg2_mem_func_sse42_intr.c + * + * @brief + * Contains utility function definitions for MPEG2 codec + * + * @author + * Mohit [100664] + * +* @par List of Functions: +* - impeg2_memset0_16bit_8x8_linear_block_sse42() +* - impeg2_memset_8bit_8x8_block_sse42() + * + * @remarks + * None + * + ******************************************************************************* + */ +#include <stdio.h> +#include <string.h> +#include "iv_datatypedef.h" +#include "impeg2_defs.h" + +#include <immintrin.h> +#include <emmintrin.h> +#include <smmintrin.h> +#include <tmmintrin.h> + +/******************************************************************************* +* Function Name : impeg2_memset0_16bit_8x8_linear_block +* +* Description : memsets resudial buf to 0 +* +* Arguments : destination buffer +* +* Values Returned : None +*******************************************************************************/ + + +void impeg2_memset0_16bit_8x8_linear_block_sse42 (WORD16 *buf) + { + __m128i zero_8x8_16b = _mm_set1_epi16(0); + _mm_storeu_si128((__m128i *) buf, zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 8), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 16), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 24), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 32), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 40), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 48), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 56), zero_8x8_16b); +} + + + +/******************************************************************************* +* Function Name : impeg2_memset_8bit_8x8_block +* +* Description : memsets residual buf to value +* +* Arguments : destination buffer, value and stride +* +* Values Returned : None +*******************************************************************************/ + + +void 
impeg2_memset_8bit_8x8_block_sse42(UWORD8 *dst, WORD32 dc_val, WORD32 dst_wd) +{ + __m128i value = _mm_set1_epi8((WORD8)dc_val); + + _mm_storel_epi64((__m128i *)dst, value); + _mm_storel_epi64((__m128i *) (dst + dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 2 * dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 3 * dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 4 * dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 5 * dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 6 * dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 7 * dst_wd), value); +} diff --git a/common/x86/impeg2_platform_macros.h b/common/x86/impeg2_platform_macros.h new file mode 100644 index 0000000..05ff6da --- /dev/null +++ b/common/x86/impeg2_platform_macros.h @@ -0,0 +1,49 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +#ifndef __IMPEG2_PLATFORM_MACROS_H__ +#define __IMPEG2_PLATFORM_MACROS_H__ + + +#define CONV_LE_TO_BE(u4_temp2,u4_temp1) u4_temp2 = (u4_temp1 << 24) | \ + ((u4_temp1 & 0xff00) << 8) | \ + ((u4_temp1 & 0xff0000) >> 8) | \ + (u4_temp1 >> 24); +static __inline UWORD32 CLZ(UWORD32 u4_word) +{ + if(u4_word) + return (__builtin_clz(u4_word)); + else + return 32; +} + + +#define CLIP_U8(x) ((x) > 255) ? (255) : (((x) < 0) ? (0) : (x)) +#define CLIP_S8(x) ((x) > 127) ? (127) : (((x) < -128) ? (-128) : (x)) + +#define CLIP_U12(x) ((x) > 4095) ? (4095) : (((x) < 0) ? (0) : (x)) +#define CLIP_S12(x) ((x) > 2047) ? (2047) : (((x) < -2048) ? (-2048) : (x)) + +#define CLIP_U16(x) ((x) > 65535) ? (65535) : (((x) < 0) ? (0) : (x)) +#define CLIP_S16(x) ((x) > 65535) ? (65535) : (((x) < -65536) ? (-65536) : (x)) +#define PLD(x) + +#define INLINE + +#endif /* __IMPEG2_PLATFORM_MACROS_H__ */ |