author    Hamsalekha S <hamsalekha.s@ittiam.com>    2015-03-13 21:24:58 +0530
committer Hamsalekha S <hamsalekha.s@ittiam.com>    2015-04-02 15:59:02 +0530
commit    8d3d303c7942ced6a987a52db8977d768dc3605f (patch)
tree      cc806c96794356996b13ba9970941d0aed74a97e /common/arm
parent    3956d913d37327dcb340f836e604b04bd478b158 (diff)
download  android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.gz
          android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.bz2
          android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.zip
Initial version
Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
Diffstat (limited to 'common/arm')
-rwxr-xr-x  common/arm/ih264_arm_memory_barrier.s  77
-rwxr-xr-x  common/arm/ih264_deblk_chroma_a9.s  1337
-rwxr-xr-x  common/arm/ih264_deblk_luma_a9.s  1092
-rwxr-xr-x  common/arm/ih264_default_weighted_pred_a9q.s  359
-rwxr-xr-x  common/arm/ih264_ihadamard_scaling_a9.s  250
-rwxr-xr-x  common/arm/ih264_inter_pred_chroma_a9q.s  254
-rwxr-xr-x  common/arm/ih264_inter_pred_filters_luma_horz_a9q.s  245
-rwxr-xr-x  common/arm/ih264_inter_pred_filters_luma_vert_a9q.s  301
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_bilinear_a9q.s  398
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_copy_a9q.s  253
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s  441
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s  1044
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s  266
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s  505
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s  355
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s  330
-rwxr-xr-x  common/arm/ih264_intra_pred_chroma_a9q.s  551
-rwxr-xr-x  common/arm/ih264_intra_pred_luma_16x16_a9q.s  520
-rwxr-xr-x  common/arm/ih264_intra_pred_luma_4x4_a9q.s  842
-rwxr-xr-x  common/arm/ih264_intra_pred_luma_8x8_a9q.s  1037
-rwxr-xr-x  common/arm/ih264_iquant_itrans_recon_a9.s  871
-rwxr-xr-x  common/arm/ih264_iquant_itrans_recon_dc_a9.s  399
-rwxr-xr-x  common/arm/ih264_itrans_recon_a9.s  216
-rwxr-xr-x  common/arm/ih264_mem_fns_neon.s  268
-rwxr-xr-x  common/arm/ih264_padding_neon.s  646
-rwxr-xr-x  common/arm/ih264_platform_macros.h  152
-rwxr-xr-x  common/arm/ih264_resi_trans_a9.s  604
-rwxr-xr-x  common/arm/ih264_resi_trans_quant_a9.s  694
-rwxr-xr-x  common/arm/ih264_weighted_bi_pred_a9q.s  642
-rwxr-xr-x  common/arm/ih264_weighted_pred_a9q.s  479
30 files changed, 15428 insertions, 0 deletions
diff --git a/common/arm/ih264_arm_memory_barrier.s b/common/arm/ih264_arm_memory_barrier.s
new file mode 100755
index 0000000..523218f
--- /dev/null
+++ b/common/arm/ih264_arm_memory_barrier.s
@@ -0,0 +1,77 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@*******************************************************************************
+@* @file
+@* ih264_arm_memory_barrier.s
+@*
+@* @brief
+@* Contains function definitions for data synchronization.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
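+@ A brief note on the two routines below: each simply issues the corresponding
+@ ARM barrier instruction (DSB or DMB) and returns, so that C code can request
+@ a data synchronization or data memory barrier through a plain function call.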
+
+.text
+.p2align 2
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_arm_dsb
+@* Description : Adds DSB
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 03 07 2008 100355 First version
+@*
+@*****************************************************************************
+
+ .global ih264_arm_dsb
+ih264_arm_dsb:
+ dsb
+ bx lr
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_arm_dmb
+@* Description : Adds DMB
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 03 07 2008 100355 First version
+@*
+@*****************************************************************************
+
+ .global ih264_arm_dmb
+
+ih264_arm_dmb:
+ dmb
+ bx lr
+
+
+
diff --git a/common/arm/ih264_deblk_chroma_a9.s b/common/arm/ih264_deblk_chroma_a9.s
new file mode 100755
index 0000000..66102a7
--- /dev/null
+++ b/common/arm/ih264_deblk_chroma_a9.s
@@ -0,0 +1,1337 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/*****************************************************************************/
+@/* */
+@/* File Name : ih264_deblk_chroma_a9.s */
+@/* */
+@/* Description : Contains function definitions for deblocking chroma */
+@/* edge. Functions are coded in NEON assembly and can */
+@/* be compiled using ARM RVDS. */
+@/* */
+@/* List of Functions : ih264_deblk_chroma_vert_bs4_bp_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_bp_a9() */
+@/* ih264_deblk_chroma_horz_bs4_bp_a9() */
+@/* ih264_deblk_chroma_horz_bslt4_bp_a9() */
+@/* ih264_deblk_chroma_vert_bs4_mbaff_bp_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9() */
+@/* ih264_deblk_chroma_vert_bs4_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_a9() */
+@/* ih264_deblk_chroma_horz_bs4_a9() */
+@/* ih264_deblk_chroma_horz_bslt4_a9() */
+@/* ih264_deblk_chroma_vert_bs4_mbaff_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_mbaff_a9() */
+@/* */
+@/* Issues / Problems : None */
+@/* */
+@/* Revision History : */
+@/* */
+@/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+@/* 28 11 2013 Ittiam Draft */
+@/* 05 01 2015 Kaushik Added double-call functions for */
+@/* Senthoor vertical deblocking, and high */
+@/* profile functions. */
+@/* */
+@/*****************************************************************************/
+
+
+.text
+.p2align 2
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
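+@ A scalar sketch of the bS=4 chroma filter implemented below, pieced together
+@ from the in-line comments (pixel names follow the parameter docs above):
+@     if (|p0 - q0| < alpha && |q1 - q0| < beta && |p1 - p0| < beta) {
+@         p0' = (2*p1 + p0 + q1 + 2) >> 2
+@         q0' = (2*q1 + q0 + p1 + 2) >> 2
+@     }
+@ The NEON code evaluates this for the U and V samples of each row together.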
+
+ .global ih264_deblk_chroma_horz_bs4_bp_a9
+
+ih264_deblk_chroma_horz_bs4_bp_a9:
+
+ stmfd sp!, {r4, lr} @
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma
+ vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v
+ mov r4, r0 @Keeping a backup of the pointer p0 of chroma
+ vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v
+ vdup.8 q10, r2 @Q10 contains alpha
+ vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v
+ vaddl.u8 q4, d6, d0 @
+ vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1
+ vmov.i8 d31, #2 @
+ vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vmlal.u8 q4, d2, d31 @
+ vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U)
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vaddl.u8 q7, d4, d2 @
+ vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1
+ vdup.8 q8, r3 @Q8 contains beta
+ vmlal.u8 q7, d6, d31 @
+ vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vrshrn.u16 d8, q4, #2 @
+ vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vrshrn.u16 d10, q7, #2 @
+ vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vbit q5, q2, q9 @
+ vbit q4, q0, q9 @
+ vst2.8 {d10, d11}, [r4], r1 @
+ vst2.8 {d8, d9}, [r4] @
+ vpop {d8 - d15}
+ ldmfd sp!, {r4, pc} @
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bs4_bp_a9
+
+ih264_deblk_chroma_vert_bs4_bp_a9:
+
+ stmfd sp!, {r12, r14}
+ vpush {d8 - d15}
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+ vdup.8 q11, r2 @Q11 = alpha
+ vdup.8 q12, r3 @Q12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vaddl.u8 q7, d2, d6
+ vaddl.u8 q8, d3, d7 @(p0 + q1)
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vmlal.u8 q7, d0, d31
+ vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q9, d0, d4
+ vaddl.u8 q10, d1, d5 @(p1 + q0)
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q9, d6, d31
+ vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d14, q7, #2
+ vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d18, q9, #2
+ vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit q1, q7, q4
+ vbit q2, q9, q4
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
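+@ A scalar sketch of the bS<4 chroma filter implemented below, pieced together
+@ from the in-line comments (tC0 is read from pu1_cliptab using u4_bs):
+@     C     = tC0 + 1
+@     delta = clip3(-C, C, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+@     p0'   = clip_u8(p0 + delta)
+@     q0'   = clip_u8(q0 - delta)
+@ applied only where bS > 0 and |p0-q0| < alpha, |q1-q0| < beta, |p1-p0| < beta.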
+
+ .global ih264_deblk_chroma_horz_bslt4_bp_a9
+
+ih264_deblk_chroma_horz_bslt4_bp_a9:
+
+ stmfd sp!, {r4-r6, lr} @
+
+ ldrd r4, r5, [sp, #0x10] @r4 = u4_bs , r5 = pu1_cliptab
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p1 of chroma U
+ rev r4, r4 @
+ vmov.32 d12[0], r4 @d12[0] = ui_Bs
+ vld1.32 d16[0], [r5] @D16[0] contains cliptab
+ vld2.8 {d6, d7}, [r0], r1 @Q3=p1
+ vtbl.8 d14, {d16}, d12 @
+ vmovl.u8 q6, d12 @q6 = uc_Bs in each 16 bit scalar
+ mov r6, r0 @Keeping a backup of the pointer to chroma U P0
+ vld2.8 {d4, d5}, [r0], r1 @Q2=p0
+ vmov.i8 d30, #1 @
+ vdup.8 q10, r2 @Q10 contains alpha
+ vld2.8 {d0, d1}, [r0], r1 @Q0=q0
+ vmovl.u8 q7, d14 @
+ vld2.8 {d2, d3}, [r0] @Q1=q1
+ vsubl.u8 q5, d1, d5 @
+ vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0)
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2
+ vsli.16 q7, q7, #8 @
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L
+ vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H
+ vdup.8 q8, r3 @Q8 contains beta
+ vadd.i16 q4, q4, q10 @
+ vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcgt.s16 d12, d12, #0 @Q6 = (us_Bs > 0)
+ vqrshrn.s16 d8, q4, #3 @
+ vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
+ vadd.i8 d14, d14, d30 @Q7 = C = C0+1
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vabs.s8 q3, q4 @Q4 = ABS (i_macro)
+ vmov.i8 d15, d14 @
+ vmov.i8 d13, d12 @
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
+ vbic q6, q6, q9 @final condition
+ vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0)
+ vand q7, q7, q6 @Making delta zero in places where values shouldn't be filtered
+ vqadd.u8 q8, q2, q7 @Q8 = p0 + delta
+ vqsub.u8 q2, q2, q7 @Q2 = p0 - delta
+ vqadd.u8 q9, q0, q7 @Q9 = q0 + delta
+ vqsub.u8 q0, q0, q7 @Q0 = q0 - delta
+ vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
+ vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
+ vst2.8 {d16, d17}, [r6], r1 @
+ vst2.8 {d0, d1}, [r6] @
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r6, pc} @
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bslt4_bp_a9
+
+ih264_deblk_chroma_vert_bslt4_bp_a9:
+
+ stmfd sp!, {r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ ldr r11, [sp, #16] @r11 = ui_Bs
+
+ ldr r10, [sp, #20] @r10 = puc_ClipTab
+ mov r12, r0 @keep a back up of r0 for buffer write
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+
+ vdup.8 q11, r2 @Q11 = alpha
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vdup.8 q12, r3 @Q12 = beta
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vsubl.u8 q7, d0, d6
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vsubl.u8 q8, d1, d7 @(p1 - q1)
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vsubl.u8 q9, d4, d2
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q10, d5, d3 @(q0 - p0)
+ vmov.u16 q14, #4
+ vld1.32 {d24[0]}, [r10] @Load ClipTable
+ rev r11, r11 @Blocking strengths
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+
+ vmov.32 d10[0], r11
+
+ vmla.s16 q7, q9, q14
+ vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1)
+
+ vmovl.u8 q5, d10
+
+
+ vsli.u16 d10, d10, #8
+ vmovl.u16 q5, d10
+ vsli.u32 q5, q5, #16
+ vtbl.8 d12, {d24}, d10
+ vtbl.8 d13, {d24}, d11 @tC0
+ vmov.u8 q12, #1
+ vadd.u8 q6, q6, q12 @tC0 + 1
+ vcge.u8 q5, q5, q12 @u4_bS > 0 ?
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ Q0 - Q3(inputs),
+ @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ Q6 (tC)
+
+ vrshr.s16 q7, q7, #3
+ vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q9, q7, #0
+ vcgt.s16 q10, q8, #0
+ vmovn.i16 d18, q9
+ vmovn.i16 d19, q10 @Q9 = sign(delta)
+ vabs.s16 q7, q7
+ vabs.s16 q8, q8
+ vmovn.u16 d14, q7
+ vmovn.u16 d15, q8
+ vmin.u8 q7, q7, q6 @Q7 = |delta|
+
+ vqadd.u8 q10, q1, q7 @p0+|delta|
+ vqadd.u8 q11, q2, q7 @q0+|delta|
+ vqsub.u8 q12, q1, q7 @p0-|delta|
+ vqsub.u8 q13, q2, q7 @q0-|delta|
+
+ vbit q12, q10, q9 @p0 + delta
+ vbit q11, q13, q9 @q0 - delta
+
+ vbit q1, q12, q4
+ vbit q2, q11, q4
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r10-r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4 on calling twice
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
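+@ As the brief above notes, this _mbaff variant is called twice per edge: it
+@ loads and filters only four rows per call (one vld4/vst4 group) instead of
+@ the eight rows handled by ih264_deblk_chroma_vert_bs4_bp_a9.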
+
+ .global ih264_deblk_chroma_vert_bs4_mbaff_bp_a9
+
+ih264_deblk_chroma_vert_bs4_mbaff_bp_a9:
+
+ stmfd sp!, {r12, r14}
+ vpush {d8 - d15}
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.8 d11, r2 @D11 = alpha
+ vdup.8 d12, r3 @D12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vaddl.u8 q14, d1, d3 @(p0 + q1)
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q13, d0, d2 @(p1 + q0)
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit d1, d7, d4
+ vbit d2, d9, d4
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4 on calling twice
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9
+
+ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9:
+
+ stmfd sp!, {r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ ldr r11, [sp, #16] @r11 = ui_Bs
+
+ ldr r10, [sp, #20] @r10 = puc_ClipTab
+ mov r12, r0 @keep a back up of r0 for buffer write
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.8 d11, r2 @D11 = alpha
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vdup.8 d12, r3 @D12 = beta
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vsubl.u8 q14, d0, d3 @(p1 - q1)
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q12, d2, d1 @(q0 - p0)
+ vmov.u16 q10, #4
+
+ vld1.32 {d31[0]}, [r10] @Load ClipTable
+ rev r11, r11 @Blocking strengths
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vmov.32 d22[0], r11
+ vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1)
+ vmovl.u8 q11, d22
+ vsli.u16 d22, d22, #8
+ vtbl.8 d6, {d31}, d22 @tC0
+ vmov.u8 d12, #1
+ vadd.u8 d6, d6, d12 @tC0 + 1
+ vcge.u8 d5, d22, d12 @u4_bS > 0 ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ D0 - D3(inputs),
+ @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ D6 (tC)
+
+ vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q13, q14, #0
+ vmovn.i16 d9, q13 @D9 = sign(delta)
+ vabs.s16 q14, q14
+ vmovn.u16 d7, q14
+ vmin.u8 d7, d7, d6 @D7 = |delta|
+
+ vqadd.u8 d10, d1, d7 @p0+|delta|
+ vqadd.u8 d11, d2, d7 @q0+|delta|
+ vqsub.u8 d12, d1, d7 @p0-|delta|
+ vqsub.u8 d13, d2, d7 @q0-|delta|
+
+ vbit d12, d10, d9 @p0 + delta
+ vbit d11, d13, d9 @q0 - delta
+
+ vbit d1, d12, d4
+ vbit d2, d11, d4
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r10-r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge when the
+@* boundary strength is set to 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_horz_bs4_a9
+
+ih264_deblk_chroma_horz_bs4_a9:
+
+ stmfd sp!, {r4-r6, lr} @
+
+ ldr r5, [sp, #16] @R5 = alpha_cr
+ ldr r6, [sp, #20] @R6 = beta_cr
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma
+ vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v
+ mov r4, r0 @Keeping a backup of the pointer p0 of chroma
+ vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v
+ vdup.8 d20, r2 @D20 contains alpha_cb
+ vdup.8 d21, r5 @D21 contains alpha_cr
+ vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v
+ vaddl.u8 q4, d6, d0 @
+ vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1
+ vmov.i8 d31, #2 @
+ vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vmlal.u8 q4, d2, d31 @
+ vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U)
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vaddl.u8 q7, d4, d2 @
+ vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1
+ vdup.8 d16, r3 @D16 contains beta_cb
+ vdup.8 d17, r6 @D17 contains beta_cr
+ vmlal.u8 q7, d6, d31 @
+ vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vrshrn.u16 d8, q4, #2 @
+ vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vrshrn.u16 d10, q7, #2 @
+ vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vbit q5, q2, q9 @
+ vbit q4, q0, q9 @
+ vst2.8 {d10, d11}, [r4], r1 @
+ vst2.8 {d8, d9}, [r4] @
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r6, pc} @
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
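+@ For this high-profile vertical routine the U and V thresholds are packed
+@ into single registers (r2 = (alpha_cr << 8) | alpha_cb, r3 likewise for
+@ beta) and replicated with vdup.16, so even byte lanes carry the Cb value
+@ and odd byte lanes the Cr value of the interleaved chroma samples.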
+
+ .global ih264_deblk_chroma_vert_bs4_a9
+
+ih264_deblk_chroma_vert_bs4_a9:
+
+ stmfd sp!, {r4, r5, r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ ldr r4, [sp, #16] @r4 = alpha_cr
+ ldr r5, [sp, #20] @r5 = beta_cr
+ add r2, r2, r4, lsl #8 @r2 = (alpha_cr,alpha_cb)
+ add r3, r3, r5, lsl #8 @r3 = (beta_cr,beta_cb)
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+ vdup.16 q11, r2 @Q11 = alpha
+ vdup.16 q12, r3 @Q12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vaddl.u8 q7, d2, d6
+ vaddl.u8 q8, d3, d7 @(p0 + q1)
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vmlal.u8 q7, d0, d31
+ vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q9, d0, d4
+ vaddl.u8 q10, d1, d5 @(p1 + q0)
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q9, d6, d31
+ vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d14, q7, #2
+ vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d18, q9, #2
+ vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit q1, q7, q4
+ vbit q2, q9, q4
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4, r5, r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge for cases where the
+@* boundary strength is less than 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @param[in] sp(8) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(12) - pu1_cliptab_cb
+@* tc0_table for U
+@*
+@* @param[in] sp(16) - pu1_cliptab_cr
+@* tc0_table for V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_horz_bslt4_a9
+
+ih264_deblk_chroma_horz_bslt4_a9:
+
+ stmfd sp!, {r4-r9, lr} @
+
+ ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr
+ ldr r7, [sp, #36] @R7 = u4_bs
+ ldrd r8, r9, [sp, #40] @R8 = pu1_cliptab_cb , R9 = pu1_cliptab_cr
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p1 of chroma U
+ vpush {d8 - d15}
+ rev r7, r7 @
+ vmov.32 d12[0], r7 @D12[0] = ui_Bs
+
+ vld1.32 d16[0], [r8] @D16[0] contains cliptab_cb
+ vld1.32 d17[0], [r9] @D17[0] contains cliptab_cr
+ vld2.8 {d6, d7}, [r0], r1 @Q3=p1
+ vtbl.8 d14, {d16}, d12 @Retrieving cliptab values for U
+ vtbl.8 d28, {d17}, d12 @Retrieving cliptab values for V
+ vmovl.u8 q6, d12 @Q6 = uc_Bs in each 16 bit scalar
+ mov r6, r0 @Keeping a backup of the pointer to chroma U P0
+ vld2.8 {d4, d5}, [r0], r1 @Q2=p0
+ vmov.i8 d30, #1 @
+ vdup.8 d20, r2 @D20 contains alpha_cb
+ vdup.8 d21, r4 @D21 contains alpha_cr
+ vld2.8 {d0, d1}, [r0], r1 @Q0=q0
+ vmovl.u8 q7, d14 @
+ vmovl.u8 q14, d28 @
+ vmov.i16 d15, d28 @D14 has cliptab values for U, D15 for V
+ vld2.8 {d2, d3}, [r0] @Q1=q1
+ vsubl.u8 q5, d1, d5 @
+ vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0)
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2
+ vsli.16 q7, q7, #8 @
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L
+ vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H
+ vdup.8 d16, r3 @Q8 contains beta_cb
+ vdup.8 d17, r5 @Q8 contains beta_cr
+ vadd.i16 q4, q4, q10 @
+ vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcgt.s16 d12, d12, #0 @Q6 = (us_Bs > 0)
+ vqrshrn.s16 d8, q4, #3 @
+ vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
+ vadd.i8 d14, d14, d30 @D14 = C = C0+1 for U
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vabs.s8 q3, q4 @Q4 = ABS (i_macro)
+ vadd.i8 d15, d15, d30 @D15 = C = C0+1 for V
+ vmov.i8 d13, d12 @
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
+ vbic q6, q6, q9 @final condition
+ vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0)
+ vand q7, q7, q6 @Making delta zero in places where values shouldn't be filtered
+ vqadd.u8 q8, q2, q7 @Q8 = p0 + delta
+ vqsub.u8 q2, q2, q7 @Q2 = p0 - delta
+ vqadd.u8 q9, q0, q7 @Q9 = q0 + delta
+ vqsub.u8 q0, q0, q7 @Q0 = q0 - delta
+ vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
+ vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
+ vst2.8 {d16, d17}, [r6], r1 @
+ vst2.8 {d0, d1}, [r6] @
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r9, pc} @
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @param[in] sp(8) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(12) - pu1_cliptab_cb
+@* tc0_table for U
+@*
+@* @param[in] sp(16) - pu1_cliptab_cr
+@* tc0_table for V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
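+@ Unlike the base-profile routine, separate tC0 tables are supplied for Cb and
+@ Cr: both are looked up with the same packed boundary strengths and the two
+@ results are interleaved (vzip) so that every byte lane uses the clip value
+@ of its own plane in the common delta computation below.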
+
+ .global ih264_deblk_chroma_vert_bslt4_a9
+
+ih264_deblk_chroma_vert_bslt4_a9:
+
+ stmfd sp!, {r4-r7, r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ ldrd r4, r5, [sp, #32] @R4 = alpha_cr , R5 = beta_cr
+ add r2, r2, r4, lsl #8
+ add r3, r3, r5, lsl #8
+ ldr r6, [sp, #40] @R6 = u4_bs
+ ldrd r10, r11, [sp, #44] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
+ vpush {d8 - d15}
+ mov r12, r0 @keep a back up of R0 for buffer write
+
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+
+ vdup.16 q11, r2 @Q11 = alpha
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vdup.16 q12, r3 @Q12 = beta
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vsubl.u8 q7, d0, d6
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vsubl.u8 q8, d1, d7 @(p1 - q1)
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vsubl.u8 q9, d4, d2
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q10, d5, d3 @(q0 - p0)
+ vmov.u16 q14, #4
+ vld1.32 {d24[0]}, [r10] @Load ClipTable for U
+ vld1.32 {d25[0]}, [r11] @Load ClipTable for V
+ rev r6, r6 @Blocking strengths
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+
+ vmov.32 d10[0], r6
+
+ vmla.s16 q7, q9, q14
+ vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1)
+
+ vmovl.u8 q5, d10
+ vsli.u16 d10, d10, #8
+ vtbl.8 d12, {d24}, d10 @tC0 for U
+ vtbl.8 d13, {d25}, d10 @tC0 for V
+ vzip.8 d12, d13
+ vmovl.u16 q5, d10
+ vsli.u32 q5, q5, #16
+ vmov.u8 q12, #1
+ vadd.u8 q6, q6, q12 @tC0 + 1
+ vcge.u8 q5, q5, q12 @u4_bS > 0 ?
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ Q0 - Q3(inputs),
+ @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ Q6 (tC)
+
+ vrshr.s16 q7, q7, #3
+ vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q9, q7, #0
+ vcgt.s16 q10, q8, #0
+ vmovn.i16 d18, q9
+ vmovn.i16 d19, q10 @Q9 = sign(delta)
+ vabs.s16 q7, q7
+ vabs.s16 q8, q8
+ vmovn.u16 d14, q7
+ vmovn.u16 d15, q8
+ vmin.u8 q7, q7, q6 @Q7 = |delta|
+
+ vqadd.u8 q10, q1, q7 @p0+|delta|
+ vqadd.u8 q11, q2, q7 @q0+|delta|
+ vqsub.u8 q12, q1, q7 @p0-|delta|
+ vqsub.u8 q13, q2, q7 @q0-|delta|
+
+ vbit q12, q10, q9 @p0 + delta
+ vbit q11, q13, q9 @q0 - delta
+
+ vbit q1, q12, q4
+ vbit q2, q11, q4
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r7, r10-r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4 on calling twice in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bs4_mbaff_a9
+
+ih264_deblk_chroma_vert_bs4_mbaff_a9:
+
+ stmfd sp!, {r4, r5, r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+ ldrd r4, r5, [sp, #16] @R4 = alpha_cr , R5 = beta_cr
+ add r2, r2, r4, lsl #8
+ add r3, r3, r5, lsl #8
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.16 d11, r2 @D11 = alpha
+ vdup.16 d12, r3 @D12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vaddl.u8 q14, d1, d3 @(p0 + q1)
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q13, d0, d2 @(p1 + q0)
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit d1, d7, d4
+ vbit d2, d9, d4
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4, r5, r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4 on calling twice in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @param[in] sp(8) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(12) - pu1_cliptab_cb
+@* tc0_table for U
+@*
+@* @param[in] sp(16) - pu1_cliptab_cr
+@* tc0_table for V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bslt4_mbaff_a9
+
+ih264_deblk_chroma_vert_bslt4_mbaff_a9:
+
+ stmfd sp!, {r4-r6, r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr
+ add r2, r2, r4, lsl #8
+ add r3, r3, r5, lsl #8
+ ldr r6, [sp, #36] @R6 = u4_bs
+ ldrd r10, r11, [sp, #40] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.16 d11, r2 @D11 = alpha
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vdup.16 d12, r3 @D12 = beta
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vsubl.u8 q14, d0, d3 @(p1 - q1)
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q12, d2, d1 @(q0 - p0)
+ vmov.u16 q10, #4
+
+ vld1.32 {d31[1]}, [r10] @Load ClipTable for U
+ vld1.32 {d31[0]}, [r11] @Load ClipTable for V
+ rev r6, r6 @Blocking strengths
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vmov.32 d22[0], r6
+ vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1)
+ vmovl.u8 q11, d22
+ vsli.u16 d22, d22, #8
+ vmov.u16 d13, #4
+ vadd.u8 d22, d22, d13
+ vtbl.8 d6, {d31}, d22 @tC0
+ vmov.u8 d12, #1
+ vsub.u8 d22, d22, d13
+ vadd.u8 d6, d6, d12 @tC0 + 1
+ vcge.u8 d5, d22, d12 @u4_bS > 0 ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ D0 - D3(inputs),
+ @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ D6 (tC)
+
+ vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q13, q14, #0
+ vmovn.i16 d9, q13 @D9 = sign(delta)
+ vabs.s16 q14, q14
+ vmovn.u16 d7, q14
+ vmin.u8 d7, d7, d6 @D7 = |delta|
+
+ vqadd.u8 d10, d1, d7 @p0+|delta|
+ vqadd.u8 d11, d2, d7 @q0+|delta|
+ vqsub.u8 d12, d1, d7 @p0-|delta|
+ vqsub.u8 d13, d2, d7 @q0-|delta|
+
+ vbit d12, d10, d9 @p0 + delta
+ vbit d11, d13, d9 @q0 - delta
+
+ vbit d1, d12, d4
+ vbit d2, d11, d4
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r6, r10-r12, pc}
+
+
+
diff --git a/common/arm/ih264_deblk_luma_a9.s b/common/arm/ih264_deblk_luma_a9.s
new file mode 100755
index 0000000..3e6a4d9
--- /dev/null
+++ b/common/arm/ih264_deblk_luma_a9.s
@@ -0,0 +1,1092 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/*****************************************************************************/
+@/* */
+@/* File Name : ih264_deblk_luma_a9.s */
+@/* */
+@/* Description : Contains function definitions for deblocking luma */
+@/* edge. Functions are coded in NEON assembly and can */
+@/* be compiled using ARM RVDS. */
+@/* */
+@/* List of Functions : ih264_deblk_luma_vert_bs4_a9() */
+@/* ih264_deblk_luma_vert_bslt4_a9() */
+@/* ih264_deblk_luma_horz_bs4_a9() */
+@/* ih264_deblk_luma_horz_bslt4_a9() */
+@/* ih264_deblk_luma_vert_bs4_mbaff_a9() */
+@/* ih264_deblk_luma_vert_bslt4_mbaff_a9() */
+@/* */
+@/* Issues / Problems : None */
+@/* */
+@/* Revision History : */
+@/* */
+@/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+@/* 28 11 2013 Ittiam Draft */
+@/* 05 01 2015 Kaushik Added double-call functions for */
+@/* Senthoor vertical deblocking. */
+@/* */
+@/*****************************************************************************/
+
+
+.text
+.p2align 2
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block horizontal edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
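+@ A scalar sketch of the bS<4 luma filter implemented below, pieced together
+@ from the in-line comments (C0 = tC0 from pu1_cliptab, Ap = |p2 - p0|,
+@ Aq = |q2 - q0|):
+@     C     = C0 + (Ap < beta) + (Aq < beta)
+@     delta = clip3(-C, C, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+@     p0'   = clip_u8(p0 + delta)
+@     q0'   = clip_u8(q0 - delta)
+@     if (Ap < beta) p1' = p1 + clip3(-C0, C0, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1)
+@     if (Aq < beta) q1' = q1 + clip3(-C0, C0, (q2 + ((p0 + q0 + 1) >> 1) - 2*q1) >> 1)
+@ applied only where bS > 0 and |p0-q0| < alpha, |q1-q0| < beta, |p1-p0| < beta.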
+
+ .global ih264_deblk_luma_horz_bslt4_a9
+
+ih264_deblk_luma_horz_bslt4_a9:
+
+ stmfd sp!, {r4-r7, lr}
+
+ ldrd r4, r5, [sp, #0x14] @r4 = ui_Bs , r5 = *puc_ClpTab
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @r0 pointer to p1
+ sub r0, r0, r1 @r0 pointer to p2
+ rev r4, r4 @
+ vld1.8 {q5}, [r0], r1 @p2 values are loaded into q5
+ vmov.32 d12[0], r4 @d12[0] = ui_Bs
+ mov r6, r0 @keeping backup of pointer to p1
+ vld1.8 {q4}, [r0], r1 @p1 values are loaded into q4
+ mov r7, r0 @keeping backup of pointer to p0
+ vld1.8 {q3}, [r0], r1 @p0 values are loaded into q3
+ vmovl.u8 q6, d12 @q6 = uc_Bs in each 16 bit scalar
+ vld1.8 {q0}, [r0], r1 @q0 values are loaded into q0
+ vabd.u8 q13, q4, q3 @Q13 = ABS(p1 - p0)
+ vld1.8 {q1}, [r0], r1 @q1 values are loaded into q1
+ vabd.u8 q11, q3, q0 @Q11 = ABS(p0 - q0)
+ vld1.32 d16[0], [r5] @D16[0] contains cliptab
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vld1.8 {q2}, [r0], r1 @q2 values are loaded into q2
+ vtbl.8 d14, {d16}, d12 @
+ vdup.8 q10, r2 @Q10 contains alpha
+ vdup.8 q8, r3 @Q8 contains beta
+ vmovl.u16 q6, d12 @
+ vmovl.u16 q7, d14 @
+ vabd.u8 q14, q5, q3 @Q14 = Ap = ABS(p2 - p0)
+ vabd.u8 q15, q2, q0 @Q15 = Aq = ABS(q2 - q0)
+ vcgt.s32 q6, q6, #0 @Q6 = (us_Bs > 0)
+ vsli.32 q7, q7, #8 @
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vcge.u8 q12, q12, q8 @Q12=( ABS(q1 - q0) >= Beta )
+ vcge.u8 q13, q13, q8 @Q13=( ABS(p1 - p0) >= Beta )
+ vcgt.u8 q10, q8, q14 @Q10=(Ap<Beta)
+ vcgt.u8 q11, q8, q15 @Q11=(Aq<Beta)
+ vsli.32 q7, q7, #16 @Q7 = C0
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vsubl.u8 q15, d1, d7 @
+ vsubl.u8 q12, d0, d6 @Q15,Q12 = (q0 - p0)
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vsubl.u8 q14, d8, d2 @Q14 = (p1 - q1)L
+ vshl.i16 q13, q15, #2 @Q13 = (q0 - p0)<<2
+ vshl.i16 q12, q12, #2 @Q12 = (q0 - p0)<<2
+ vsubl.u8 q15, d9, d3 @Q15 = (p1 - q1)H
+ vbic q6, q6, q9 @final condition
+ vadd.i16 q12, q12, q14 @
+ vadd.i16 q13, q13, q15 @Q13,Q12 = [ (q0 - p0)<<2 ] + (p1 - q1)
+ vsub.i8 q9, q7, q10 @Q9 = C0 + (Ap < Beta)
+ vrhadd.u8 q8, q3, q0 @Q8 = ((p0+q0+1) >> 1)
+ vqrshrn.s16 d24, q12, #3 @
+ vqrshrn.s16 d25, q13, #3 @Q12 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
+ vsub.i8 q9, q9, q11 @Q9 = C0 + (Ap < Beta) + (Aq < Beta)
+ vand.i8 q10, q10, q6 @
+ vand.i8 q11, q11, q6 @
+ vabs.s8 q13, q12 @Q13 = ABS (i_macro)
+ vaddl.u8 q14, d17, d11 @
+ vaddl.u8 q5, d16, d10 @Q14,Q5 = p2 + (p0+q0+1)>>1
+ vaddl.u8 q15, d17, d5 @
+ vmin.u8 q9, q13, q9 @Q9 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
+ vshll.u8 q13, d9, #1 @
+ vaddl.u8 q2, d16, d4 @Q15,Q2 = q2 + (p0+q0+1)>>1
+ vshll.u8 q8, d8, #1 @Q13,Q8 = (p1<<1)
+ vand q9, q9, q6 @Making delta zero in places where values shouldn't be filtered
+ vsub.i16 q14, q14, q13 @Q14,Q5 = [p2 + (p0+q0+1)>>1] - (p1<<1)
+ vsub.i16 q5, q5, q8 @
+ vshll.u8 q8, d2, #1 @
+ vshll.u8 q13, d3, #1 @Q13,Q8 = (q1<<1)
+ vqshrn.s16 d29, q14, #1 @
+ vqshrn.s16 d28, q5, #1 @Q14 = i_macro_p1
+ vsub.i16 q2, q2, q8 @
+ vsub.i16 q15, q15, q13 @Q15,Q2 = [q2 + (p0+q0+1)>>1] - (q1<<1)
+ vneg.s8 q13, q7 @Q13 = -C0
+ vmin.s8 q14, q14, q7 @Q14 = min(C0,i_macro_p1)
+ vcge.s8 q12, q12, #0 @Q12 = (i_macro >= 0)
+ vqshrn.s16 d31, q15, #1 @
+ vqshrn.s16 d30, q2, #1 @Q15 = i_macro_q1
+ vmax.s8 q14, q14, q13 @Q14 = max( - C0 , min(C0, i_macro_p1) )
+ vqadd.u8 q8, q3, q9 @Q8 = p0 + delta
+ vqsub.u8 q3, q3, q9 @Q3 = p0 - delta
+ vmin.s8 q15, q15, q7 @Q15 = min(C0,i_macro_q1)
+ vand.i8 q14, q10, q14 @condition check Ap<beta
+ vqadd.u8 q7, q0, q9 @Q7 = q0 + delta
+ vqsub.u8 q0, q0, q9 @Q0 = q0 - delta
+ vmax.s8 q15, q15, q13 @Q15 = max( - C0 , min(C0, i_macro_q1) )
+ vbif q8, q3, q12 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
+ vbif q0, q7, q12 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
+ vadd.i8 q14, q14, q4 @
+ vand.i8 q15, q11, q15 @condition check Aq<beta
+ vst1.8 {q8}, [r7], r1 @writing back filtered value of p0
+ vadd.i8 q15, q15, q1 @
+ vst1.8 {q0}, [r7], r1 @writing back filtered value of q0
+ vst1.8 {q14}, [r6] @writing back filtered value of p1
+ vst1.8 {q15}, [r7], r1 @writing back filtered value of q1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r7, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block horizontal edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
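+@ A scalar sketch of the bS=4 luma filter implemented below, pieced together
+@ from the in-line comments; the q side is shown, the p side is symmetric
+@ with Ap = |p2 - p0| in place of Aq = |q2 - q0|:
+@     if (Aq < beta && |p0 - q0| < ((alpha >> 2) + 2)) {
+@         q0' = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
+@         q1' = (p0 + q0 + q1 + q2 + 2) >> 2
+@         q2' = (p0 + q0 + q1 + 3*q2 + 2*q3 + 4) >> 3
+@     } else {
+@         q0' = (2*q1 + q0 + p1 + 2) >> 2
+@     }
+@ applied only where |p0-q0| < alpha, |q1-q0| < beta and |p1-p0| < beta.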
+
+ .global ih264_deblk_luma_horz_bs4_a9
+
+ih264_deblk_luma_horz_bs4_a9:
+
+ @ Back up necessary registers on stack
+ stmfd sp!, {r12, r14}
+ vpush {d8 - d15}
+ @ Init
+ vdup.8 q0, r2 @duplicate alpha
+ sub r12, r0, r1 @pointer to p0 = q0 - src_strd
+ vdup.8 q1, r3 @duplicate beta
+ sub r14, r0, r1, lsl#1 @pointer to p1 = q0 - src_strd*2
+ sub r2, r0, r1, lsl#2 @pointer to p3 = q0 - src_strd*4
+ sub r3, r14, r1 @pointer to p2 = p1 - src_strd
+
+ @ Load Data
+ vld1.8 {d4, d5}, [r0], r1 @load q0 to Q2, q0 = q0 + src_strd
+ vld1.8 {d6, d7}, [r12] @load p0 to Q3
+ vld1.8 {d8, d9}, [r0], r1 @load q1 to Q4, q0 = q0 + src_strd
+ vld1.8 {d10, d11}, [r14] @load p1 to Q5
+
+ @ Filter Decision
+ vabd.u8 q6, q2, q3 @ABS(p0 - q0)
+ vabd.u8 q7, q4, q2 @ABS(q1 - q0)
+ vabd.u8 q8, q5, q3 @ABS(p1 - p0)
+ vcge.u8 q9, q6, q0 @ABS(p0 - q0) >= Alpha
+ vcge.u8 q7, q7, q1 @ABS(q1 - q0) >= Beta
+ vcge.u8 q8, q8, q1 @ABS(p1 - p0) >= Beta
+ vmov.i8 q10, #2
+ vorr q9, q9, q7 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta
+ vld1.8 {d14, d15}, [r0], r1 @load q2 to Q7, q0 = q0 + src_strd
+ vorr q9, q9, q8 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta
+ vsra.u8 q10, q0, #2 @((Alpha >> 2) + 2)
+ vabd.u8 q11, q7, q2 @Aq = ABS(q2 - q0)
+ vaddl.u8 q12, d4, d6 @p0+q0 L
+ vaddl.u8 q13, d5, d7 @p0+q0 H
+ vclt.u8 q11, q11, q1 @Aq < Beta
+ vclt.u8 q10, q6, q10 @(ABS(p0 - q0) <((Alpha >>2) + 2))
+
+ @ Deblock Filtering q0', q1', q2'
+ vaddw.u8 q14, q12, d8 @p0+q0+q1 L
+ vaddw.u8 q15, q13, d9 @p0+q0+q1 H
+ vand q11, q11, q10 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ @ q0' if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) TRUE
+ vadd.i16 q8, q14, q14 @2*(p0+q0+q1)L
+ vadd.i16 q0, q15, q15 @2*(p0+q0+q1)H
+ vaddw.u8 q8, q8, d14 @2*(p0+q0+q1)+q2 L
+ vaddw.u8 q0, q0, d15 @2*(p0+q0+q1)+q2 H
+ vaddw.u8 q8, q8, d10 @2*(p0+q0+q1)+q2 +p1 L
+ vaddw.u8 q0, q0, d11 @2*(p0+q0+q1)+q2 +p1 H
+ vrshrn.u16 d12, q8, #3 @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0']
+ vrshrn.u16 d13, q0, #3 @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0']
+ @ q0" if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) FALSE
+ vaddl.u8 q8, d8, d8 @2*q1 L
+ vaddl.u8 q0, d9, d9 @2*q1 H
+ vaddw.u8 q8, q8, d4 @2*q1+q0 L
+ vaddw.u8 q0, q0, d5 @2*q1+q0 H
+ vaddw.u8 q8, q8, d10 @2*q1+q0+p1 L
+ vaddw.u8 q0, q0, d11 @2*q1+q0+p1 H
+ vrshrn.u16 d16, q8, #2 @(2*q1+q0+p1+2)>>2 L [q0"]
+ vrshrn.u16 d17, q0, #2 @(2*q1+q0+p1+2)>>2 H [q0"]
+ @ q1'
+ vaddw.u8 q14, q14, d14 @p0+q0+q1+q2 L
+ vaddw.u8 q15, q15, d15 @p0+q0+q1+q2 H
+ vld1.8 {q0}, [r0], r1 @load q3 to Q0, q0 = q0 + src_strd
+ vbit q8, q6, q11 @choosing between q0' and q0" depending on condn
+ sub r0, r0, r1, lsl #2 @pointer to q0
+ vbic q11, q11, q9 @((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
+ @ && (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ vrshrn.u16 d12, q14, #2 @(p0+q0+q1+q2+2)>>2 L [q1']
+ vrshrn.u16 d13, q15, #2 @(p0+q0+q1+q2+2)>>2 H [q1']
+ vbif q2, q8, q9 @choose q0 or filtered q0
+ @ q2'
+ vaddl.u8 q8, d14, d0 @q2+q3,L
+ vaddl.u8 q0, d15, d1 @q2+q3,H
+ vadd.i16 q14, q14, q8 @p0+q0+q1+2*q2+q3 L
+ vst1.8 {d4, d5}, [r0], r1 @store q0
+ vadd.i16 q15, q15, q0 @p0+q0+q1+2*q2+q3 H
+ vadd.i16 q14, q14, q8 @p0+q0+q1+3*q2+2*q3 L
+ vadd.i16 q15, q15, q0 @p0+q0+q1+3*q2+2*q3 H
+ vrshrn.u16 d0, q14, #3 @(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2']
+ vrshrn.u16 d1, q15, #3 @(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2']
+ vld1.8 {d30, d31}, [r3] @load p2 to Q15
+ vbif q6, q4, q11 @choose q1 or filtered value of q1
+
+ vabd.u8 q8, q15, q3 @Ap,ABS(p2 - p0)
+ vaddw.u8 q12, q12, d10 @p0+q0+p1 L
+ vbif q0, q7, q11 @choose q2 or filtered q2
+ vaddw.u8 q13, q13, d11 @p0+q0+p1 H
+ vst1.8 {d12, d13}, [r0], r1 @store q1
+ vclt.u8 q8, q8, q1 @Ap < Beta
+ vadd.i16 q14, q12, q12 @2*(p0+q0+p1) L
+ vadd.i16 q2, q13, q13 @2*(p0+q0+p1) H
+ vst1.8 {d0, d1}, [r0], r1 @store q2
+ vand q10, q10, q8 @((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2)))
+ vaddw.u8 q14, q14, d30 @2*(p0+q0+p1)+p2 l
+ vaddw.u8 q2, q2, d31 @2*(p0+q0+p1)+p2 H
+ vaddw.u8 q14, q14, d8 @2*(p0+q0+p1)+p2+q1 L
+ vaddw.u8 q2, q2, d9 @2*(p0+q0+p1)+p2+q1 H
+ vrshrn.u16 d28, q14, #3 @(2*(p0+q0+p1)+p2+q1+4)>>3 L,p0'
+ vrshrn.u16 d29, q2, #3 @(2*(p0+q0+p1)+p2+q1+4)>>3 H,p0'
+ vmov.i8 d0, #2
+ vmov.i16 d1, #2
+ vaddl.u8 q1, d6, d8 @p0+q1 L
+ vmlal.u8 q1, d10, d0 @2*p1+p0+q1 L
+ vaddl.u8 q8, d7, d9 @p0+q1 H
+ vmlal.u8 q8, d11, d0 @2*p1+p0+q1 H
+ vaddw.u8 q6, q12, d30 @(p0+q0+p1) +p2 L
+ vld1.8 {d24, d25}, [r2] @load p3,Q12
+ vaddw.u8 q2, q13, d31 @(p0+q0+p1) +p2 H
+ vaddl.u8 q4, d30, d24 @p2+p3 L
+ vrshrn.u16 d26, q6, #2 @((p0+q0+p1)+p2 +2)>>2,p1' L
+ vrshrn.u16 d2, q1, #2 @(2*p1+p0+q1+2)>>2,p0"L
+ vrshrn.u16 d27, q2, #2 @((p0+q0+p1)+p2 +2)>>2,p1' H
+ vrshrn.u16 d3, q8, #2 @(2*p1+p0+q1+2)>>2,p0" H
+ vaddl.u8 q8, d31, d25 @p2+p3 H
+ vmla.u16 q6, q4, d1[0] @(p0+q0+p1)+3*p2+2*p3 L
+ vmla.u16 q2, q8, d1[0] @(p0+q0+p1)+3*p2+2*p3 H
+ vbic q8, q10, q9 @((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
+ @&& (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ vbit q1, q14, q10 @choosing between p0' and p0"
+ vrshrn.u16 d12, q6, #3 @((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2'
+ vrshrn.u16 d13, q2, #3 @((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2'
+ vbif q3, q1, q9 @choosing between p0 and filtered value of p0
+ vbit q5, q13, q8 @choosing between p1 and p1'
+ vbit q15, q6, q8 @choosing between p2 and p2'
+ vst1.8 {d6, d7}, [r12] @store p0
+ vst1.8 {d10, d11}, [r14] @store p1
+ vst1.8 {d30, d31}, [r3] @store p2
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
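+@/* For reference, a rough C sketch of the per-pixel bS<4 filter that this
+@   routine vectorises (illustrative only; helper names such as CLIP3 and
+@   CLIP_U8 are ours and not taken from the library sources):
+@
+@       tc0   = pu1_cliptab[bs];
+@       tc    = tc0 + (ABS(p2 - p0) < beta) + (ABS(q2 - q0) < beta);
+@       delta = CLIP3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
+@       p0'   = CLIP_U8(p0 + delta);
+@       q0'   = CLIP_U8(q0 - delta);
+@       if(ABS(p2 - p0) < beta)
+@           p1' = p1 + CLIP3(-tc0, tc0, (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1);
+@       if(ABS(q2 - q0) < beta)
+@           q1' = q1 + CLIP3(-tc0, tc0, (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1);
+@
+@   The filter is applied only when bs != 0, ABS(p0 - q0) < alpha,
+@   ABS(p1 - p0) < beta and ABS(q1 - q0) < beta.
+@*/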
+
+ .global ih264_deblk_luma_vert_bslt4_a9
+
+ih264_deblk_luma_vert_bslt4_a9:
+
+ stmfd sp!, {r12, lr}
+
+ sub r0, r0, #4 @pointer uc_edgePixel-4
+ ldr r12, [sp, #8] @r12 = ui_Bs
+ ldr r14, [sp, #12] @r14 = *puc_ClpTab
+ vpush {d8 - d15}
+ @loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
+ vld1.8 {d0}, [r0], r1 @row1
+ vld1.8 d2, [r0], r1 @row2
+ vld1.8 d4, [r0], r1 @row3
+ rev r12, r12 @reversing ui_bs
+ vld1.8 d6, [r0], r1 @row4
+ vmov.32 d18[0], r12 @d18[0] = ui_Bs
+ vld1.32 d16[0], [r14] @D16[0] contains cliptab
+ vld1.8 d8, [r0], r1 @row5
+ vmovl.u8 q9, d18 @q9 = uc_Bs in each 16 bit scalar
+ vld1.8 d10, [r0], r1 @row6
+ vld1.8 d12, [r0], r1 @row7
+ vtbl.8 d16, {d16}, d18 @puc_ClipTab[uc_Bs]
+ vld1.8 d14, [r0], r1 @row8
+ vld1.8 d1, [r0], r1 @row9
+ vmovl.u16 q8, d16 @
+ vld1.8 d3, [r0], r1 @row10
+ vld1.8 d5, [r0], r1 @row11
+ vld1.8 d7, [r0], r1 @row12
+ vsli.32 q8, q8, #8 @
+ vld1.8 d9, [r0], r1 @row13
+ vld1.8 d11, [r0], r1 @row14
+ vld1.8 d13, [r0], r1 @row15
+ vsli.32 q8, q8, #16 @Q8 = C0
+ vld1.8 d15, [r0], r1 @row16
+
+ @taking two 8x8 transposes
+ @2X2 transposes
+ vtrn.8 d0, d2 @row1 &2
+ vtrn.8 d4, d6 @row3&row4
+ vtrn.8 d8, d10 @row5&6
+ vtrn.8 d12, d14 @row7 & 8
+ vtrn.8 d1, d3 @row9 &10
+ vtrn.8 d5, d7 @row11 & 12
+ vtrn.8 d9, d11 @row13 &14
+ vtrn.8 d13, d15 @row15 & 16
+ @4x4 transposes
+ vtrn.16 d2, d6 @row2 & row4
+ vtrn.16 d10, d14 @row6 & row8
+ vtrn.16 d3, d7 @row10 & 12
+ vtrn.16 d11, d15 @row14 & row16
+ vtrn.32 d6, d14 @row4 & 8
+ vtrn.32 d7, d15 @row 12 & 16
+
+ @now Q3 ->p0 and Q7->q3
+ vtrn.16 d0, d4 @row1 & 3
+ vtrn.16 d8, d12 @row 5 & 7
+ vtrn.16 d1, d5 @row9 & row11
+ vtrn.16 d9, d13 @row13 & row15
+ vtrn.32 d0, d8 @row1 & row5
+ vtrn.32 d1, d9 @row9 & 13
+
+ @now Q0->p3 & Q4->q0
+ @starting processing as p0 and q0 are now ready
+ vtrn.32 d2, d10 @row2 &6
+ vrhadd.u8 q10, q3, q4 @((p0 + q0 + 1) >> 1)
+ vtrn.32 d3, d11 @row10&row14
+ vmov.i8 d19, #2
+ @now Q1->p2 & Q5->q1
+ vtrn.32 d4, d12 @row3 & 7
+ vabd.u8 q11, q3, q4 @ABS(p0 - q0)
+ vtrn.32 d5, d13 @row11 & row15
+ vaddl.u8 q12, d20, d2 @(p2 + ((p0 + q0 + 1) >> 1) L
+ @now Q2->p1,Q6->q2
+ vaddl.u8 q13, d21, d3 @(p2 + ((p0 + q0 + 1) >> 1) H
+ vmlsl.u8 q12, d4, d19 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L
+ vmlsl.u8 q13, d5, d19 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H
+ vdup.8 q14, r2 @alpha
+ vcle.u8 q11, q14, q11 @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
+ vdup.i8 q14, r3 @beta
+ vabd.u8 q15, q5, q4 @ABS(q1 - q0)
+ vqshrn.s16 d24, q12, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L
+ vqshrn.s16 d25 , q13, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H
+ vcge.u8 q15, q15, q14 @ABS(q1 - q0) >= Beta
+ vabd.u8 q13, q2, q3 @ABS(p1 - p0)
+ vmin.s8 q12, q12, q8 @min(deltap1 ,C0)
+ vorr q11, q11, q15 @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
+ vneg.s8 q15, q8 @-C0
+ vcge.u8 q13, q13, q14 @ABS(p1 - p0) >= Beta
+ vmax.s8 q12, q12, q15 @max(deltap1,-C0)
+ vorr q11, q11, q13 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
+ vmovl.u16 q13, d18 @ui_bs
+ vaddl.u8 q9, d20, d12 @q2 + ((p0 + q0 + 1) >> 1) L
+ vceq.u32 q13, q13, #0 @ui_bs == 0
+ vsubw.u8 q9, q9, d10 @(q2 + ((p0 + q0 + 1) >> 1) - q1) L
+ vaddl.u8 q10, d21, d13 @q2 + ((p0 + q0 + 1) >> 1) H
+ vsubw.u8 q9, q9, d10 @(q2 + ((p0 + q0 + 1) >> 1) - 2*q1)L
+ vsubw.u8 q10, q10, d11 @(q2 + ((p0 + q0 + 1) >> 1) - q1) H
+ vorr q13, q13, q11 @(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) &&(ui_bs)
+ vsubw.u8 q10, q10, d11 @(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H
+ vqshrn.s16 d18, q9, #1 @((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L
+ vabd.u8 q11, q1, q3 @Ap = ABS(p2 - p0)
+ vqshrn.s16 d19, q10, #1 @((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H
+ vabd.u8 q10, q6, q4 @Aq= ABS(q2 - q0)
+ vclt.u8 q11, q11, q14 @Ap < Beta
+ vmin.s8 q9, q9, q8 @min(deltaq1,C0)
+ vclt.u8 q10, q10, q14 @Aq <Beta
+ vsubl.u8 q14, d8, d6 @(q0 - p0) L
+ vmax.s8 q9, q9, q15 @max(deltaq1,-C0)
+ vsubl.u8 q15, d9, d7 @(q0 - p0) H
+ vshl.s16 q14, q14, #2 @(q0 - p0)<<2 L
+ vsub.u8 q8, q8, q11 @C0 + (Ap < Beta)
+ vshl.s16 q15, q15, #2 @(q0 - p0) << 2) H
+ vaddw.u8 q14, q14, d4 @((q0 - p0) << 2) + (p1 L
+ vaddw.u8 q15, q15, d5 @((q0 - p0) << 2) + (p1 H
+ vsubw.u8 q14, q14, d10 @((q0 - p0) << 2) + (p1 - q1) L
+ vsubw.u8 q15, q15, d11 @((q0 - p0) << 2) + (p1 - q1) H
+ vbic q11, q11, q13 @final condition for p1
+ vrshrn.s16 d28, q14, #3 @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L
+ vrshrn.s16 d29, q15, #3 @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H
+ vsub.u8 q8, q8, q10 @C0 + (Ap < Beta) + (Aq < Beta)
+ vbic q10, q10, q13 @final condition for q1
+ vabs.s8 q15, q14 @abs(delta)
+ vand q12, q12, q11 @deltap1
+ vand q9, q9, q10 @delta q1
+ vmin.u8 q15, q15, q8 @min((abs(delta),C)
+ vadd.i8 q2, q2, q12 @p1+deltap1
+ vadd.i8 q5, q5, q9 @q1+deltaq1
+ vbic q15, q15, q13 @abs(delta) of pixels to be changed only
+ vcge.s8 q14, q14, #0 @sign(delta)
+ vqsub.u8 q11, q3, q15 @clip(p0-delta)
+ vtrn.8 d0, d2 @row1 &2
+ vqadd.u8 q3, q3, q15 @clip(p0+delta)
+ vtrn.8 d1, d3 @row9 &10
+ vqadd.u8 q12, q4, q15 @clip(q0+delta)
+ vtrn.8 d12, d14 @row7 & 8
+ vqsub.u8 q4, q4, q15 @clip(q0-delta)
+ vtrn.8 d13, d15 @row15 & 16
+ vbif q3, q11, q14 @p0
+ vbif q4, q12, q14 @q0
+ vtrn.8 d4, d6 @row3&row4
+ vtrn.8 d8, d10 @row5&6
+ vtrn.8 d5, d7 @row11 & 12
+ vtrn.8 d9, d11 @row13 &14
+ vtrn.16 d2, d6 @row2 & row4
+ vtrn.16 d10, d14 @row6 & row8
+ vtrn.16 d3, d7 @row10 & 12
+ vtrn.16 d11, d15 @row14 & row16
+ vtrn.32 d6, d14 @row4 & 8
+ vtrn.32 d7, d15 @row 12 & 16
+ @now Q3 ->p0 and Q7->q3
+ vtrn.16 d0, d4 @row1 & 3
+ vtrn.16 d8, d12 @row 5 & 7
+ vtrn.16 d1, d5 @row9 & row11
+ vtrn.16 d9, d13 @row13 & row15
+ sub r0, r0, r1, lsl#4 @restore pointer
+ vtrn.32 d0, d8 @row1 & row5
+ vtrn.32 d1, d9 @row9 & 13
+ vtrn.32 d2, d10 @row2 &6
+ vtrn.32 d3, d11 @row10&row14
+ vtrn.32 d4, d12 @row3 & 7
+ vtrn.32 d5, d13 @row11 & row15
+ vst1.8 {d0}, [r0], r1 @row1
+ vst1.8 d2, [r0], r1 @row2
+ vst1.8 d4, [r0], r1 @row3
+ vst1.8 d6, [r0], r1 @row4
+ vst1.8 d8, [r0], r1 @row5
+ vst1.8 d10, [r0], r1 @row6
+ vst1.8 d12, [r0], r1 @row7
+ vst1.8 d14, [r0], r1 @row8
+ vst1.8 d1, [r0], r1 @row9
+ vst1.8 d3, [r0], r1 @row10
+ vst1.8 d5, [r0], r1 @row11
+ vst1.8 d7, [r0], r1 @row12
+ vst1.8 d9, [r0], r1 @row13
+ vst1.8 d11, [r0], r1 @row14
+ vst1.8 d13, [r0], r1 @row15
+ vst1.8 d15, [r0], r1 @row16
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
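+@/* For reference, a rough C sketch of the per-pixel bS=4 (strong) filter
+@   that this routine vectorises, shown for the p-side only (the q-side is
+@   symmetric); illustrative only, not the library's reference code:
+@
+@       if((ABS(p2 - p0) < beta) && (ABS(p0 - q0) < ((alpha >> 2) + 2)))
+@       {
+@           p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
+@           p1' = (p2 + p1 + p0 + q0 + 2) >> 2;
+@           p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
+@       }
+@       else
+@       {
+@           p0' = (2*p1 + p0 + q1 + 2) >> 2;
+@       }
+@
+@   The filter is applied only when ABS(p0 - q0) < alpha,
+@   ABS(p1 - p0) < beta and ABS(q1 - q0) < beta.
+@*/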
+
+ .global ih264_deblk_luma_vert_bs4_a9
+
+ih264_deblk_luma_vert_bs4_a9:
+
+ stmfd sp!, {r12, lr}
+ vpush {d8 - d15}
+ sub r0, r0, #4 @pointer uc_edgePixel-4
+ @loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
+ vld1.8 d0, [r0], r1 @row1
+ vld1.8 d2, [r0], r1 @row2
+ vld1.8 d4, [r0], r1 @row3
+ vld1.8 d6, [r0], r1 @row4
+ vld1.8 d8, [r0], r1 @row5
+ vld1.8 d10, [r0], r1 @row6
+ vld1.8 d12, [r0], r1 @row7
+ vld1.8 d14, [r0], r1 @row8
+ vld1.8 d1, [r0], r1 @row9
+ vld1.8 d3, [r0], r1 @row10
+ vld1.8 d5, [r0], r1 @row11
+ vld1.8 d7, [r0], r1 @row12
+ vld1.8 d9, [r0], r1 @row13
+ vld1.8 d11, [r0], r1 @row14
+ vld1.8 d13, [r0], r1 @row15
+ vld1.8 d15, [r0], r1 @row16
+ @taking two 8x8 transposes
+ @2X2 transposes
+ vtrn.8 d0, d2 @row1 &2
+ vtrn.8 d4, d6 @row3&row4
+ vtrn.8 d8, d10 @row5&6
+ vtrn.8 d12, d14 @row7 & 8
+ vtrn.8 d1, d3 @row9 &10
+ vtrn.8 d5, d7 @row11 & 12
+ vtrn.8 d9, d11 @row13 &14
+ vtrn.8 d13, d15 @row15 & 16
+ @4x4 transposes
+ vtrn.16 d2, d6 @row2 & row4
+ vtrn.16 d10, d14 @row6 & row8
+ vtrn.16 d3, d7 @row10 & 12
+ vtrn.16 d11, d15 @row14 & row16
+ vtrn.32 d6, d14 @row4 & 8
+ vtrn.32 d7, d15 @row 12 & 16
+ @now Q3 ->p0 and Q7->q3
+ vtrn.16 d0, d4 @row1 & 3
+ vtrn.16 d8, d12 @row 5 & 7
+ vtrn.16 d1, d5 @row9 & row11
+ vtrn.16 d9, d13 @row13 & row15
+ vtrn.32 d0, d8 @row1 & row5
+ vtrn.32 d1, d9 @row9 & 13
+ @now Q0->p3 & Q4->q0
+ @starting processing as p0 and q0 are now ready
+ @now Q1->p2 & Q5->q1
+ vpush {q7} @saving in stack
+ vtrn.32 d4, d12 @row3 & 7
+ vmov.i16 q14, #2
+ vtrn.32 d5, d13 @row11 & row15
+ vaddl.u8 q8, d6, d8 @p0+q0 L
+ vtrn.32 d2, d10 @row2 &6
+ vaddl.u8 q9, d7, d9 @p0+q0 H
+ vtrn.32 d3, d11 @row10&row14
+ vaddw.u8 q10, q8, d4 @p0+q0+p1 L
+ vaddw.u8 q11, q9, d5 @p0+q0+p1 H
+ vaddl.u8 q12, d2, d10 @p2+q1 L
+ vaddl.u8 q13, d3, d11 @p2+q1 H
+ vmla.u16 q12, q10, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 L
+ vmla.u16 q13, q11, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 H
+ vmov.i8 q14, #2
+ vaddw.u8 q8, q10, d2 @p0+q0+p1+p2 L
+ vaddw.u8 q9, q11, d3 @p0+q0+p1+p2 H
+ vdup.i8 q15, r2 @duplicate alpha
+ vrshrn.u16 d20, q8, #2 @(p2 + p1 + p0 + q0 + 2) >> 2)L p1'
+ vrshrn.u16 d21, q9, #2 @(p2 + p1 + p0 + q0 + 2) >> 2)H p1'
+ vabd.u8 q11, q3, q4 @ABD(p0-q0)
+ vsra.u8 q14, q15, #2 @alpha >>2 +2
+ vabd.u8 q15, q1, q3 @Ap = ABD(p2-p0)
+ vrshrn.u16 d24, q12, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0'
+ vrshrn.u16 d25, q13, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0'
+ vdup.i8 q13, r3 @beta
+ vcgt.u8 q14, q14, q11 @ABS(p0 - q0) <((Alpha >>2) + 2)
+ vaddl.u8 q11, d6, d10 @p0+q1 L
+ vcgt.u8 q7, q13, q15 @beta>Ap
+ vaddl.u8 q15, d7, d11 @p0+q1 H
+ vaddw.u8 q11, q11, d4 @p0+q1+p1 L
+ vaddw.u8 q15, q15, d5 @p0+q1+p1 H
+ vaddw.u8 q11, q11, d4 @p0+q1+2*p1 L
+ vaddw.u8 q15, q15, d5 @p0+q1+2*p1 H
+ vand q7, q7, q14 @(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
+ vrshrn.u16 d22, q11, #2 @((X2(p1) + p0 + q1 + 2) >> 2) L p0"
+ vrshrn.u16 d23, q15, #2 @((X2(p1) + p0 + q1 + 2) >> 2) H p0"
+ vaddl.u8 q15, d2, d0 @p2+p3 L
+ vbif q12, q11, q7 @p0' or p0 "
+ vaddl.u8 q11, d3, d1 @p2+p3 H
+ vadd.u16 q15, q15, q15 @2*(p2+p3) L
+ vadd.u16 q11, q11, q11 @2*(p2+p3)H
+ vadd.u16 q8, q8, q15 @(X2(p3) + X3(p2) + p1 + p0 + q0) L
+ vadd.u16 q9, q9, q11 @(X2(p3) + X3(p2) + p1 + p0 + q0) H
+ vabd.u8 q15, q6, q4 @Aq = abs(q2-q0)
+ vabd.u8 q11, q5, q4 @ABS(Q1-Q0)
+ vrshrn.u16 d16, q8, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2'
+ vrshrn.u16 d17, q9, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2'
+ vabd.u8 q9, q2, q3 @ABS(p1-p0)
+ vcgt.u8 q15, q13, q15 @Aq < Beta
+ vcge.u8 q11, q11, q13 @ABS(q1 - q0) >= Beta
+ vcge.u8 q9, q9, q13 @ABS(p1 - p0) >= beta
+ vdup.i8 q13, r2 @duplicate alpha
+ vand q15, q15, q14 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ vabd.u8 q14, q3, q4 @abs(p0-q0)
+ vorr q11, q11, q9 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
+ vaddl.u8 q9, d6, d8 @p0+q0 L
+ vcge.u8 q14, q14, q13 @ABS(p0 - q0) >= Alpha
+ vaddl.u8 q13, d7, d9 @p0+q0 H
+ vaddw.u8 q9, q9, d10 @p0+q0+q1 L
+ vorr q11, q11, q14 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
+ vaddw.u8 q13, q13, d11 @p0+q0+q1 H
+ vbic q7, q7, q11 @final condn for p's
+ vmov.i8 q14, #2
+ vbif q3, q12, q11 @final p0
+ vbit q1, q8, q7 @final p2
+ vbif q10, q2, q7 @final p1
+ vaddl.u8 q12, d8, d4 @q0+p1 L
+ vmlal.u8 q12, d10, d28 @X2(q1) + q0 + p1 L
+ vaddl.u8 q8, d9, d5 @q0+p1 H
+ vmlal.u8 q8, d11, d28 @X2(q1) + q0 + p1 H
+ vmov.i16 q14, #2
+ vaddl.u8 q7, d4, d12 @p1+q2 L
+ vmla.u16 q7, q9, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2L
+ vaddl.u8 q2, d5, d13 @p1+q2H
+ vmla.u16 q2, q13, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2H
+ vrshrn.u16 d24, q12, #2 @(X2(q1) + q0 + p1 + 2) >> 2; L q0'
+ vrshrn.u16 d25, q8, #2 @(X2(q1) + q0 + p1 + 2) >> 2; H q0'
+ vaddw.u8 q9, q9, d12 @p0 + q0 + q1 + q2 L
+ vaddw.u8 q13, q13, d13 @p0 + q0 + q1 + q2 H
+ vrshrn.u16 d16, q7, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L q0"
+ vpop {q7}
+ vrshrn.u16 d17, q2, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H q0"
+ vrshrn.u16 d4, q9, #2 @p0 + q0 + q1 + q2 + 2)>>2 L q1'
+ vrshrn.u16 d5, q13, #2 @p0 + q0 + q1 + q2 + 2)>>2 H q1'
+ vbit q12, q8, q15 @q0' or q0"
+ vbic q15, q15, q11 @final condn for q's
+ vtrn.8 d0, d2 @row1 &2
+ vbit q5, q2, q15 @final q1
+ vtrn.8 d1, d3 @row9 &10
+ vaddl.u8 q8, d12, d14 @q2+q3 L
+ vtrn.8 d20, d6 @row3&row4
+ vaddl.u8 q2, d13, d15 @q2+q3 H
+ vtrn.8 d21, d7 @row11 & 12
+ vmla.u16 q9, q8, q14 @X2(q3) + X3(q2) + q1 + q0 + p0 L
+ vtrn.16 d2, d6 @row2 & row4
+ vmla.u16 q13, q2, q14 @X2(q3) + X3(q2) + q1 + q0 + p0 H
+ vtrn.16 d3, d7 @row10 & 12
+ vbif q4, q12, q11 @final q0
+ vtrn.16 d0, d20 @row1 & 3
+ vrshrn.u16 d18, q9, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L
+ vtrn.16 d1, d21 @row9 & row11
+ vrshrn.u16 d19, q13, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H
+ vtrn.8 d8, d10 @row5&6
+ vbit q6, q9, q15 @final q2
+ vtrn.8 d9, d11 @row13 &14
+ vtrn.8 d12, d14 @row7 & 8
+ vtrn.8 d13, d15 @row15 & 16
+ vtrn.16 d10, d14 @row6 & row8
+ vtrn.16 d11, d15 @row14 & row16
+ @now Q3 ->p0 and Q7->q3
+ vtrn.16 d8, d12 @row 5 & 7
+ vtrn.16 d9, d13 @row13 & row15
+ sub r0, r0, r1, lsl#4 @restore pointer
+ vtrn.32 d6, d14 @row4 & 8
+ vtrn.32 d7, d15 @row 12 & 16
+ vtrn.32 d0, d8 @row1 & row5
+ vtrn.32 d1, d9 @row9 & 13
+ vtrn.32 d2, d10 @row2 &6
+ vtrn.32 d3, d11 @row10&row14
+ vtrn.32 d20, d12 @row3 & 7
+ vtrn.32 d21, d13 @row11 & row15
+ vst1.8 d0, [r0], r1 @row1
+ vst1.8 d2, [r0], r1 @row2
+ vst1.8 d20, [r0], r1 @row3
+ vst1.8 d6, [r0], r1 @row4
+ vst1.8 d8, [r0], r1 @row5
+ vst1.8 d10, [r0], r1 @row6
+ vst1.8 d12, [r0], r1 @row7
+ vst1.8 d14, [r0], r1 @row8
+ vst1.8 d1, [r0], r1 @row9
+ vst1.8 d3, [r0], r1 @row10
+ vst1.8 d21, [r0], r1 @row11
+ vst1.8 d7, [r0], r1 @row12
+ vst1.8 d9, [r0], r1 @row13
+ vst1.8 d11, [r0], r1 @row14
+ vst1.8 d13, [r0], r1 @row15
+ vst1.8 d15, [r0], r1 @row16
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge when the boundary
+@* strength is set to 4; this variant processes 8 rows and is hence called twice (MBAFF)
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_luma_vert_bs4_mbaff_a9
+
+ih264_deblk_luma_vert_bs4_mbaff_a9:
+
+ stmfd sp!, {lr}
+
+ sub r0, r0, #4 @pointer uc_edgePixel-4
+ vpush {d8 - d15}
+ @loading [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] for every row
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+ vuzp.8 d0, d1 @D0->p3, D1->p2
+ vuzp.8 d2, d3 @D2->p1, D3->p0
+ vuzp.8 d4, d5 @D4->q0, D5->q1
+ vuzp.8 d6, d7 @D6->q2, D7->q3
+
+ vmov.i16 q14, #2
+ vaddl.u8 q4, d3, d4 @p0+q0
+ vaddw.u8 q5, q4, d2 @p0+q0+p1
+ vaddl.u8 q6, d1, d5 @p2+q1
+ vmla.u16 q6, q5, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1
+
+ vmov.i8 d14, #2
+ vaddw.u8 q4, q5, d1 @p0+q0+p1+p2
+ vdup.i8 d15, r2 @duplicate alpha
+ vrshrn.u16 d10, q4, #2 @(p2 + p1 + p0 + q0 + 2) >> 2) p1'
+ vabd.u8 d11, d3, d4 @ABD(p0-q0)
+ vsra.u8 d14, d15, #2 @alpha >>2 +2
+ vabd.u8 d15, d1, d3 @Ap = ABD(p2-p0)
+ vrshrn.u16 d12, q6, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) p0'
+ vdup.i8 d13, r3 @beta
+ vcgt.u8 d14, d14, d11 @ABS(p0 - q0) <((Alpha >>2) + 2)
+ vaddl.u8 q8, d3, d5 @p0+q1
+ vcgt.u8 d26, d13, d15 @beta>Ap
+ vaddw.u8 q8, q8, d2 @p0+q1+p1
+ vaddw.u8 q8, q8, d2 @p0+q1+2*p1
+ vand d26, d26, d14 @(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
+ vrshrn.u16 d11, q8, #2 @((X2(p1) + p0 + q1 + 2) >> 2) p0"
+ vbif d12, d11, d26 @p0' or p0 "
+ vaddl.u8 q9, d1, d0 @p2+p3
+ vadd.u16 q9, q9, q9 @2*(p2+p3)
+ vadd.u16 q4, q4, q9 @(X2(p3) + X3(p2) + p1 + p0 + q0)
+ vabd.u8 d15, d6, d4 @Aq = abs(q2-q0)
+ vabd.u8 d11, d5, d4 @ABS(q1-q0)
+ vrshrn.u16 d8, q4, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); p2'
+ vabd.u8 d9, d2, d3 @ABS(p1-p0)
+ vcgt.u8 d15, d13, d15 @Aq < Beta
+ vcge.u8 d11, d11, d13 @ABS(q1 - q0) >= Beta
+ vcge.u8 d9, d9, d13 @ABS(p1 - p0) >= beta
+ vdup.i8 d13, r2 @duplicate alpha
+ vand d15, d15, d14 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ vabd.u8 d14, d3, d4 @abs(p0-q0)
+ vorr d11, d11, d9 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
+ vcge.u8 d14, d14, d13 @ABS(p0 - q0) >= Alpha
+ vaddl.u8 q10, d3, d4 @p0+q0
+ vorr d11, d11, d14 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
+ vaddw.u8 q10, q10, d5 @p0+q0+q1
+ vbic d26, d26, d11 @final condn for p's
+ vmov.i8 d14, #2
+ vbif d3, d12, d11 @final p0
+ vbit d1, d8, d26 @final p2
+ vbif d10, d2, d26 @final p1
+ vaddl.u8 q6, d4, d2 @q0+p1
+ vmlal.u8 q6, d5, d14 @X2(q1) + q0 + p1
+
+ vaddl.u8 q11, d2, d6 @p1+q2
+ vmla.u16 q11, q10, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2
+ vrshrn.u16 d12, q6, #2 @(X2(q1) + q0 + p1 + 2) >> 2; q0'
+ vaddw.u8 q10, q10, d6 @p0 + q0 + q1 + q2
+ vrshrn.u16 d8, q11, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 qo"
+
+ vrshrn.u16 d2, q10, #2 @p0 + q0 + q1 + q2 + 2)>>2 q1'
+ vbit d12, d8, d15 @q0' or q0"
+ vbic d15, d15, d11 @final condn for q's
+ vbit d5, d2, d15 @final q1
+ vaddl.u8 q12, d6, d7 @q2+q3
+ vmla.u16 q10, q12, q14 @X2(q3) + X3(q2) + q1 + q0 + p0
+ vbif d4, d12, d11 @final q0
+ vrshrn.u16 d9, q10, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3;
+ vbit d6, d9, d15 @final q2
+ vand d2, d10, d10 @D0->p3, D1->p2, D2->p1, D3->p0, D4->q0, D5->q1, D6->q2, D7->q3
+
+ vzip.8 d0, d1 @D0,D1 -> [p3:p2]
+ vzip.8 d2, d3 @D2,D3 -> [p1:p0]
+ vzip.8 d4, d5 @D4,D5 -> [q0:q1]
+ vzip.8 d6, d7 @D6,D7 -> [q2:q3]
+
+ sub r0, r0, r1, lsl#3 @restore pointer
+
+ @storing [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] in every row
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge for cases where the
+@* boundary strength is less than 4; this variant processes 8 rows and is hence called twice (MBAFF)
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_luma_vert_bslt4_mbaff_a9
+
+ih264_deblk_luma_vert_bslt4_mbaff_a9:
+
+ stmfd sp!, {r12, lr}
+
+ sub r0, r0, #4 @pointer uc_edgePixel-4
+ ldr r12, [sp, #8] @r12 = ui_Bs
+ ldr r14, [sp, #12] @r14 = pu1_ClipTab
+ vpush {d8 - d15}
+ @loading [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] for every row
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+ vuzp.8 d0, d1 @D0->p3, D1->p2
+ vuzp.8 d2, d3 @D2->p1, D3->p0
+ vuzp.8 d4, d5 @D4->q0, D5->q1
+ vuzp.8 d6, d7 @D6->q2, D7->q3
+
+ rev r12, r12 @reversing ui_bs
+ vmov.32 d8[0], r12 @D8[0] = ui_Bs
+ vld1.32 d9[0], [r14] @D9[0] contains cliptab
+ vmovl.u8 q15, d8 @D30 = ui_Bs in each 16 bit scalar
+ vtbl.8 d8, {d9}, d30 @puc_ClipTab[ui_Bs]
+ vsli.16 d8, d8, #8 @D8 = C0
+
+ vrhadd.u8 d10, d3, d4 @((p0 + q0 + 1) >> 1)
+ vmov.i8 d31, #2
+ vabd.u8 d11, d3, d4 @ABS(p0 - q0)
+ vaddl.u8 q6, d10, d1 @(p2 + ((p0 + q0 + 1) >> 1)
+ vmlsl.u8 q6, d2, d31 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1))
+ vdup.8 d14, r2 @alpha
+ vcle.u8 d11, d14, d11 @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
+ vdup.i8 d14, r3 @beta
+ vabd.u8 d15, d5, d4 @ABS(q1 - q0)
+ vqshrn.s16 d12, q6, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1)
+ vcge.u8 d15, d15, d14 @ABS(q1 - q0) >= Beta
+ vabd.u8 d13, d2, d3 @ABS(p1 - p0)
+ vmin.s8 d12, d12, d8 @min(deltap1 ,C0)
+ vorr d11, d11, d15 @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
+ vneg.s8 d15, d8 @-C0
+ vcge.u8 d13, d13, d14 @ABS(p1 - p0) >= Beta
+ vmax.s8 d12, d12, d15 @max(deltap1,-C0)
+ vorr d11, d11, d13 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
+ vceq.u16 d13, d30, #0 @ui_bs == 0
+ vaddl.u8 q14, d10, d6 @q2 + ((p0 + q0 + 1) >> 1)
+ vsubw.u8 q14, q14, d5 @q2 + ((p0 + q0 + 1) >> 1) - q1
+ vsubw.u8 q14, q14, d5 @q2 + ((p0 + q0 + 1) >> 1) - 2*q1
+ vorr d13, d13, d11 @(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
+ @|| (ui_bs == 0)
+ vqshrn.s16 d9, q14, #1 @(q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1
+ vabd.u8 d11, d1, d3 @Ap = ABS(p2 - p0)
+ vabd.u8 d10, d6, d4 @Aq= ABS(q2 - q0)
+ vclt.u8 d11, d11, d14 @Ap < Beta
+ vmin.s8 d9, d9, d8 @min(deltaq1,C0)
+ vclt.u8 d10, d10, d14 @Aq < Beta
+ vmax.s8 d9, d9, d15 @max(deltaq1,-C0)
+ vsubl.u8 q7, d4, d3 @q0 - p0
+ vshl.s16 q7, q7, #2 @(q0 - p0) << 2
+ vsub.u8 d8, d8, d11 @C0 + (Ap < Beta)
+ vaddw.u8 q7, q7, d2 @((q0 - p0) << 2) + p1
+ vsubw.u8 q7, q7, d5 @((q0 - p0) << 2) + (p1 - q1)
+ vbic d11, d11, d13 @final condition for p1
+ vrshr.s16 q15, q7, #3 @delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
+ vsub.u8 d8, d8, d10 @C0 + (Ap < Beta) + (Aq < Beta)
+ vbic d10, d10, d13 @final condition for q1
+ vabs.s16 q14, q15
+ vmovn.i16 d15, q14 @abs(delta)
+ vand d12, d12, d11 @deltap1
+ vand d9, d9, d10 @deltaq1
+ vmin.u8 d15, d15, d8 @min((abs(delta),C)
+ vadd.i8 d2, d2, d12 @p1+deltap1
+ vadd.i8 d5, d5, d9 @q1+deltaq1
+ vbic d15, d15, d13 @abs(delta) of pixels to be changed only
+ vcge.s16 q14, q15, #0
+ vmovn.i16 d14, q14 @sign(delta)
+ vqsub.u8 d11, d3, d15 @clip(p0-delta)
+ vqadd.u8 d3, d3, d15 @clip(p0+delta)
+ vqadd.u8 d12, d4, d15 @clip(q0+delta)
+ vqsub.u8 d4, d4, d15 @clip(q0-delta)
+ vbif d3, d11, d14 @p0
+ vbif d4, d12, d14 @q0
+
+ sub r0, r0, r1, lsl#3 @restore pointer
+ @D0->p3, D1->p2, D2->p1, D3->p0, D4->q0, D5->q1, D6->q2, D7->q3
+ vzip.8 d0, d1 @D0,D1 -> [p3:p2]
+ vzip.8 d2, d3 @D2,D3 -> [p1:p0]
+ vzip.8 d4, d5 @D4,D5 -> [q0:q1]
+ vzip.8 d6, d7 @D6,D7 -> [q2:q3]
+
+ @storing [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] in every row
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
diff --git a/common/arm/ih264_default_weighted_pred_a9q.s b/common/arm/ih264_default_weighted_pred_a9q.s
new file mode 100755
index 0000000..94cda46
--- /dev/null
+++ b/common/arm/ih264_default_weighted_pred_a9q.s
@@ -0,0 +1,359 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_default_weighted_pred_a9q.s
+@*
+@* @brief
+@* Contains function definitions for default weighted prediction.
+@* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
+@*
+@* @author
+@* Kaushik Senthoor R
+@*
+@* @par List of Functions:
+@*
+@* - ih264_default_weighted_pred_luma_a9q()
+@* - ih264_default_weighted_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@*******************************************************************************
+@* @function
+@* ih264_default_weighted_pred_luma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates their rounded-average and
+@* stores it in the destination block.
+@*
+@* @param[in] pu1_src1:
+@* UWORD8 Pointer to the buffer containing the first input block.
+@*
+@* @param[in] pu1_src2:
+@* UWORD8 Pointer to the buffer containing the second input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the first input buffer
+@*
+@* @param[in] src_strd2
+@* Stride of the second input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => ht (r6)
+@ [sp+12] => wd (r7)
+@
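+@ The whole routine reduces to a per-pixel rounded average; a minimal C
+@ sketch of what each vrhadd.u8 lane computes (illustrative only):
+@
+@     for(i = 0; i < ht; i++)
+@         for(j = 0; j < wd; j++)
+@             pu1_dst[i * dst_strd + j] = (pu1_src1[i * src_strd1 + j] +
+@                                          pu1_src2[i * src_strd2 + j] + 1) >> 1;
+@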
+.text
+.p2align 2
+
+ .global ih264_default_weighted_pred_luma_a9q
+
+ih264_default_weighted_pred_luma_a9q:
+
+ stmfd sp!, {r4-r7, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #32] @Load wd
+ ldr r4, [sp, #20] @Load src_strd2
+ ldr r5, [sp, #24] @Load dst_strd
+ cmp r7, #16
+ ldr r6, [sp, #28] @Load ht
+ vpush {d8-d15}
+ beq loop_16 @branch if wd is 16
+ cmp r7, #8
+ beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+ vld1.32 d0[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d0[1], [r0], r3 @load row 2 in source 1
+ vld1.32 d2[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d2[1], [r1], r4 @load row 2 in source 2
+
+ vld1.32 d1[0], [r0], r3 @load row 3 in source 1
+ vld1.32 d1[1], [r0], r3 @load row 4 in source 1
+ vrhadd.u8 d0, d0, d2
+ vld1.32 d3[0], [r1], r4 @load row 3 in source 2
+ vld1.32 d3[1], [r1], r4 @load row 4 in source 2
+
+ subs r6, r6, #4 @decrement ht by 4
+ vst1.32 d0[0], [r2], r5 @store row 1 in destination
+ vst1.32 d0[1], [r2], r5 @store row 2 in destination
+ vrhadd.u8 d1, d1, d3
+ vst1.32 d1[0], [r2], r5 @store row 3 in destination
+ vst1.32 d1[1], [r2], r5 @store row 4 in destination
+
+ bgt loop_4 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: @each iteration processes four rows
+
+ vld1.8 d0, [r0], r3 @load row 1 in source 1
+ vld1.8 d4, [r1], r4 @load row 1 in source 2
+ vld1.8 d1, [r0], r3 @load row 2 in source 1
+ vld1.8 d5, [r1], r4 @load row 2 in source 2
+ vld1.8 d2, [r0], r3 @load row 3 in source 1
+ vrhadd.u8 q0, q0, q2
+ vld1.8 d6, [r1], r4 @load row 3 in source 2
+ vld1.8 d3, [r0], r3 @load row 4 in source 1
+ vrhadd.u8 d2, d2, d6
+ vld1.8 d7, [r1], r4 @load row 4 in source 2
+
+ subs r6, r6, #4 @decrement ht by 4
+ vst1.8 d0, [r2], r5 @store row 1 in destination
+ vrhadd.u8 d3, d3, d7
+ vst1.8 d1, [r2], r5 @store row 2 in destination
+ vst1.8 d2, [r2], r5 @store row 3 in destination
+ vst1.8 d3, [r2], r5 @store row 4 in destination
+
+ bgt loop_8 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: @each iteration processes eight rows
+
+ vld1.8 {q0}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q8}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q1}, [r0], r3 @load row 2 in source 1
+ vld1.8 {q9}, [r1], r4 @load row 2 in source 2
+ vrhadd.u8 q0, q0, q8
+ vld1.8 {q2}, [r0], r3 @load row 3 in source 1
+ vld1.8 {q10}, [r1], r4 @load row 3 in source 2
+ vrhadd.u8 q1, q1, q9
+ vld1.8 {q3}, [r0], r3 @load row 4 in source 1
+ vld1.8 {q11}, [r1], r4 @load row 4 in source 2
+ vrhadd.u8 q2, q2, q10
+ vld1.8 {q4}, [r0], r3 @load row 5 in source 1
+ vld1.8 {q12}, [r1], r4 @load row 5 in source 2
+ vrhadd.u8 q3, q3, q11
+ vld1.8 {q5}, [r0], r3 @load row 6 in source 1
+ vld1.8 {q13}, [r1], r4 @load row 6 in source 2
+ vrhadd.u8 q4, q4, q12
+ vld1.8 {q6}, [r0], r3 @load row 7 in source 1
+ vld1.8 {q14}, [r1], r4 @load row 7 in source 2
+ vrhadd.u8 q5, q5, q13
+ vld1.8 {q7}, [r0], r3 @load row 8 in source 1
+ vld1.8 {q15}, [r1], r4 @load row 8 in source 2
+
+ vrhadd.u8 q6, q6, q14
+ vst1.8 {q0}, [r2], r5 @store row 1 in destination
+ vst1.8 {q1}, [r2], r5 @store row 2 in destination
+ vrhadd.u8 q7, q7, q15
+ vst1.8 {q2}, [r2], r5 @store row 3 in destination
+ vst1.8 {q3}, [r2], r5 @store row 4 in destination
+ subs r6, r6, #8 @decrement ht by 8
+ vst1.8 {q4}, [r2], r5 @store row 5 in destination
+ vst1.8 {q5}, [r2], r5 @store row 6 in destination
+ vst1.8 {q6}, [r2], r5 @store row 7 in destination
+ vst1.8 {q7}, [r2], r5 @store row 8 in destination
+
+ bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r7, r15} @Reload the registers from sp
+
+
+@*******************************************************************************
+@* @function
+@* ih264_default_weighted_pred_chroma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates their rounded-average and
+@* stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src1:
+@* UWORD8 Pointer to the buffer containing the first input block.
+@*
+@* @param[in] pu1_src2:
+@* UWORD8 Pointer to the buffer containing the second input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the first input buffer
+@*
+@* @param[in] src_strd2
+@* Stride of the second input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+@*
+@*******************************************************************************
+@*/
+@void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => ht (r6)
+@ [sp+12] => wd (r7)
+@
+
+
+ .global ih264_default_weighted_pred_chroma_a9q
+
+ih264_default_weighted_pred_chroma_a9q:
+
+ stmfd sp!, {r4-r7, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #32] @Load wd
+ ldr r4, [sp, #20] @Load src_strd2
+ ldr r5, [sp, #24] @Load dst_strd
+ cmp r7, #8
+ ldr r6, [sp, #28] @Load ht
+ vpush {d8-d15}
+ beq loop_8_uv @branch if wd is 8
+ cmp r7, #4
+ beq loop_4_uv @branch if wd is 4
+
+loop_2_uv: @each iteration processes two rows
+
+ vld1.32 d0[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d0[1], [r0], r3 @load row 2 in source 1
+
+ vld1.32 d1[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d1[1], [r1], r4 @load row 2 in source 2
+
+ vrhadd.u8 d0, d0, d1
+
+ subs r6, r6, #2 @decrement ht by 2
+ vst1.32 d0[0], [r2], r5 @store row 1 in destination
+ vst1.32 d0[1], [r2], r5 @store row 2 in destination
+
+ bgt loop_2_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_4_uv: @each iteration processes two rows
+
+ vld1.8 d0, [r0], r3 @load row 1 in source 1
+ vld1.8 d2, [r1], r4 @load row 1 in source 2
+ vld1.8 d1, [r0], r3 @load row 2 in source 1
+ vrhadd.u8 d0, d0, d2
+ vld1.8 d3, [r1], r4 @load row 2 in source 2
+
+ vrhadd.u8 d1, d1, d3
+ vst1.8 d0, [r2], r5 @store row 1 in destination
+ subs r6, r6, #2 @decrement ht by 2
+ vst1.8 d1, [r2], r5 @store row 2 in destination
+
+ bgt loop_4_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_8_uv: @each iteration processes four rows
+
+ vld1.8 {q0}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q4}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q1}, [r0], r3 @load row 2 in source 1
+ vrhadd.u8 q0, q0, q4
+ vld1.8 {q5}, [r1], r4 @load row 2 in source 2
+ vld1.8 {q2}, [r0], r3 @load row 3 in source 1
+ vrhadd.u8 q1, q1, q5
+ vld1.8 {q6}, [r1], r4 @load row 3 in source 2
+ vld1.8 {q3}, [r0], r3 @load row 4 in source 1
+ vrhadd.u8 q2, q2, q6
+ vld1.8 {q7}, [r1], r4 @load row 4 in source 2
+
+ vst1.8 {q0}, [r2], r5 @store row 1 in destination
+ vrhadd.u8 q3, q3, q7
+ vst1.8 {q1}, [r2], r5 @store row 2 in destination
+ subs r6, r6, #4 @decrement ht by 4
+ vst1.8 {q2}, [r2], r5 @store row 3 in destination
+ vst1.8 {q3}, [r2], r5 @store row 4 in destination
+
+ bgt loop_8_uv @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r7, r15} @Reload the registers from sp
+
+
diff --git a/common/arm/ih264_ihadamard_scaling_a9.s b/common/arm/ih264_ihadamard_scaling_a9.s
new file mode 100755
index 0000000..687099a
--- /dev/null
+++ b/common/arm/ih264_ihadamard_scaling_a9.s
@@ -0,0 +1,250 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_ihadamard_scaling_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for inverse hadamard transform on 4x4 DC outputs
+@ * of 16x16 intra-prediction
+@ *
+@ * @author
+@ * Mohit
+@ *
+@ * @par List of Functions:
+@ * - ih264_ihadamard_scaling_4x4_a9()
+@ * - ih264_ihadamard_scaling_2x2_uv_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
+@ * of a 16x16 intra prediction macroblock, and then performs scaling.
+@ *
+@ * @par Description:
+@ * The DC coefficients pass through a 2-stage inverse hadamard transform.
+@ * This inverse transformed content is then scaled based on the Qp value.
+@ *
+@ * @param[in] pi2_src
+@ * input 4x4 block of DC coefficients
+@ *
+@ * @param[out] pi2_out
+@ * output 4x4 block
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * pointer to scaling list
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * pointer to weight matrix
+@ *
+@ * @param[in] u4_qp_div_6
+@ * Floor (qp/6)
+@ *
+@ * @param[in] pi4_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *
+@ *******************************************************************************
+@ */
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
+@ WORD16* pi2_out,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32* pi4_tmp)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pi2_out
+@r2 => *pu2_iscal_mat
+@r3 => *pu2_weigh_mat
+@r4 => u4_qp_div_6
+
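+@ For reference, a compact C sketch of what this routine computes
+@ (illustrative only; t[] is the output of the 2-D inverse Hadamard and the
+@ variable names are ours):
+@
+@     // row butterfly on each row s[0..3] of the 4x4 DC block
+@     x0 = s[0] + s[3];  x1 = s[1] + s[2];
+@     x2 = s[1] - s[2];  x3 = s[0] - s[3];
+@     t[0] = x0 + x1;  t[1] = x3 + x2;  t[2] = x0 - x1;  t[3] = x3 - x2;
+@     // the same butterfly is then applied down the columns, followed by
+@     // scaling of every transformed coefficient:
+@     out[i] = (((t[i] * pu2_iscal_mat[0] * pu2_weigh_mat[0]) << u4_qp_div_6) + 32) >> 6;
+@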
+.text
+.p2align 2
+
+ .global ih264_ihadamard_scaling_4x4_a9
+
+ih264_ihadamard_scaling_4x4_a9:
+
+@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
+@If the macro value changes, the instruction needs to be changed accordingly.
+@Only one shift is done in the horizontal inverse because:
+@if u4_qp_div_6 is less than 4, the shift value is negative and the left shift becomes a right shift; in this case rnd_factor is non-zero.
+@if u4_qp_div_6 is greater than 4, the shift value is positive and a left shift is done; here rnd_factor is 0.
+
+ stmfd sp!, {r4-r12, r14} @ stack stores the values of the arguments
+ ldr r4, [sp, #40] @ Loads u4_qp_div_6
+ vdup.s32 q10, r4 @ Populate the u4_qp_div_6 in Q10
+ ldrh r6, [r3] @ load pu2_weight_mat[0] , H for unsigned halfword load
+ ldrh r7, [r2] @ load pu2_iscal_mat[0] , H for unsigned halfword load
+ mul r6, r6, r7 @ pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ vdup.s32 q9, r6 @ Populate pu2_iscal_mat[0]*pu2_weigh_mat[0] 32-bit in Q9
+ vpush {d8-d15}
+@=======================INVERSE HADAMARD TRANSFORM================================
+
+ vld4.s16 {d0, d1, d2, d3}, [r0] @load x4,x5,x6,x7
+ vaddl.s16 q12, d0, d3 @x0 = x4 + x7
+ vaddl.s16 q13, d1, d2 @x1 = x5 + x6
+ vsubl.s16 q14, d1, d2 @x2 = x5 - x6
+ vsubl.s16 q15, d0, d3 @x3 = x4 - x7
+
+ vadd.s32 q2, q12, q13 @pi4_tmp_ptr[0] = x0 + x1
+ vadd.s32 q3, q15, q14 @pi4_tmp_ptr[1] = x3 + x2
+ vsub.s32 q4, q12, q13 @pi4_tmp_ptr[2] = x0 - x1
+ vsub.s32 q5, q15, q14 @pi4_tmp_ptr[3] = x3 - x2
+
+ vtrn.32 q2, q3 @Transpose the register for vertical transform
+ vtrn.32 q4, q5
+
+ vswp d5, d8 @Q2 = x4, Q4 = x6
+ vswp d7, d10 @Q3 = x5, Q5 = x7
+
+
+ vadd.s32 q12, q2, q5 @x0 = x4+x7
+ vadd.s32 q13, q3, q4 @x1 = x5+x6
+ vsub.s32 q14, q3, q4 @x2 = x5-x6
+ vsub.s32 q15, q2, q5 @x3 = x4-x7
+
+ vadd.s32 q0, q12, q13 @pi4_tmp_ptr[0] = x0 + x1
+ vadd.s32 q1, q15, q14 @pi4_tmp_ptr[1] = x3 + x2
+ vsub.s32 q2, q12, q13 @pi4_tmp_ptr[2] = x0 - x1
+ vsub.s32 q3, q15, q14 @pi4_tmp_ptr[3] = x3 - x2
+
+
+ vmul.s32 q0, q0, q9 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ vmul.s32 q1, q1, q9 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ vmul.s32 q2, q2, q9 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ vmul.s32 q3, q3, q9 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+ vshl.s32 q0, q0, q10 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+ vshl.s32 q1, q1, q10 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+ vshl.s32 q2, q2, q10 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+ vshl.s32 q3, q3, q10 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+
+ vqrshrn.s32 d0, q0, #0x6 @ D0 = c[i] = ((q[i] + 32) >> 6) where i = 0..3
+ vqrshrn.s32 d1, q1, #0x6 @ D1 = c[i] = ((q[i] + 32) >> 6) where i = 4..7
+ vqrshrn.s32 d2, q2, #0x6 @ D2 = c[i] = ((q[i] + 32) >> 6) where i = 8..11
+ vqrshrn.s32 d3, q3, #0x6 @ D3 = c[i] = ((q[i] + 32) >> 6) where i = 12..15
+
+ vst1.s16 {d0, d1, d2, d3}, [r1] @store the 4x4 output block
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+
+@ *******************************************************************************
+@ */
+@ * @brief This function performs a 2x2 inverse hadamard transform for chroma block
+@ *
+@ * @par Description:
+@ * The DC coefficients pass through a 2-stage inverse hadamard transform.
+@ * This inverse transformed content is then scaled based on the Qp value.
+@ * Both the U and V DC blocks are processed.
+@ *
+@ * @param[in] pi2_src
+@ * input 1x8 block of coefficients; the first 4 are from U and the next 4 from V
+@ *
+@ * @param[out] pi2_out
+@ * output 1x8 block
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * pointer to scaling list
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * pointer to weight matrix
+@ *
+@ * @param[in] u4_qp_div_6
+@ * Floor (qp/6)
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *
+@ *******************************************************************************
+@ */
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
+@ WORD16* pi2_out,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+
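+@ For reference, a rough C sketch for one 2x2 chroma DC block (illustrative
+@ only; the same steps run on the U and V halves in parallel):
+@
+@     x0 = c00 + c01;  x1 = c00 - c01;      // horizontal butterfly
+@     x2 = c10 + c11;  x3 = c10 - c11;
+@     y00 = x0 + x2;   y10 = x0 - x2;       // vertical butterfly
+@     y01 = x1 + x3;   y11 = x1 - x3;
+@     out = (y * pu2_iscal_mat[0] * pu2_weigh_mat[0]) << (u4_qp_div_6 - 5);
+@
+@ i.e. the ((dc * scale) << (qP / 6)) >> 5 chroma DC scaling folded into a
+@ single signed shift by (qP / 6 - 5).
+@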
+ .global ih264_ihadamard_scaling_2x2_uv_a9
+ih264_ihadamard_scaling_2x2_uv_a9:
+
+@Registers used
+@ r0 : *pi2_src
+@ r1 : *pi2_out
+@ r2 : *pu2_iscal_mat
+@ r3 : *pu2_weigh_mat
+
+ vld1.u16 d26[0], [r2]
+ vld1.u16 d27[0], [r3]
+ vmull.u16 q15, d26, d27 @pu2_iscal_mat[0] * pu2_weigh_mat[0]
+ vdup.u32 q15, d30[0]
+
+ vld1.u16 d28[0], [sp] @load qp/6
+
+ vpush {d8-d15}
+
+ vmov.u16 d29, #5
+ vsubl.u16 q14, d28, d29 @qp/6 - 5
+ vdup.s32 q14, d28[0]
+
+ vld2.s16 {d0, d1}, [r0] @load 8 dc coeffs
+ @i2_x4,i2_x6,i2_y4,i2_y6 -> d0
+ @i2_x5,i2_x7,i2_y5,i2_y7 -> d1
+
+ vaddl.s16 q1, d0, d1 @ i4_x0 = i4_x4 + i4_x5;...x2
+ vsubl.s16 q2, d0, d1 @ i4_x1 = i4_x4 - i4_x5;...x3
+
+ vtrn.s32 q1, q2 @i4_x0 i4_x1 -> q1
+
+ vadd.s32 q3, q1, q2 @i4_x4 = i4_x0+i4_x2;.. i4_x5
+ vsub.s32 q1, q1, q2 @i4_x6 = i4_x0-i4_x2;.. i4_x7
+
+ vmul.s32 q5, q3, q15
+ vmul.s32 q6, q1, q15
+
+ vshl.s32 q7, q5, q14
+ vshl.s32 q8, q6, q14
+
+ vmovn.s32 d18, q7 @i4_x4 i4_x5 i4_y4 i4_y5
+ vmovn.s32 d19, q8 @i4_x6 i4_x7 i4_y6 i4_y7
+
+ vst2.s32 {d18-d19}, [r1]
+
+ vpop {d8-d15}
+ bx lr
+
+
diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s
new file mode 100755
index 0000000..afd2860
--- /dev/null
+++ b/common/arm/ih264_inter_pred_chroma_a9q.s
@@ -0,0 +1,254 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_chroma_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Interprediction chroma filter
+@*
+@* @par Description:
+@* Applies filtering to chroma samples as mentioned in
+@* sec 8.4.2.2.2 titled "chroma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] u1_dx
+@* dx value where the sample is to be produced (refer sec 8.4.2.2.2)
+@*
+@* @param[in] u1_dy
+@* dy value where the sample is to be produced (refer sec 8.4.2.2.2)
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_chroma(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ UWORD8 u1_dx,
+@ UWORD8 u1_dy,
+@ WORD32 ht,
+@ WORD32 wd)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => u1_dx
+@ r5 => u1_dy
+@ r6 => height
+@ r7 => width
+@
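+@ For reference, a rough C sketch of the bilinear chroma interpolation this
+@ routine implements (illustrative only; A, B, C and D are the four
+@ neighbouring samples of the same chroma component, which sit two bytes
+@ apart horizontally since U and V are interleaved):
+@
+@     pred = ((8 - u1_dx) * (8 - u1_dy) * A
+@             +      u1_dx * (8 - u1_dy) * B
+@             + (8 - u1_dx) *      u1_dy * C
+@             +      u1_dx *      u1_dy * D + 32) >> 6;
+@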
+.text
+.p2align 2
+
+ .global ih264_inter_pred_chroma_a9q
+
+ih264_inter_pred_chroma_a9q:
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104]
+ ldr r5, [sp, #108]
+ ldr r6, [sp, #112]
+ ldr r7, [sp, #116]
+
+ rsb r8, r4, #8 @8-u1_dx
+ rsb r9, r5, #8 @8-u1_dy
+ mul r10, r8, r9
+ mul r11, r4, r9
+
+ vdup.u8 d28, r10
+ vdup.u8 d29, r11
+
+ mul r10, r8, r5
+ mul r11, r4, r5
+
+ vdup.u8 d30, r10
+ vdup.u8 d31, r11
+
+ subs r12, r7, #2 @if wd=2 branch to loop_2
+ beq loop_2
+ subs r12, r7, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_8:
+ sub r6, #1
+ vld1.8 {d0, d1, d2}, [r0], r2 @ Load row0
+ vld1.8 {d5, d6, d7}, [r0], r2 @ Load row1
+ vext.8 d3, d0, d1, #2
+ vext.8 d8, d5, d6, #2
+
+ vmull.u8 q5, d0, d28
+ vmlal.u8 q5, d5, d30
+ vmlal.u8 q5, d3, d29
+ vmlal.u8 q5, d8, d31
+ vext.8 d9, d6, d7, #2
+ vext.8 d4, d1, d2, #2
+
+inner_loop_8:
+ vmull.u8 q6, d6, d30
+ vmlal.u8 q6, d1, d28
+ vmlal.u8 q6, d9, d31
+ vmlal.u8 q6, d4, d29
+ vmov d0, d5
+ vmov d3, d8
+
+ vqrshrun.s16 d14, q5, #6
+ vmov d1, d6
+ vmov d4, d9
+
+ vld1.8 {d5, d6, d7}, [r0], r2 @ Load row1
+ vqrshrun.s16 d15, q6, #6
+
+ vext.8 d8, d5, d6, #2
+ subs r6, #1
+ vext.8 d9, d6, d7, #2
+ vst1.8 {q7}, [r1], r3 @ Store dest row
+
+ vmull.u8 q5, d0, d28
+ vmlal.u8 q5, d5, d30
+ vmlal.u8 q5, d3, d29
+ vmlal.u8 q5, d8, d31
+ bne inner_loop_8
+
+ vmull.u8 q6, d6, d30
+ vmlal.u8 q6, d1, d28
+ vmlal.u8 q6, d9, d31
+ vmlal.u8 q6, d4, d29
+
+ vqrshrun.s16 d14, q5, #6
+ vqrshrun.s16 d15, q6, #6
+
+ vst1.8 {q7}, [r1], r3 @ Store dest row
+
+ b end_func
+
+loop_4:
+ sub r6, #1
+ vld1.8 {d0, d1}, [r0], r2 @ Load row0
+ vld1.8 {d2, d3}, [r0], r2 @ Load row1
+ vext.8 d1, d0, d1, #2
+ vext.8 d3, d2, d3, #2
+
+ vmull.u8 q2, d2, d30
+ vmlal.u8 q2, d0, d28
+ vmlal.u8 q2, d3, d31
+ vmlal.u8 q2, d1, d29
+
+inner_loop_4:
+ subs r6, #1
+ vmov d0, d2
+ vmov d1, d3
+
+ vld1.8 {d2, d3}, [r0], r2 @ Load row1
+ vqrshrun.s16 d6, q2, #6
+
+ vext.8 d3, d2, d3, #2
+ vst1.8 {d6}, [r1], r3 @ Store dest row
+
+ vmull.u8 q2, d0, d28
+ vmlal.u8 q2, d2, d30
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q2, d3, d31
+ bne inner_loop_4
+
+ vqrshrun.s16 d6, q2, #6
+ vst1.8 {d6}, [r1], r3 @ Store dest row
+
+ b end_func
+
+loop_2:
+ vld1.8 {d0}, [r0], r2 @ Load row0
+ vext.8 d1, d0, d0, #2
+ vld1.8 {d2}, [r0], r2 @ Load row1
+ vext.8 d3, d2, d2, #2
+ vmull.u8 q2, d0, d28
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q2, d2, d30
+ vmlal.u8 q2, d3, d31
+ vld1.8 {d6}, [r0] @ Load row2
+ vqrshrun.s16 d4, q2, #6
+ vext.8 d7, d6, d6, #2
+ vst1.32 d4[0], [r1], r3 @ Store dest row0
+ vmull.u8 q4, d2, d28
+ vmlal.u8 q4, d3, d29
+ vmlal.u8 q4, d6, d30
+ vmlal.u8 q4, d7, d31
+ subs r6, #2
+ vqrshrun.s16 d8, q4, #6
+ vst1.32 d8[0], [r1], r3 @ Store dest row1
+ bne loop_2 @ loop until all rows are processed
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @ Restoring registers from stack
+
diff --git a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
new file mode 100755
index 0000000..ea6bba0
--- /dev/null
+++ b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
@@ -0,0 +1,245 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Interprediction luma filter for horizontal input
+@*
+@* @par Description:
+@* Applies a 6 tap horizontal filter .The output is clipped to 8 bits
+@* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@ @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_horz (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd )
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+
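+@ For reference, a rough C sketch of the per-pixel 6-tap half-pel filter this
+@ routine implements (illustrative only; a0..a5 are six consecutive
+@ horizontal neighbours centred between a2 and a3):
+@
+@     i2_tmp   = a0 - 5 * a1 + 20 * a2 + 20 * a3 - 5 * a4 + a5;
+@     *pu1_dst = CLIP_U8((i2_tmp + 16) >> 5);
+@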
+.text
+.p2align 2
+
+
+ .global ih264_inter_pred_luma_horz_a9q
+
+ih264_inter_pred_luma_horz_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r5, [sp, #104] @Loads ht
+ sub r0, r0, #2 @pu1_src-2
+ ldr r6, [sp, #108] @Loads wd
+ vmov.i8 d0, #5 @filter coeff
+ subs r12, r6, #8 @if wd=8 branch to loop_8
+ vmov.i8 d1, #20 @filter coeff
+ beq loop_8
+
+ subs r12, r6, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_16: @when wd=16
+ @// Processing row0 and row1
+ vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0 ;for checking loop
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1
+ vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0)
+ vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0)
+ vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1)
+ vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vext.8 d27, d6, d7, #2 @//extract a[2] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1)
+ vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0)
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1)
+ vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vext.8 d27, d6, d7, #1 @//extract a[1] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
+ vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+ vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
+ vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row2)
+ vst1.8 {d20, d21}, [r1], r3 @//Store dest row0
+ vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row2)
+ vqrshrun.s16 d24, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
+ vst1.8 {d23, d24}, [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+
+ beq end_func
+ b loop_16 @ loop if height == 8 or 16
+
+loop_8:
+@// Processing row0 and row1
+ vld1.8 {d5, d6}, [r0], r2 @// Load row0
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row0)
+ vld1.8 {d2, d3}, [r0], r2 @// Load row1
+ vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row0)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row1)
+ vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row0)
+ vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row0)
+ vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row0)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row0)
+ vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row1)
+ vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row1)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row1)
+ vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row1)
+ vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row1)
+ vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a3 (column1,row1)
+ vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vst1.8 {d23}, [r1], r3 @//Store dest row0
+ vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vst1.8 {d20}, [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+
+ beq end_func @ Branch if height==4
+
+ b loop_8 @looping if height =8 or 16
+
+loop_4:
+ vld1.8 {d5, d6}, [r0], r2 @// Load row0
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row0)
+ vld1.8 {d2, d3}, [r0], r2 @// Load row1
+ vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row0)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row1)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row0)
+ vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row0)
+ vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row0)
+ vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row0)
+ vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row1)
+ vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row1)
+ vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row1)
+ vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row1)
+ vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row1)
+ vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a3 (column1,row1)
+ vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vst1.32 d23[0], [r1], r3 @//Store dest row0
+ vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vst1.32 d20[0], [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+ beq end_func
+
+ b loop_4
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
new file mode 100755
index 0000000..5b29e02
--- /dev/null
+++ b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
@@ -0,0 +1,301 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_filters_luma_vert_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_vert_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * Interprediction luma filter for vertical input
+@ *
+@ * @par Description:
+@ * Applies a 6 tap vertical filter. The output is clipped to 8 bits
+@ * sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@ *
+@ * @param[in] pu1_src
+@ * UWORD8 pointer to the source
+@ *
+@ * @param[out] pu1_dst
+@ * UWORD8 pointer to the destination
+@ *
+@ * @param[in] src_strd
+@ * integer source stride
+@ *
+@ * @param[in] dst_strd
+@ * integer destination stride
+@ *
+@ * @param[in] ht
+@ * integer height of the array
+@ *
+@ * @param[in] wd
+@ * integer width of the array
+@ *
+@ * @returns
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+
+@void ih264_inter_pred_luma_vert (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd )
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+
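+@ Equivalent operation in C (illustrative sketch only, not the exact code from
+@ ih264_inter_pred_filters.c; CLIP_U8() denotes clipping to 0..255, and the
+@ indexing below is relative to the unadjusted pu1_src, whereas the assembly
+@ pre-decrements the pointer by 2*src_strd):
+@
+@ for(row = 0; row < ht; row++)
+@ {
+@     for(col = 0; col < wd; col++)
+@     {
+@         WORD32 sum =      pu1_src[col - 2 * src_strd] - 5 * pu1_src[col - src_strd]
+@                    + 20 * pu1_src[col]                + 20 * pu1_src[col + src_strd]
+@                    -  5 * pu1_src[col + 2 * src_strd] +      pu1_src[col + 3 * src_strd];
+@         pu1_dst[col] = CLIP_U8((sum + 16) >> 5);
+@     }
+@     pu1_src += src_strd;
+@     pu1_dst += dst_strd;
+@ }
+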
+.text
+.p2align 2
+
+
+ .global ih264_inter_pred_luma_vert_a9q
+
+ih264_inter_pred_luma_vert_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r5, [sp, #104] @Loads ht
+ sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd
+ ldr r6, [sp, #108] @Loads wd
+ vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11
+
+ subs r12, r6, #8 @if wd=8 branch to loop_8
+ vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12
+ beq loop_8
+
+ subs r12, r6, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_16: @when wd=16
+
+ vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0]
+ vaddl.u8 q6, d4, d6 @ temp1 = src[2_0] + src[3_0]
+ vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q7, d0, d10 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q7, q6, q11 @ temp += temp1 * 20
+ vaddl.u8 q10, d1, d11 @ temp4 = src[0_8] + src[5_8]
+ vaddl.u8 q9, d5, d7 @ temp3 = src[2_8] + src[3_8]
+ vmla.u16 q10, q9, q11 @ temp4 += temp3 * 20
+ vld1.u32 {q0}, [r0], r2
+ vaddl.u8 q13, d3, d9 @ temp5 = src[1_8] + src[4_8]
+ vaddl.u8 q6, d6, d8
+ vmls.u16 q7, q8, q12 @ temp -= temp2 * 5
+ vaddl.u8 q8, d2, d0
+ vaddl.u8 q9, d4, d10
+ vmla.u16 q8, q6, q11
+ vmls.u16 q10, q13, q12 @ temp4 -= temp5 * 5
+ vaddl.u8 q13, d5, d11
+ vaddl.u8 q6, d7, d9
+ vqrshrun.s16 d30, q7, #5 @ dst[0_0] = CLIP_U8((temp +16) >> 5)
+ vaddl.u8 q7, d3, d1
+ vld1.u32 {q1}, [r0], r2
+ vmla.u16 q7, q6, q11
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d31, q10, #5 @ dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+ vaddl.u8 q9, d4, d2
+ vaddl.u8 q6, d8, d10
+
+ vst1.u32 {q15}, [r1], r3 @ Vector store to dst[0_0]
+ vmla.u16 q9, q6, q11
+ vaddl.u8 q10, d6, d0
+ vmls.u16 q7, q13, q12
+ vqrshrun.s16 d30, q8, #5
+ vaddl.u8 q6, d9, d11
+ vaddl.u8 q8, d5, d3
+ vaddl.u8 q13, d7, d1
+ vmla.u16 q8, q6, q11
+ vmls.u16 q9, q10, q12
+ vld1.u32 {q2}, [r0], r2
+
+ vqrshrun.s16 d31, q7, #5
+ vaddl.u8 q6, d10, d0
+ vaddl.u8 q7, d6, d4
+ vaddl.u8 q10, d8, d2
+ vmla.u16 q7, q6, q11
+ vmls.u16 q8, q13, q12
+ vst1.u32 {q15}, [r1], r3 @store row 1
+ vqrshrun.s16 d30, q9, #5
+ vaddl.u8 q9, d7, d5
+ vaddl.u8 q6, d11, d1
+ vmla.u16 q9, q6, q11
+ vaddl.u8 q13, d9, d3
+ vmls.u16 q7, q10, q12
+
+ vqrshrun.s16 d31, q8, #5
+ vmls.u16 q9, q13, q12
+ vaddl.u8 q6, d0, d2 @ temp1 = src[2_0] + src[3_0]
+ vst1.u32 {q15}, [r1], r3 @store row 2
+ vaddl.u8 q8, d10, d4 @ temp2 = src[1_0] + src[4_0]
+ vaddl.u8 q10, d9, d7 @ temp4 = src[0_8] + src[5_8]
+ vqrshrun.s16 d30, q7, #5
+ vaddl.u8 q13, d5, d11 @ temp5 = src[1_8] + src[4_8]
+ vaddl.u8 q7, d8, d6 @ temp = src[0_0] + src[5_0]
+ vqrshrun.s16 d31, q9, #5
+ vmla.u16 q7, q6, q11 @ temp += temp1 * 20
+ vaddl.u8 q9, d1, d3 @ temp3 = src[2_8] + src[3_8]
+ vst1.u32 {q15}, [r1], r3 @store row 3
+ subs r5, r5, #4 @ 4 rows processed, decrement by 4
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_16 @ looping if height = 8 or 16
+
+loop_8:
+@// Processing row0 and row1
+
+ vld1.u32 d0, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 d1, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 d2, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 d3, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 d4, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 d5, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q4, q3, q11 @ temp += temp1 * 20
+ vld1.u32 d6, [r0], r2
+ vaddl.u8 q7, d3, d4
+ vaddl.u8 q8, d1, d6
+ vaddl.u8 q9, d2, d5
+ vmls.u16 q4, q5, q12 @ temp -= temp2 * 5
+ vmla.u16 q8, q7, q11
+ vld1.u32 d7, [r0], r2
+ vaddl.u8 q10, d4, d5
+ vaddl.u8 q6, d2, d7
+ vaddl.u8 q5, d3, d6
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+ vmla.u16 q6, q10, q11
+ vld1.u32 d0, [r0], r2
+ vaddl.u8 q7, d5, d6
+ vqrshrun.s16 d27, q8, #5
+ vaddl.u8 q10, d3, d0
+ vmls.u16 q6, q5, q12
+ vst1.u32 d26, [r1], r3 @ Vector store to dst[0_0]
+ vaddl.u8 q9, d4, d7
+ vmla.u16 q10, q7, q11
+ vst1.u32 d27, [r1], r3
+ vqrshrun.s16 d28, q6, #5
+ vst1.u32 d28, [r1], r3
+ vmls.u16 q10, q9, q12
+ vqrshrun.s16 d29, q10, #5
+ vst1.u32 d29, [r1], r3 @store row 3
+
+ subs r5, r5, #4 @ 4 rows processed, decrement by 4
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_8 @looping if height == 8 or 16
+
+
+loop_4:
+@// Processing row0 and row1
+
+ vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 d2[0], [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 d3[0], [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 d4[0], [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 d5[0], [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q4, q3, q11 @ temp += temp1 * 20
+ vld1.u32 d6[0], [r0], r2
+ vaddl.u8 q7, d3, d4
+ vaddl.u8 q8, d1, d6
+ vaddl.u8 q9, d2, d5
+ vmls.u16 q4, q5, q12 @ temp -= temp2 * 5
+ vld1.u32 d7[0], [r0], r2
+ vmla.u16 q8, q7, q11
+ vaddl.u8 q10, d4, d5
+ vaddl.u8 q6, d2, d7
+ vaddl.u8 q5, d3, d6
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+ vmla.u16 q6, q10, q11
+ vld1.u32 d0[0], [r0], r2
+ vaddl.u8 q7, d5, d6
+ vqrshrun.s16 d27, q8, #5
+ vaddl.u8 q10, d3, d0
+ vmls.u16 q6, q5, q12
+ vst1.u32 d26[0], [r1], r3 @ Vector store to dst[0_0]
+ vaddl.u8 q9, d4, d7
+ vmla.u16 q10, q7, q11
+ vst1.u32 d27[0], [r1], r3
+ vqrshrun.s16 d28, q6, #5
+ vst1.u32 d28[0], [r1], r3
+ vmls.u16 q10, q9, q12
+ vqrshrun.s16 d29, q10, #5
+ vst1.u32 d29[0], [r1], r3 @store row 3
+
+ subs r5, r5, #8
+ subeq r0, r0, r2, lsl #2
+ subeq r0, r0, r2
+ beq loop_4 @ Loop if height==8
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_bilinear_a9q.s b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
new file mode 100755
index 0000000..6a3c83d
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
@@ -0,0 +1,398 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_bilinear_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_bilinear_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+
+@/**
+@ *******************************************************************************
+@ * function:ih264_inter_pred_luma_bilinear
+@ *
+@* @brief
+@* This routine applies the bilinear filter to the predictors.
+@* The filtering operation is described in
+@* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @par Description:
+@* @note
+@* This function is called to obtain pixels lying at the following
+@* locations: (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2), (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
+@* The function averages corresponding samples from the two input arrays.
+@*
+@*
+@* @param[in] pu1_src1:
+@* UWORD8 Pointer to the buffer containing the first input array.
+@*
+@* @param[in] pu1_src2:
+@* UWORD8 Pointer to the buffer containing the second input array.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output of bilinear filter is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the first input buffer
+@*
+@* @param[in] src_strd2
+@* Stride of the second input buffer
+@*
+@* @param[in] dst_strd
+@* integer destination stride of pu1_dst
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 height,
+@ WORD32 width)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src1
+@ r1 => *pu1_src2
+@ r2 => *pu1_dst
+@ r3 => src_strd1
+@ r4 => src_strd2
+@ r5 => dst_strd
+@ r6 => height
+@ r7 => width
+@
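+@ Equivalent operation in C (illustrative sketch only, not the exact code from
+@ ih264_inter_pred_filters.c): each output sample is the rounded average of the
+@ corresponding samples of the two input arrays, matching the vaddl/vqrshrun #1
+@ pairs used below.
+@
+@ for(row = 0; row < height; row++)
+@ {
+@     for(col = 0; col < width; col++)
+@         pu1_dst[col] = (pu1_src1[col] + pu1_src2[col] + 1) >> 1;
+@     pu1_src1 += src_strd1;
+@     pu1_src2 += src_strd2;
+@     pu1_dst  += dst_strd;
+@ }
+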
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_bilinear_a9q
+
+ih264_inter_pred_luma_bilinear_a9q:
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104]
+ ldr r5, [sp, #108] @
+ ldr r6, [sp, #112]
+ ldr r7, [sp, #116]
+
+ subs r12, r7, #4 @if wd=4 branch to loop_4
+ beq loop_4
+ subs r12, r7, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+loop_16: @when wd=16
+
+ vld1.8 {q0}, [r0], r3 @// Load row0 ;src1
+ vld1.8 {q2}, [r1], r4 @// Load row0 ;src2
+ vld1.8 {q1}, [r0], r3 @// Load row1 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.8 {q3}, [r1], r4 @// Load row1 ;src2
+ vaddl.u8 q11, d1, d5
+ vld1.8 {q4}, [r0], r3 @// Load row2 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q5}, [r0], r3 @// Load row3 ;src1
+ vaddl.u8 q13, d3, d7
+ vld1.8 {q6}, [r1], r4 @// Load row2 ;src2
+ vaddl.u8 q8, d8, d12
+ vld1.8 {q7}, [r1], r4 @// Load row3 ;src2
+ vaddl.u8 q9, d9, d13
+ vqrshrun.s16 d28, q10, #1
+ vqrshrun.s16 d29, q11, #1
+ vaddl.u8 q10, d10, d14
+ vqrshrun.s16 d30, q12, #1
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row0
+ vaddl.u8 q11, d11, d15
+ vst1.8 {q15}, [r2], r5 @//Store dest row1
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {q0}, [r0], r3 @// Load row4 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {q1}, [r0], r3 @// Load row5 ;src1
+ vqrshrun.s16 d30, q10, #1
+ vld1.8 {q2}, [r1], r4 @// Load row4 ;src2
+ vqrshrun.s16 d31, q11, #1
+ vld1.8 {q3}, [r1], r4 @// Load row5 ;src2
+ vaddl.u8 q10, d0, d4
+ vst1.8 {q14}, [r2], r5 @//Store dest row2
+ vaddl.u8 q13, d3, d7
+ vst1.8 {q15}, [r2], r5 @//Store dest row3
+ vaddl.u8 q11, d1, d5
+ vld1.8 {q4}, [r0], r3 @// Load row6 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q5}, [r0], r3 @// Load row7 ;src1
+ vqrshrun.s16 d28, q10, #1
+ vld1.8 {q6}, [r1], r4 @// Load row6 ;src2
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {q7}, [r1], r4 @// Load row7 ;src2
+ vaddl.u8 q8, d8, d12
+ vaddl.u8 q9, d9, d13
+ vaddl.u8 q10, d10, d14
+ vqrshrun.s16 d30, q12, #1
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row4
+ vaddl.u8 q11, d11, d15
+ vst1.8 {q15}, [r2], r5 @//Store dest row5
+ vqrshrun.s16 d28, q8, #1
+ vqrshrun.s16 d30, q10, #1
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {q2}, [r1], r4 @// Load row8 ;src2
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row6
+ subs r12, r6, #8
+ vst1.8 {q15}, [r2], r5 @//Store dest row7
+
+ beq end_func @ end function if ht=8
+
+ vld1.8 {q0}, [r0], r3 @// Load row8 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.8 {q1}, [r0], r3 @// Load row9 ;src1
+ vaddl.u8 q11, d1, d5
+ vld1.8 {q3}, [r1], r4 @// Load row9 ;src2
+ vqrshrun.s16 d28, q10, #1
+ vld1.8 {q4}, [r0], r3 @// Load row10 ;src1
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {q5}, [r0], r3 @// Load row11 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q6}, [r1], r4 @// Load row10 ;src2
+ vaddl.u8 q13, d3, d7
+ vld1.8 {q7}, [r1], r4 @// Load row11 ;src2
+ vaddl.u8 q8, d8, d12
+ vaddl.u8 q9, d9, d13
+ vaddl.u8 q10, d10, d14
+ vqrshrun.s16 d30, q12, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row8
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {q15}, [r2], r5 @//Store dest row9
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {q0}, [r0], r3 @// Load row12 ;src1
+ vaddl.u8 q11, d11, d15
+ vld1.8 {q1}, [r0], r3 @// Load row13 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {q2}, [r1], r4 @// Load row12 ;src2
+ vqrshrun.s16 d30, q10, #1
+ vld1.8 {q3}, [r1], r4 @// Load row13 ;src2
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row10
+ vaddl.u8 q10, d0, d4
+ vst1.8 {q15}, [r2], r5 @//Store dest row11
+ vaddl.u8 q11, d1, d5
+ vld1.8 {q4}, [r0], r3 @// Load row14 ;src1
+ vaddl.u8 q13, d3, d7
+ vld1.8 {q5}, [r0], r3 @// Load row15 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q6}, [r1], r4 @// Load row14 ;src2
+ vaddl.u8 q8, d8, d12
+ vld1.8 {q7}, [r1], r4 @// Load row15 ;src2
+ vaddl.u8 q9, d9, d13
+ vqrshrun.s16 d28, q10, #1
+ vqrshrun.s16 d29, q11, #1
+ vaddl.u8 q10, d10, d14
+ vst1.8 {q14}, [r2], r5 @//Store dest row12
+ vqrshrun.s16 d30, q12, #1
+ vqrshrun.s16 d31, q13, #1
+ vaddl.u8 q11, d11, d15
+ vst1.8 {q15}, [r2], r5 @//Store dest row13
+ vqrshrun.s16 d28, q8, #1
+ vqrshrun.s16 d29, q9, #1
+ vqrshrun.s16 d30, q10, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row14
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {q15}, [r2], r5 @//Store dest row15
+ b end_func
+
+
+
+loop_8: @wd=8;
+ vld1.8 {d0}, [r0], r3 @// Load row0 ;src1
+ vld1.8 {d4}, [r1], r4 @// Load row0 ;src2
+ vld1.8 {d1}, [r0], r3 @// Load row1 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.8 {d5}, [r1], r4 @// Load row1 ;src2
+ vld1.8 {d2}, [r0], r3 @// Load row2 ;src1
+ vqrshrun.s16 d28, q10, #1
+ vld1.8 {d6}, [r1], r4 @// Load row2 ;src2
+ vaddl.u8 q11, d1, d5
+ vld1.8 {d3}, [r0], r3 @// Load row3 ;src1
+ vaddl.u8 q12, d2, d6
+ vst1.8 {d28}, [r2], r5 @//Store dest row0
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {d7}, [r1], r4 @// Load row3 ;src2
+ vqrshrun.s16 d30, q12, #1
+ vst1.8 {d29}, [r2], r5 @//Store dest row1
+ vaddl.u8 q13, d3, d7
+ vst1.8 {d30}, [r2], r5 @//Store dest row2
+ vqrshrun.s16 d31, q13, #1
+ subs r12, r6, #4
+ vst1.8 {d31}, [r2], r5 @//Store dest row3
+ beq end_func @ end function if ht=4
+
+ vld1.8 {d12}, [r1], r4 @// Load row4 ;src2
+ vld1.8 {d8}, [r0], r3 @// Load row4 ;src1
+ vld1.8 {d9}, [r0], r3 @// Load row5 ;src1
+ vaddl.u8 q8, d8, d12
+ vld1.8 {d13}, [r1], r4 @// Load row5 ;src2
+ vld1.8 {d10}, [r0], r3 @// Load row6;src1
+ vaddl.u8 q9, d9, d13
+ vld1.8 {d14}, [r1], r4 @// Load row6 ;src2
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {d11}, [r0], r3 @// Load row7 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vst1.8 {d28}, [r2], r5 @//Store dest row4
+ vaddl.u8 q10, d10, d14
+ vst1.8 {d29}, [r2], r5 @//Store dest row5
+ vqrshrun.s16 d30, q10, #1
+ vld1.8 {d15}, [r1], r4 @// Load row7 ;src2
+ vaddl.u8 q11, d11, d15
+ vst1.8 {d30}, [r2], r5 @//Store dest row6
+ vqrshrun.s16 d31, q11, #1
+ subs r12, r6, #8
+ vst1.8 {d31}, [r2], r5 @//Store dest row7
+ beq end_func @ end function if ht=8
+
+ vld1.8 {d0}, [r0], r3 @// Load row8 ;src1
+ vld1.8 {d4}, [r1], r4 @// Load row8 ;src2
+ vld1.8 {d1}, [r0], r3 @// Load row9 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.8 {d5}, [r1], r4 @// Load row9 ;src2
+ vld1.8 {d2}, [r0], r3 @// Load row10 ;src1
+ vaddl.u8 q11, d1, d5
+ vld1.8 {d6}, [r1], r4 @// Load row10 ;src2
+ vqrshrun.s16 d28, q10, #1
+ vld1.8 {d3}, [r0], r3 @// Load row11 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {d7}, [r1], r4 @// Load row11 ;src2
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {d8}, [r0], r3 @// Load row12 ;src1
+ vaddl.u8 q13, d3, d7
+ vst1.8 {d28}, [r2], r5 @//Store dest row8
+ vqrshrun.s16 d30, q12, #1
+ vld1.8 {d12}, [r1], r4 @// Load row12 ;src2
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {d29}, [r2], r5 @//Store dest row9
+ vaddl.u8 q8, d8, d12
+ vld1.8 {d9}, [r0], r3 @// Load row13 ;src1
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {d13}, [r1], r4 @// Load row13 ;src2
+ vld1.8 {d10}, [r0], r3 @// Load row14;src1
+ vaddl.u8 q9, d9, d13
+ vld1.8 {d11}, [r0], r3 @// Load row15 ;src1
+ vld1.8 {d14}, [r1], r4 @// Load row14 ;src2
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {d15}, [r1], r4 @// Load row15 ;src2
+ vaddl.u8 q10, d10, d14
+ vst1.8 {d30}, [r2], r5 @//Store dest row10
+ vaddl.u8 q11, d11, d15
+ vst1.8 {d31}, [r2], r5 @//Store dest row11
+ vqrshrun.s16 d30, q10, #1
+ vst1.8 {d28}, [r2], r5 @//Store dest row12
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {d29}, [r2], r5 @//Store dest row13
+ vst1.8 {d30}, [r2], r5 @//Store dest row14
+ vst1.8 {d31}, [r2], r5 @//Store dest row15
+
+ b end_func
+
+
+
+loop_4:
+ vld1.32 d0[0], [r0], r3 @// Load row0 ;src1
+ vld1.32 d4[0], [r1], r4 @// Load row0 ;src2
+ vld1.32 d1[0], [r0], r3 @// Load row1 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.32 d5[0], [r1], r4 @// Load row1 ;src2
+ vld1.32 d2[0], [r0], r3 @// Load row2 ;src1
+ vqrshrun.s16 d28, q10, #1
+ vld1.32 d6[0], [r1], r4 @// Load row2 ;src2
+ vaddl.u8 q11, d1, d5
+ vld1.32 d3[0], [r0], r3 @// Load row3 ;src1
+ vaddl.u8 q12, d2, d6
+ vst1.32 d28[0], [r2], r5 @//Store dest row0
+ vqrshrun.s16 d29, q11, #1
+ vld1.32 d7[0], [r1], r4 @// Load row3 ;src2
+ vqrshrun.s16 d30, q12, #1
+ vst1.32 d29[0], [r2], r5 @//Store dest row1
+ vaddl.u8 q13, d3, d7
+ vst1.32 d30[0], [r2], r5 @//Store dest row2
+ vqrshrun.s16 d31, q13, #1
+ subs r12, r6, #4
+ vst1.32 d31[0], [r2], r5 @//Store dest row3
+ beq end_func @ end function if ht=4
+
+ vld1.32 d12[0], [r1], r4 @// Load row4 ;src2
+ vld1.32 d8[0], [r0], r3 @// Load row4 ;src1
+ vld1.32 d9[0], [r0], r3 @// Load row5 ;src1
+ vaddl.u8 q8, d8, d12
+ vld1.32 d13[0], [r1], r4 @// Load row5 ;src2
+ vld1.32 d10[0], [r0], r3 @// Load row6;src1
+ vaddl.u8 q9, d9, d13
+ vld1.32 d14[0], [r1], r4 @// Load row6 ;src2
+ vqrshrun.s16 d28, q8, #1
+ vld1.32 d11[0], [r0], r3 @// Load row7 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vst1.32 d28[0], [r2], r5 @//Store dest row4
+ vaddl.u8 q10, d10, d14
+ vst1.32 d29[0], [r2], r5 @//Store dest row5
+ vqrshrun.s16 d30, q10, #1
+ vld1.32 d15[0], [r1], r4 @// Load row7 ;src2
+ vaddl.u8 q11, d11, d15
+ vst1.32 d30[0], [r2], r5 @//Store dest row6
+ vqrshrun.s16 d31, q11, #1
+ vst1.32 d31[0], [r2], r5 @//Store dest row7
+
+end_func:
+
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_copy_a9q.s b/common/arm/ih264_inter_pred_luma_copy_a9q.s
new file mode 100755
index 0000000..8ba2fbf
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_copy_a9q.s
@@ -0,0 +1,253 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Interprediction luma function for copy
+@*
+@* @par Description:
+@* Copies the array of width 'wd' and height 'ht' from the location pointed
+@* by 'src' to the location pointed by 'dst'
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_inter_pred_luma_copy (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd )
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r7 => ht
+@ r12 => wd
+
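+@ Equivalent operation in C (illustrative sketch only; the assembly below unrolls
+@ this by 4 rows and picks a 4/8/16 wide inner loop based on wd):
+@
+@ for(row = 0; row < ht; row++)
+@ {
+@     for(col = 0; col < wd; col++)
+@         pu1_dst[col] = pu1_src[col];
+@     pu1_src += src_strd;
+@     pu1_dst += dst_strd;
+@ }
+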
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_copy_a9q
+
+ih264_inter_pred_luma_copy_a9q:
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r12, [sp, #108] @Loads wd
+ ldr r7, [sp, #104] @Loads ht
+ cmp r7, #0 @checks ht == 0
+ ble end_loops
+ tst r12, #15 @checks if wd is a multiple of 16
+ beq core_loop_wd_16
+ tst r12, #7 @checks if wd is a multiple of 8
+ beq core_loop_wd_8
+ sub r11, r12, #4
+
+outer_loop_wd_4:
+ subs r4, r12, #0 @checks wd == 0
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+ vld1.32 {d0[0]}, [r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r5, r0, r2 @pu1_src_tmp += src_strd
+ add r6, r1, r3 @pu1_dst_tmp += dst_strd
+ vst1.32 {d0[0]}, [r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r0, r0, #4 @pu1_src += 4
+ vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ subs r4, r4, #4 @(wd -4)
+ vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r1, r1, #4 @pu1_dst += 4
+ vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs r7, r7, #4 @ht - 4
+ sub r0, r5, r11 @pu1_src = pu1_src_tmp
+ sub r1, r6, r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_4
+
+end_loops:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+
+core_loop_wd_8:
+ sub r11, r12, #8
+
+outer_loop_wd_8:
+ subs r4, r12, #0 @checks wd
+ ble end_inner_loop_wd_8
+
+inner_loop_wd_8:
+ add r5, r0, r2 @pu1_src_tmp += src_strd
+ vld1.8 {d0}, [r0]! @vld1_u8(pu1_src_tmp)
+ add r6, r1, r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {d0}, [r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {d1}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d1}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ subs r4, r4, #8 @wd - 8(Loop condition)
+ vld1.8 {d2}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d2}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {d3}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d3}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_8
+
+end_inner_loop_wd_8:
+ subs r7, r7, #4 @ht -= 4
+ sub r0, r5, r11 @pu1_src = pu1_src_tmp
+ sub r1, r6, r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_8
+
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+core_loop_wd_16:
+ sub r11, r12, #16
+
+outer_loop_wd_16:
+ subs r4, r12, #0 @checks wd
+ ble end_inner_loop_wd_16
+
+inner_loop_wd_16:
+ add r5, r0, r2 @pu1_src_tmp += src_strd
+ vld1.8 {q0}, [r0]! @vld1_u8(pu1_src_tmp)
+ add r6, r1, r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {q0}, [r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {q1}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q1}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ subs r4, r4, #16 @wd - 16(Loop condition)
+ vld1.8 {q2}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q2}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {q3}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q3}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_16
+
+end_inner_loop_wd_16:
+ subs r7, r7, #4 @ht -= 4
+ sub r0, r5, r11 @pu1_src = pu1_src_tmp
+ sub r1, r6, r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_16
+
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+@ /*
+@ ********************************************************************************
+@ *
+@ * @brief This function copies a 4x4 block to destination
+@ *
+@ * @par Description:
+@ * Copies a 4x4 block to destination, where both src and dst are interleaved
+@ *
+@ * @param[in] pi2_src
+@ * Source
+@ *
+@ * @param[in] pu1_out
+@ * Output pointer
+@ *
+@ * @param[in] pred_strd
+@ * Prediction buffer stride
+@ *
+@ * @param[in] out_strd
+@ * Output buffer stride
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ * Currently wd and ht are not used, i.e. a 4x4 block is always copied
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_interleave_copy(WORD16 *pi2_src,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ WORD32 wd,
+@ WORD32 ht)
+@ Register Usage
+@ r0 : pi2_src
+@ r1 : pu1_out
+@ r2 : pred_strd
+@ r3 : out_strd
+@ Neon registers d0-d7, d16-d30 are used
+@ No need for pushing arm and neon registers
+
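+@ Illustrative C sketch of the operation (assumed semantics, derived from the
+@ 0x00ff mask below: the low byte of every halfword of the output rows is
+@ overwritten with the corresponding byte of the source rows, the remaining
+@ bytes are left untouched; strides are treated as byte strides, as in the asm):
+@
+@ UWORD8 *pu1_src = (UWORD8 *)pi2_src;
+@ for(row = 0; row < 4; row++)
+@ {
+@     for(col = 0; col < 4; col++)
+@         pu1_out[2 * col] = pu1_src[2 * col];
+@     pu1_src += pred_strd;
+@     pu1_out += out_strd;
+@ }
+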
+ .global ih264_interleave_copy_a9
+ih264_interleave_copy_a9:
+
+ vld1.u8 d2, [r0], r2 @load src plane 1 => d2 & plane 2 => d3
+ vld1.u8 d3, [r0], r2
+ vld1.u8 d4, [r0], r2
+ vld1.u8 d5, [r0], r2
+
+ mov r0, r1
+
+ vld1.u8 d18, [r1], r3 @load out (8 bit size) - 8 coeffs
+ vld1.u8 d19, [r1], r3
+ vmov.u16 q15, #0x00ff
+ vld1.u8 d20, [r1], r3
+ vld1.u8 d21, [r1], r3
+
+ vbit.u8 q9, q1, q15
+ vbit.u8 q10, q2, q15
+
+ vst1.u8 d18, [r0], r3 @store out
+ vst1.u8 d19, [r0], r3
+ vst1.u8 d20, [r0], r3
+ vst1.u8 d21, [r0], r3
+
+ bx lr
+
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
new file mode 100755
index 0000000..43321a8
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
@@ -0,0 +1,441 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements a two stage cascaded six tap filter. It
+@* applies the six tap filter in the vertical direction on the
+@* predictor values, followed by applying the same filter in the
+@* horizontal direction on the output of the first stage. The six tap
+@* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
+@* interpolation process"
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/2,1/2). The function interpolates
+@* the predictors first in the vertical direction and then in the
+@* horizontal direction to output the (1/2,1/2).
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations: UNUSED in this function.
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/;
+
+@void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r8 => ht
+@ r9 => wd
+
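+@ Illustrative C sketch of the cascade (not the exact reference code from
+@ ih264_inter_pred_filters.c; pi2_tmp is an assumed name for a scratch row of
+@ 16-bit sums indexed so that negative offsets are valid, CLIP_U8() denotes
+@ clipping to 0..255, and indexing is relative to the unadjusted pu1_src):
+@
+@ for(row = 0; row < ht; row++)
+@ {
+@     /* first stage: vertical 6-tap filter kept at full precision */
+@     for(col = -2; col < wd + 3; col++)
+@         pi2_tmp[col] =      pu1_src[col - 2 * src_strd] - 5 * pu1_src[col - src_strd]
+@                      + 20 * pu1_src[col]                + 20 * pu1_src[col + src_strd]
+@                      -  5 * pu1_src[col + 2 * src_strd] +      pu1_src[col + 3 * src_strd];
+@
+@     /* second stage: horizontal 6-tap filter on the intermediate sums */
+@     for(col = 0; col < wd; col++)
+@     {
+@         WORD32 sum =      pi2_tmp[col - 2] - 5 * pi2_tmp[col - 1]
+@                    + 20 * pi2_tmp[col]     + 20 * pi2_tmp[col + 1]
+@                    -  5 * pi2_tmp[col + 2] +      pi2_tmp[col + 3];
+@         pu1_dst[col] = CLIP_U8((sum + 512) >> 10);
+@     }
+@     pu1_src += src_strd;
+@     pu1_dst += dst_strd;
+@ }
+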
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q
+
+ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r8, [sp, #104] @ loads ht
+ sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd
+ sub r0, r0, #2 @pu1_src-2
+ ldr r9, [sp, #108] @ loads wd
+
+ vmov.s16 d0, #20 @ Filter coeff 20
+ vmov.s16 d1, #5 @ Filter coeff 5
+ subs r12, r9, #4 @if wd=4 branch to loop_4
+ beq loop_4
+ subs r12, r9, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+ mov r10, #8
+ sub r7, r3, r10
+ @when wd=16
+
+loop_16:
+ vld1.u32 {d2, d3, d4}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {d5, d6, d7}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {d8, d9, d10}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {d11, d12, d13}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {d14, d15, d16}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {d17, d18, d19}, [r0], r2 @ Vector load from src[5_0]
+
+ @ VERTICAL FILTERING FOR ROW 0
+ vaddl.u8 q10, d8, d11 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q12, d2, d17 @ temp2 = src[0_0] + src[5_0]
+ vaddl.u8 q11, d5, d14 @ temp = src[1_0] + src[4_0]
+ vaddl.u8 q13, d3, d18 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q10, d6, d15 @ temp = src[1_0] + src[4_0]
+ vaddl.u8 q11, d9, d12 @ temp3 = src[2_0] + src[3_0]
+ vaddl.u8 q14, d4, d19 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q13, q11, d0[0] @ temp4 += temp3 * 20
+ vmls.s16 q13, q10, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q11, d10, d13 @ temp3 = src[2_0] + src[3_0]
+ vaddl.u8 q10, d7, d16 @ temp = src[1_0] + src[4_0]
+ vmla.u16 q14, q11, d0[0] @ temp4 += temp3 * 20
+ vmls.s16 q14, q10, d1[0] @ temp -= temp2 * 5
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+
+ @Q12,Q13,Q14 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 0
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q1, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vmlal.s16 q1, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vext.16 q11, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q1, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vext.16 q11, q12, q13, #4 @//extract a[4] (column1)
+ vext.16 q10, q13, q14, #5 @//extract a[5] (column2)
+ vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d22, q1, #10
+ vqrshrun.s32 d23, q15, #10
+ vqshrun.s16 d22, q11, #0
+ vst1.u8 {d22}, [r1], r10 @//Store dest row0, column 1; (1/2,1/2)
+ vext.16 q11, q13, q14, #2 @//extract a[2] (column2)
+ vaddl.s16 q1, d20, d26 @// a0 + a5 (column2)
+ vaddl.s16 q15, d21, d27 @// a0 + a5 (column2)
+ vmlal.s16 q1, d22, d0[0] @// a0 + a5 + 20a2 (column2)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column2)
+ vext.16 q10, q13, q14, #3 @//extract a[3] (column2)
+ vext.16 q11, q13, q14, #1 @//extract a[1] (column2)
+ vmlal.s16 q1, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vext.16 q10, q13, q14, #4 @//extract a[4] (column2)
+ vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q1, d20, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vmlsl.s16 q15, d21, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vqrshrun.s32 d20, q1, #10
+ vqrshrun.s32 d21, q15, #10
+ vld1.u32 {d2, d3, d4}, [r0], r2 @ Vector load from src[6_0]
+ vqshrun.s16 d22, q10, #0
+ vst1.u8 {d22}, [r1], r7 @//Store dest row0 ,column 2; (1/2,1/2)
+
+ @ VERTICAL FILTERING FOR ROW 1
+ vaddl.u8 q10, d11, d14 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q12, d5, d2 @ temp2 = src[0_0] + src[5_0]
+ vaddl.u8 q11, d8, d17 @ temp = src[1_0] + src[4_0]
+ vaddl.u8 q13, d6, d3 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vaddl.u8 q10, d9, d18 @ temp = src[1_0] + src[4_0]
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q11, d12, d15 @ temp3 = src[2_0] + src[3_0]
+ vaddl.u8 q14, d7, d4 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q13, q11, d0[0] @ temp4 += temp3 * 20
+ vaddl.u8 q11, d13, d16 @ temp3 = src[2_0] + src[3_0]
+ vmls.s16 q13, q10, d1[0] @ temp -= temp2 * 5
+ vmla.u16 q14, q11, d0[0] @ temp4 += temp3 * 20
+ vaddl.u8 q10, d10, d19 @ temp = src[1_0] + src[4_0]
+ vmls.s16 q14, q10, d1[0] @ temp -= temp2 * 5
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+
+ @Q12,Q13,Q14 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 1
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q3, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vmlal.s16 q3, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vext.16 q11, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q3, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vext.16 q11, q12, q13, #4 @//extract a[4] (column1)
+ vext.16 q10, q13, q14, #5 @//extract a[5] (column2)
+ vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d22, q3, #10
+ vqrshrun.s32 d23, q15, #10
+ vqshrun.s16 d22, q11, #0
+ vst1.u8 {d22}, [r1], r10 @//Store dest row1, column 1; (1/2,1/2)
+ vext.16 q11, q13, q14, #2 @//extract a[2] (column2)
+ vaddl.s16 q3, d20, d26 @// a0 + a5 (column2)
+ vaddl.s16 q15, d21, d27 @// a0 + a5 (column2)
+ vmlal.s16 q3, d22, d0[0] @// a0 + a5 + 20a2 (column2)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column2)
+ vext.16 q10, q13, q14, #3 @//extract a[3] (column2)
+ vext.16 q11, q13, q14, #1 @//extract a[1] (column2)
+ vmlal.s16 q3, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vext.16 q10, q13, q14, #4 @//extract a[4] (column2)
+ vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q3, d20, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vmlsl.s16 q15, d21, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vqrshrun.s32 d20, q3, #10
+ vqrshrun.s32 d21, q15, #10
+ vqshrun.s16 d22, q10, #0
+ vst1.u8 {d22}, [r1], r7 @//Store dest row1 ,column 2; (1/2,1/2)
+
+ subs r8, r8, #2 @ 2 rows processed, decrement by 2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_16 @ looping if height = 8 or 16
+
+loop_8:
+ vld1.u32 {d2, d3}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {d4, d5}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {d6, d7}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {d8, d9}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {d10, d11}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {d12, d13}, [r0], r2 @ Vector load from src[5_0]
+
+ @ VERTICAL FILTERING FOR ROW 0
+ vaddl.u8 q10, d6, d8 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d4, d10 @ temp2 = src[1_0] + src[4_0]
+ vaddl.u8 q12, d2, d12 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d3, d13 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q14, d7, d9 @ temp1 = src[2_0] + src[3_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d5, d11 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 0
+
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vext.16 q1, q12, q13, #4 @//extract a[4] (column1)
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d2, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vld1.u32 {d14, d15}, [r0], r2 @ Vector load from src[6_0]
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d3, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+
+ vaddl.u8 q12, d4, d14 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d5, d15 @ temp = src[0_0] + src[5_0]
+ vqrshrun.s32 d18, q14, #10
+ vaddl.u8 q14, d9, d11 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q10, d8, d10 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d6, d12 @ temp2 = src[1_0] + src[4_0]
+ vqrshrun.s32 d19, q15, #10
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d7, d13 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+ vqshrun.s16 d2, q9, #0
+ @ VERTICAL FILTERING FOR ROW 1
+
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 1
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vst1.u8 {d2}, [r1], r3 @//Store dest row0, column 1; (1/2,1/2)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vext.16 q2, q12, q13, #4 @//extract a[4] (column1)
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d4, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d5, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d18, q14, #10
+ vqrshrun.s32 d19, q15, #10
+ vqshrun.s16 d3, q9, #0
+ vst1.u8 {d3}, [r1], r3 @//Store dest row1, column 1; (1/2,1/2)
+
+ subs r8, r8, #2 @ 2 rows processed, decrement by 2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_8 @looping if height == 8 or 16
+
+loop_4:
+ vld1.u32 {d2, d3}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {d4, d5}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {d6, d7}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {d8, d9}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {d10, d11}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {d12, d13}, [r0], r2 @ Vector load from src[5_0]
+
+ @ VERTICAL FILTERING FOR ROW 0
+ vaddl.u8 q10, d6, d8 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d4, d10 @ temp2 = src[1_0] + src[4_0]
+ vaddl.u8 q12, d2, d12 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d3, d13 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q14, d7, d9 @ temp1 = src[2_0] + src[3_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d5, d11 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 0
+
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+
+ vext.16 q1, q12, q13, #4 @//extract a[4] (column1)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d2, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vld1.u32 {d14, d15}, [r0], r2 @ Vector load from src[6_0]
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d3, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vaddl.u8 q12, d4, d14 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d5, d15 @ temp = src[0_0] + src[5_0]
+ vqrshrun.s32 d18, q14, #10
+ vaddl.u8 q14, d9, d11 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d6, d12 @ temp2 = src[1_0] + src[4_0]
+ vaddl.u8 q10, d8, d10 @ temp1 = src[2_0] + src[3_0]
+ vqrshrun.s32 d19, q15, #10
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d7, d13 @ temp2 = src[1_0] + src[4_0]
+ vqshrun.s16 d2, q9, #0
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+
+ @ VERTICAL FILTERING FOR ROW 1
+
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 1
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vst1.u32 {d2[0]}, [r1], r3 @//Store dest row0, column 1; (1/2,1/2)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vext.16 q2, q12, q13, #4 @//extract a[4] (column1)
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d4, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d5, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d18, q14, #10
+ vqrshrun.s32 d19, q15, #10
+ vqshrun.s16 d4, q9, #0
+ vst1.u32 {d4[0]}, [r1], r3 @//Store dest row1, column 1; (1/2,1/2)
+
+ subs r8, r8, #2 @ 2 rows processed, decrement by 2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_4 @looping if height == 8 or 16
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
new file mode 100755
index 0000000..65a6de7
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
@@ -0,0 +1,1044 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements a two stage cascaded six tap filter. It
+@* applies the six tap filter in the horizontal direction on the
+@* predictor values, followed by applying the same filter in the
+@* vertical direction on the output of the first stage. It then averages
+@* the output of the 1st stage and the output of the 2nd stage to obtain
+@* the quarter pel values. The six tap filtering operation is described
+@* in sec 8.4.2.2.1 titled "Luma sample interpolation process".
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/2,1/4) or (1/2,3/4). The function interpolates
+@* the predictors first in the horizontal direction and then in the
+@* vertical direction to output the (1/2,1/2). It then averages
+@* the output of the 2nd stage and (1/2,1/2) value to obtain (1/2,1/4)
+@* or (1/2,3/4) depending on the offset.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/;
+
+@void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ht
+@ r5 => wd
+@ r7 => dydx
+@ r9 => *pu1_tmp
+
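+@ Illustrative C sketch of the final stage (assumed names: pi2_tmp / tmp_strd stand
+@ for the 16-bit horizontal half-pel sums produced by the first stage and kept in
+@ pu1_tmp, y_off is derived from dydx, and CLIP_U8() denotes clipping to 0..255):
+@
+@ WORD32 y_off = (dydx >> 3) & 1;      /* 0 -> (1/2,1/4), 1 -> (1/2,3/4) */
+@ for(row = 0; row < ht; row++)
+@ {
+@     for(col = 0; col < wd; col++)
+@     {
+@         WORD32 sum =      pi2_tmp[col - 2 * tmp_strd] - 5 * pi2_tmp[col - tmp_strd]
+@                    + 20 * pi2_tmp[col]                + 20 * pi2_tmp[col + tmp_strd]
+@                    -  5 * pi2_tmp[col + 2 * tmp_strd] +      pi2_tmp[col + 3 * tmp_strd];
+@         UWORD8 half_xy = CLIP_U8((sum + 512) >> 10);                            /* (1/2,1/2)   */
+@         UWORD8 half_x  = CLIP_U8((pi2_tmp[col + y_off * tmp_strd] + 16) >> 5);  /* (1/2,y_off) */
+@         pu1_dst[col] = (half_xy + half_x + 1) >> 1;                             /* rounded avg */
+@     }
+@     pi2_tmp += tmp_strd;
+@     pu1_dst += dst_strd;
+@ }
+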
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q
+
+ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @ store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104] @ loads ht
+ sub r0, r0, r2, lsl #1 @ pu1_src-2*src_strd
+ sub r0, r0, #2 @ pu1_src-2
+ ldr r5, [sp, #108] @ loads wd
+ ldr r7, [sp, #116] @ loads dydx
+ lsr r7, r7, #3 @ dydx >> 3 to obtain the deciding bit (selects the half-pel row used for averaging)
+ ldr r9, [sp, #112] @ pu1_tmp
+ add r7, r7, #2
+ mov r6, #48
+ mla r7, r7, r6, r9
+
+ subs r12, r5, #4 @if wd=4 branch to loop_4
+ beq loop_4_start
+
+ subs r12, r5, #8 @if wd=8 branch to loop_8
+ beq loop_8_start
+
+ @when wd=16
+ vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11
+ vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12
+ add r8, r0, #8
+ add r14, r1, #8
+ add r10, r9, #8
+ mov r12, r4
+ add r11, r7, #8
+
+loop_16_lowhalf_start:
+ vld1.32 {q0}, [r0], r2 @ row -2 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q3, d0, d5
+
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q3, q4, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q4, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row -1 load for horizontal filter
+ vmls.u16 q3, q4, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q4, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q5, d2, d3
+
+ vst1.32 {q3}, [r9], r6 @ store temp buffer 0
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q4, q5, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q5, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 0 load for horizontal filter
+ vmls.u16 q4, q5, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q5, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q6, d2, d3
+
+ vst1.32 {q4}, [r9], r6 @ store temp buffer 1
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q5, q6, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q6, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 1 load for horizontal filter
+ vmls.u16 q5, q6, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q6, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q7, d2, d3
+
+ vst1.32 {q5}, [r9], r6 @ store temp buffer 2
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q6, q7, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q7, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 2 load for horizontal filter
+ vmls.u16 q6, q7, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q7, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d2, d3
+
+ vst1.32 {q6}, [r9], r6 @ store temp buffer 3
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q7, q8, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q8, d1, d4
+
+ vmls.u16 q7, q8, q12
+loop_16_lowhalf:
+
+ vld1.32 {q0}, [r0], r2 @ row 3 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d0, d5
+
+ vst1.32 {q7}, [r9], r6 @ store temp buffer 4
+ vaddl.u8 q9, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q8, q9, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q14, q4, q7
+ vaddl.u8 q9, d1, d4
+ vadd.s16 q15, q5, q6
+ vmls.u16 q8, q9, q12
+ vld1.32 {q0}, [r0], r2 @ row 4 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q10, d0, d5
+
+ vst1.32 {q8}, [r9], r6 @ store temp buffer r5
+
+ vaddl.s16 q9, d6, d16
+
+ vld1.32 {q13}, [r7], r6 @ load from temp buffer 0
+
+ vaddl.s16 q3, d7, d17
+
+ vqrshrun.s16 d26, q13, #5
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q10, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q14, q5, q8
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q6, q7
+ vmls.u16 q10, q1, q12
+ vqmovn.u16 d18, q9
+ vld1.32 {q0}, [r0], r2 @ row 5 load for horizontal filter
+
+ vrhadd.u8 d26, d18, d26
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+
+ vst1.32 {q10}, [r9], r6 @ store temp buffer r6
+
+ vaddl.s16 q9, d8, d20
+
+ vaddl.s16 q3, d9, d21
+
+ vld1.32 {q4}, [r7], r6 @load from temp buffer 1
+
+
+ vst1.32 d26, [r1], r3 @ store row 0
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+
+ vqrshrun.s16 d28, q4, #5
+
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d0, d5
+ vaddl.u8 q1, d2, d3
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d4, d0, d1, #4
+ vqrshrun.s32 d19, q3, #10
+ vmla.u16 q4, q1, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q13, q6, q10
+ vaddl.u8 q1, d1, d4
+ vqmovn.u16 d18, q9
+ vadd.s16 q15, q7, q8
+ vmls.u16 q4, q1, q12
+ vld1.32 {q0}, [r0], r2 @ row 6 load for horizontal filter
+
+ vrhadd.u8 d28, d28, d18
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+
+ vst1.32 d28, [r1], r3 @ store row 1
+
+ vaddl.u8 q14, d0, d5
+
+ vst1.32 {q4}, [r9], r6 @ store temp buffer r7
+
+ vaddl.s16 q9, d10, d8
+ vaddl.s16 q3, d11, d9
+
+ vld1.32 {q5}, [r7], r6 @ load from temp buffer 2
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d26, d24
+ vmlal.s16 q3, d31, d22
+
+ vqrshrun.s16 d26, q5, #5
+
+ vmlsl.s16 q3, d27, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q14, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q5, q7, q4
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q8, q10
+ vmls.u16 q14, q1, q12
+ vqmovn.u16 d27, q9
+
+ vaddl.s16 q9, d12, d28
+ vaddl.s16 q3, d13, d29
+
+ vrhadd.u8 d26, d26, d27
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d10, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d11, d24
+
+ vst1.32 d26, [r1], r3 @ store row 2
+
+ vst1.32 {q14}, [r9]
+
+
+ vqrshrun.s32 d18, q9, #10
+ vmov q5, q10
+ vld1.32 {q15}, [r7], r6 @ load from temp buffer 3
+
+ vqrshrun.s32 d19, q3, #10
+ subs r4, r4, #4
+
+ vqrshrun.s16 d30, q15, #5
+
+ vqmovn.u16 d18, q9
+ vmov q6, q4
+ vmov q3, q7
+ vrhadd.u8 d30, d18, d30
+ vmov q4, q8
+ vmov q7, q14
+ vst1.32 d30, [r1], r3 @ store row 3
+
+ bgt loop_16_lowhalf @ loop until all rows of the lower half are processed
+
+
+loop_16_highhalf_start:
+ vld1.32 {q0}, [r8], r2
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q3, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q3, q4, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q4, d1, d4
+ vld1.32 {q0}, [r8], r2
+ vmls.u16 q3, q4, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q4, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q5, d2, d3
+
+ vst1.32 {q3}, [r10], r6
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q4, q5, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q5, d1, d4
+ vld1.32 {q0}, [r8], r2
+ vmls.u16 q4, q5, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q5, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q6, d2, d3
+
+ vst1.32 {q4}, [r10], r6
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q5, q6, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q6, d1, d4
+ vld1.32 {q0}, [r8], r2
+ vmls.u16 q5, q6, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q6, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q7, d2, d3
+
+ vst1.32 {q5}, [r10], r6
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q6, q7, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q7, d1, d4
+ vld1.32 {q0}, [r8], r2
+ vmls.u16 q6, q7, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q7, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d2, d3
+
+ vst1.32 {q6}, [r10], r6
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q7, q8, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q8, d1, d4
+
+ vmls.u16 q7, q8, q12
+
+loop_16_highhalf:
+
+ vld1.32 {q0}, [r8], r2
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d0, d5
+
+ vst1.32 {q7}, [r10], r6
+
+ vaddl.u8 q9, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q8, q9, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q14, q4, q7
+ vaddl.u8 q9, d1, d4
+ vadd.s16 q15, q5, q6
+ vmls.u16 q8, q9, q12
+ vld1.32 {q0}, [r8], r2
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q10, d0, d5
+
+ vst1.32 {q8}, [r10], r6
+
+ vaddl.s16 q9, d6, d16
+
+ vld1.32 {q13}, [r11], r6
+
+ vaddl.s16 q3, d7, d17
+
+ vqrshrun.s16 d26, q13, #5
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q10, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q14, q5, q8
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q6, q7
+ vmls.u16 q10, q1, q12
+ vqmovn.u16 d18, q9
+ vld1.32 {q0}, [r8], r2
+
+ vrhadd.u8 d26, d18, d26
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+
+ vst1.32 {q10}, [r10], r6
+
+ vaddl.s16 q9, d8, d20
+ vaddl.s16 q3, d9, d21
+
+ vld1.32 {q4}, [r11], r6
+
+
+ vst1.32 d26, [r14], r3 @store row 0
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+
+ vqrshrun.s16 d28, q4, #5
+
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d0, d5
+ vaddl.u8 q1, d2, d3
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d4, d0, d1, #4
+ vqrshrun.s32 d19, q3, #10
+ vmla.u16 q4, q1, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q13, q6, q10
+ vaddl.u8 q1, d1, d4
+ vqmovn.u16 d18, q9
+ vadd.s16 q15, q7, q8
+ vmls.u16 q4, q1, q12
+ vld1.32 {q0}, [r8], r2
+
+ vrhadd.u8 d28, d28, d18
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+
+ vst1.32 d28, [r14], r3 @store row 1
+
+ vaddl.u8 q14, d0, d5
+
+ vst1.32 {q4}, [r10], r6
+
+ vaddl.s16 q9, d10, d8
+ vaddl.s16 q3, d11, d9
+
+ vld1.32 {q5}, [r11], r6
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d26, d24
+ vmlal.s16 q3, d31, d22
+
+ vqrshrun.s16 d26, q5, #5
+
+ vmlsl.s16 q3, d27, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q14, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q5, q7, q4
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q8, q10
+ vmls.u16 q14, q1, q12
+ vqmovn.u16 d27, q9
+
+
+ vaddl.s16 q9, d12, d28
+ vaddl.s16 q3, d13, d29
+
+ vrhadd.u8 d26, d26, d27
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d10, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d11, d24
+
+ vst1.32 d26, [r14], r3 @ store row 2
+
+ vst1.32 {q14}, [r10]
+
+ vqrshrun.s32 d18, q9, #10
+ vmov q5, q10
+ vld1.32 {q15}, [r11], r6
+
+ vqrshrun.s32 d19, q3, #10
+ subs r12, r12, #4
+
+ vqrshrun.s16 d30, q15, #5
+
+ vqmovn.u16 d18, q9
+ vmov q6, q4
+ vmov q3, q7
+ vrhadd.u8 d30, d18, d30
+ vmov q4, q8
+ vmov q7, q14
+ vst1.32 d30, [r14], r3 @ store row 3
+
+ bgt loop_16_highhalf @ looping if height = 8 or 16
+ b end_func
+
+loop_8_start:
+
+ vmov.u16 q11, #20 @ Filter coeff 20 into Q11
+ vmov.u16 q12, #5 @ Filter coeff 5 into Q12
+ vld1.32 {q0}, [r0], r2 @ row -2 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q3, d0, d5
+
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q3, q4, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q4, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row -1 load for horizontal filter
+ vmls.u16 q3, q4, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q4, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q5, d2, d3
+
+ vst1.32 {q3}, [r9], r6 @ store temp buffer 0
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q4, q5, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q5, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 0 load for horizontal filter
+ vmls.u16 q4, q5, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q5, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q6, d2, d3
+
+ vst1.32 {q4}, [r9], r6 @ store temp buffer 1
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q5, q6, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q6, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 1 load for horizontal filter
+ vmls.u16 q5, q6, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q6, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q7, d2, d3
+
+ vst1.32 {q5}, [r9], r6 @ store temp buffer 2
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q6, q7, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q7, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 2 load for horizontal filter
+ vmls.u16 q6, q7, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q7, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d2, d3
+
+ vst1.32 {q6}, [r9], r6 @ store temp buffer 3
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q7, q8, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q8, d1, d4
+
+ vmls.u16 q7, q8, q12
+loop_8:
+
+ vld1.32 {q0}, [r0], r2 @ row 3 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d0, d5
+
+ vst1.32 {q7}, [r9], r6 @ store temp buffer 4
+
+ vaddl.u8 q9, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q8, q9, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q14, q4, q7
+ vaddl.u8 q9, d1, d4
+ vadd.s16 q15, q5, q6
+ vmls.u16 q8, q9, q12
+ vld1.32 {q0}, [r0], r2 @ row 4 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q10, d0, d5
+
+ vst1.32 {q8}, [r9], r6 @ store temp buffer r5
+
+ vaddl.s16 q9, d6, d16
+
+ vld1.32 {q13}, [r7], r6 @ load from temp buffer 0
+
+ vaddl.s16 q3, d7, d17
+
+ vqrshrun.s16 d26, q13, #5
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q10, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q14, q5, q8
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q6, q7
+ vmls.u16 q10, q1, q12
+ vqmovn.u16 d18, q9
+ vld1.32 {q0}, [r0], r2 @ row 5 load for horizontal filter
+
+ vrhadd.u8 d26, d18, d26
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+
+ vst1.32 {q10}, [r9], r6 @ store temp buffer r6
+
+ vaddl.s16 q9, d8, d20
+
+ vaddl.s16 q3, d9, d21
+
+ vld1.32 {q4}, [r7], r6 @load from temp buffer 1
+
+
+ vst1.32 d26, [r1], r3 @ store row 0
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+
+ vqrshrun.s16 d28, q4, #5
+
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d0, d5
+ vaddl.u8 q1, d2, d3
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d4, d0, d1, #4
+ vqrshrun.s32 d19, q3, #10
+ vmla.u16 q4, q1, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q13, q6, q10
+ vaddl.u8 q1, d1, d4
+ vqmovn.u16 d18, q9
+ vadd.s16 q15, q7, q8
+ vmls.u16 q4, q1, q12
+ vld1.32 {q0}, [r0], r2 @ row 6 load for horizontal filter
+
+ vrhadd.u8 d28, d28, d18
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+
+ vst1.32 d28, [r1], r3 @ store row 1
+
+ vaddl.u8 q14, d0, d5
+
+ vst1.32 {q4}, [r9], r6 @ store temp buffer r7
+
+ vaddl.s16 q9, d10, d8
+ vaddl.s16 q3, d11, d9
+
+ vld1.32 {q5}, [r7], r6 @ load from temp buffer 2
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d26, d24
+ vmlal.s16 q3, d31, d22
+
+ vqrshrun.s16 d26, q5, #5
+
+ vmlsl.s16 q3, d27, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q14, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q5, q7, q4
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q8, q10
+ vmls.u16 q14, q1, q12
+ vqmovn.u16 d27, q9
+
+ vaddl.s16 q9, d12, d28
+ vaddl.s16 q3, d13, d29
+
+ vrhadd.u8 d26, d26, d27
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d10, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d11, d24
+
+ vst1.32 d26, [r1], r3 @ store row 2
+
+ vst1.32 {q14}, [r9]
+
+
+ vqrshrun.s32 d18, q9, #10
+ vmov q5, q10
+ vld1.32 {q15}, [r7], r6 @ load from temp buffer 3
+
+ vqrshrun.s32 d19, q3, #10
+ subs r4, r4, #4
+
+ vqrshrun.s16 d30, q15, #5
+
+ vqmovn.u16 d18, q9
+ vmov q6, q4
+ vmov q3, q7
+ vrhadd.u8 d30, d18, d30
+ vmov q4, q8
+ vmov q7, q14
+ vst1.32 d30, [r1], r3 @ store row 3
+
+ bgt loop_8 @ loop while rows remain (ht = 8 or 16)
+ b end_func
+
+loop_4_start:
+ vmov.u16 d22, #20 @ Filter coeff 20 into D22
+ vmov.u16 d23, #5 @ Filter coeff 5 into D23
+
+ vld1.32 {q0}, [r0], r2 @row -2 load
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q3, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d6, d8, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q4, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row -1 load
+ vmls.u16 d6, d8, d23
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q4, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q5, d2, d3
+
+ vst1.32 d6, [r9], r6 @ store temp buffer 0
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d8, d10, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q5, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 0 load
+ vmls.u16 d8, d10, d23
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q5, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q6, d2, d3
+
+ vst1.32 d8, [r9], r6 @ store temp buffer 1
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d10, d12, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q6, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 1 load
+ vmls.u16 d10, d12, d23
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q6, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q7, d2, d3
+
+ vst1.32 d10, [r9], r6 @ store temp buffer 2
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d12, d14, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q7, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 2 load
+ vmls.u16 d12, d14, d23
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q7, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d14, d16, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q8, d1, d4
+
+ vst1.32 d12, [r9], r6 @ store temp buffer 3
+
+ vmls.u16 d14, d16, d23
+
+loop_4:
+
+ vld1.32 {q0}, [r0], r2 @ row 3 load
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q8, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q9, d2, d3
+ vst1.32 d14, [r9], r6 @ store temp buffer 4
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d16, d18, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q9, d1, d4
+ vadd.s16 d2, d10, d12
+ vmls.u16 d16, d18, d23
+ vadd.s16 d3, d8, d14
+ vld1.32 {q9}, [r0], r2 @ row 4 load
+ vext.8 d25, d18, d19, #5
+ vaddl.u8 q13, d18, d25
+ vext.8 d20, d18, d19, #2
+
+ vst1.32 d16, [r9], r6 @ store temp buffer 5
+
+ vaddl.s16 q0, d6, d16
+ vmlal.s16 q0, d2, d22
+ vext.8 d21, d18, d19, #3
+ vaddl.u8 q14, d20, d21
+ vext.8 d24, d18, d19, #4
+ vmlsl.s16 q0, d3, d23
+ vmla.u16 d26, d28, d22
+ vext.8 d19, d18, d19, #1
+ vaddl.u8 q14, d19, d24
+ vadd.s16 d2, d12, d14
+ vmls.u16 d26, d28, d23
+ vqrshrun.s32 d0, q0, #0xa
+ vadd.s16 d3, d10, d16
+ vld1.32 {q9}, [r0], r2 @ row 5 load
+ vext.8 d25, d18, d19, #5
+ vqmovn.u16 d11, q0
+ vaddl.u8 q14, d18, d25
+
+ vst1.32 d26, [r9], r6 @ store temp buffer 6
+
+ @Q3 available here
+ vld1.32 d6, [r7], r6 @ load from temp buffer 0
+ vld1.32 d7, [r7], r6 @ load from temp buffer 1
+ vqrshrun.s16 d9, q3, #5
+
+ vext.8 d20, d18, d19, #2
+
+ vaddl.s16 q0, d8, d26
+ vmlal.s16 q0, d2, d22
+ vext.8 d21, d18, d19, #3
+ vaddl.u8 q3, d20, d21
+ vext.8 d24, d18, d19, #4
+ vmlsl.s16 q0, d3, d23
+ vmla.u16 d28, d6, d22
+ vext.8 d19, d18, d19, #1
+ vaddl.u8 q3, d19, d24
+ vadd.s16 d2, d14, d16
+ vmls.u16 d28, d6, d23
+ vqrshrun.s32 d0, q0, #0xa
+ vadd.s16 d3, d12, d26
+ vld1.32 {q9}, [r0], r2 @ row 6 load
+ vext.8 d25, d18, d19, #5
+ vqmovn.u16 d13, q0
+
+ vtrn.32 d11, d13
+ vaddl.s16 q0, d10, d28
+ vrhadd.u8 d9, d9, d11
+
+ vst1.32 d28, [r9], r6 @ store temp buffer 7
+
+ vmlal.s16 q0, d2, d22
+ vaddl.u8 q15, d18, d25
+
+ vst1.32 d9[0], [r1], r3 @ store row 0
+
+ vext.8 d20, d18, d19, #2
+
+ vst1.32 d9[1], [r1], r3 @ store row 1
+
+ vext.8 d21, d18, d19, #3
+ vmlsl.s16 q0, d3, d23
+ vaddl.u8 q4, d20, d21
+ vext.8 d24, d18, d19, #4
+ vmla.u16 d30, d8, d22
+ vext.8 d19, d18, d19, #1
+ vaddl.u8 q4, d19, d24
+ vqrshrun.s32 d0, q0, #0xa
+ vadd.s16 d2, d16, d26
+ vmls.u16 d30, d8, d23
+ vqmovn.u16 d4, q0
+
+ vadd.s16 d3, d14, d28
+
+
+ vaddl.s16 q0, d12, d30
+
+ vst1.32 d30, [r9]
+
+ vmlal.s16 q0, d2, d22
+
+ vld1.32 d8, [r7], r6 @ load from temp buffer 2
+ vld1.32 d9, [r7], r6 @ load from temp buffer 3
+ vmlsl.s16 q0, d3, d23
+ subs r4, r4, #4
+ vqrshrun.s16 d10, q4, #5
+
+ vmov d12, d28
+
+ vqrshrun.s32 d0, q0, #0xa
+ vmov d6, d14
+ vmov d8, d16
+
+ vqmovn.u16 d5, q0
+
+ vtrn.32 d4, d5
+ vrhadd.u8 d4, d4, d10
+ vmov d10, d26
+ vmov d14, d30
+
+ vst1.32 d4[0], [r1], r3 @ store row 2
+ vst1.32 d4[1], [r1], r3 @ store row 3
+
+ bgt loop_4
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
new file mode 100755
index 0000000..c39ae01
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
@@ -0,0 +1,266 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction horizontal quarter pel interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Quarter pel inter prediction luma filter for horizontal input
+@*
+@* @par Description:
+@* Applies a 6 tap horizontal filter. The output is clipped to 8 bits as
+@* described in sec 8.4.2.2.1 titled "Luma sample interpolation process".
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@ @param[in] pu1_tmp: temporary buffer: UNUSED in this function
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations.
+@* @returns
+@*
+@ @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_horz (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+@ r7 => dydx
+
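+@ Illustrative per-pixel C sketch of the computation described above (a
+@ simplified outline, not the reference code in ih264_inter_pred_filters.c;
+@ names are made up and <stdint.h> is assumed). The NEON code below produces
+@ the same values 4/8/16 pixels at a time without any temporary buffer.
+@
+@ static int clip_u8(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }
+@
+@ void horz_qpel_sketch(const uint8_t *src, uint8_t *dst, int src_strd,
+@                       int dst_strd, int ht, int wd, uint32_t dydx)
+@ {
+@     int x_bit = (dydx & 3) >> 1;      /* 0 -> (1/4,0), 1 -> (3/4,0) */
+@     for (int y = 0; y < ht; y++)
+@         for (int x = 0; x < wd; x++)  /* caller guarantees margins around the block */
+@         {
+@             const uint8_t *p = src + y * src_strd + x;
+@             int sum  = p[-2] - 5 * p[-1] + 20 * p[0]
+@                      + 20 * p[1] - 5 * p[2] + p[3];
+@             int half = clip_u8((sum + 16) >> 5);   /* (1/2,0) half-pel */
+@             /* average with the nearest full-pel sample selected by the x offset */
+@             dst[y * dst_strd + x] = (half + p[x_bit] + 1) >> 1;
+@         }
+@ }
+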
+.text
+.p2align 2
+
+
+ .global ih264_inter_pred_luma_horz_qpel_a9q
+
+ih264_inter_pred_luma_horz_qpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r5, [sp, #104] @Loads ht
+ ldr r6, [sp, #108] @Loads wd
+ ldr r7, [sp, #116] @Loads dydx
+ and r7, r7, #3 @Finds x-offset
+ add r7, r0, r7, lsr #1 @pu1_src + (x_offset>>1)
+ sub r0, r0, #2 @pu1_src-2
+ vmov.i8 d0, #5 @filter coeff
+ subs r12, r6, #8 @if wd=8 branch to loop_8
+ vmov.i8 d1, #20 @filter coeff
+
+ beq loop_8
+
+ subs r12, r6, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_16: @when wd=16
+ @// Processing row0 and row1
+ vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1
+ vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0)
+ vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0)
+ vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1)
+ vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vext.8 d27, d6, d7, #2 @//extract a[2] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1)
+ vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0)
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1)
+ vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vext.8 d27, d6, d7, #1 @//extract a[1] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
+ vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+ vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
+ vld1.32 {d12, d13}, [r7], r2 @Load value for interpolation (column1,row0)
+ vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row2)
+ vrhadd.u8 q10, q6, q10 @Interpolation step for qpel calculation
+ vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vst1.8 {d20, d21}, [r1], r3 @//Store dest row0
+ vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row2)
+ vqrshrun.s16 d19, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
+ vld1.32 {d12, d13}, [r7], r2 @Load value for interpolation (column1,row1)
+ vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation
+ vst1.8 {d18, d19}, [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+
+ beq end_func
+ b loop_16
+
+loop_8:
+@// Processing row0 and row1
+
+ vld1.8 {d5, d6}, [r0], r2 @// Load row1
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vld1.8 {d2, d3}, [r0], r2 @// Load row0
+ vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1)
+ vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1)
+ vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0)
+ vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vld1.32 d12, [r7], r2 @Load value for interpolation (column1,row0)
+ vld1.32 d13, [r7], r2 @Load value for interpolation (column1,row1)
+ vqrshrun.s16 d19, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation
+ vst1.8 {d18}, [r1], r3 @//Store dest row0
+ vst1.8 {d19}, [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+
+ beq end_func @ Branch if height==4
+ b loop_8 @looping if height == 8 or 16
+
+loop_4:
+ vld1.8 {d5, d6}, [r0], r2 @// Load row1
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vld1.8 {d2, d3}, [r0], r2 @// Load row0
+ vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1)
+ vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1)
+ vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1)
+ vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0)
+ vld1.32 d12, [r7], r2 @Load value for interpolation (column1,row0)
+ vld1.32 d13, [r7], r2 @Load value for interpolation (column1,row1)
+ vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0)
+ vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vqrshrun.s16 d19, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation
+ vst1.32 d18[0], [r1], r3 @//Store dest row0
+ vst1.32 d19[0], [r1], r3 @//Store dest row1
+
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+ beq end_func
+
+ b loop_4
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
new file mode 100755
index 0000000..565cc80
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
@@ -0,0 +1,505 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements a two stage cascaded six tap filter. It
+@* applies the six tap filter in the vertical direction on the
+@* predictor values, followed by applying the same filter in the
+@* horizontal direction on the output of the first stage. It then averages
+@* the output of the 1st stage and the final stage to obtain the quarter
+@* pel values.The six tap filtering operation is described in sec 8.4.2.2.1
+@* titled "Luma sample interpolation process".
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/4,1/2) or (3/4,1/2). The function interpolates
+@* the predictors first in the vertical direction and then in the
+@* horizontal direction to output the (1/2,1/2) value. It then averages
+@* the rounded output of the 1st (vertical) stage and the (1/2,1/2) value
+@* to obtain (1/4,1/2) or (3/4,1/2) depending on the offset.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ht
+@ r5 => wd
+@ r6 => dydx
+@ r9 => *pu1_tmp
+
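+@ A simplified per-pixel C sketch of the computation described above, for
+@ reference only (not the reference code in ih264_inter_pred_filters.c; names
+@ are made up, <stdint.h> assumed). The NEON code below keeps the stage-1
+@ vertical results as 16-bit intermediates in pu1_tmp.
+@
+@ static int clip_u8(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }
+@
+@ void horz_qpel_vert_hpel_sketch(const uint8_t *src, uint8_t *dst,
+@                                 int src_strd, int dst_strd,
+@                                 int ht, int wd, uint32_t dydx)
+@ {
+@     int x_bit = (dydx & 3) >> 1;      /* 0 -> (1/4,1/2), 1 -> (3/4,1/2) */
+@     for (int y = 0; y < ht; y++)
+@         for (int x = 0; x < wd; x++)  /* caller guarantees margins around the block */
+@         {
+@             int tmp[6];               /* stage 1: vertical 6-tap on 6 columns */
+@             for (int c = 0; c < 6; c++)
+@             {
+@                 const uint8_t *p = src + y * src_strd + (x + c - 2);
+@                 tmp[c] = p[-2 * src_strd] - 5 * p[-src_strd] + 20 * p[0]
+@                        + 20 * p[src_strd] - 5 * p[2 * src_strd]
+@                        + p[3 * src_strd];
+@             }
+@             int half_y  = clip_u8((tmp[2 + x_bit] + 16) >> 5); /* (int,1/2) */
+@             int v       = tmp[0] - 5 * tmp[1] + 20 * tmp[2]    /* stage 2 */
+@                         + 20 * tmp[3] - 5 * tmp[4] + tmp[5];
+@             int half_xy = clip_u8((v + 512) >> 10);            /* (1/2,1/2) */
+@             dst[y * dst_strd + x] = (half_y + half_xy + 1) >> 1;
+@         }
+@ }
+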
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q
+
+ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104] @ loads ht
+ sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd
+ sub r0, r0, #2 @pu1_src-2
+ ldr r5, [sp, #108] @ loads wd
+ ldr r6, [sp, #116] @ loads dydx
+ and r6, r6, #2 @ dydx & 0x2 = (x_offset>>1)*2: byte offset into the 16-bit temp buffer
+ ldr r9, [sp, #112] @pu1_tmp
+ add r7, r9, #4
+ add r6, r7, r6 @ pi16_pred1_temp += (x_offset>>1)
+
+ vmov.u16 q13, #0x14 @ Filter coeff 20 into Q13
+ vmov.u16 q12, #0x5 @ Filter coeff 5 into Q12
+ mov r7, #0x20
+ mov r8, #0x30
+ subs r12, r5, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+ subs r12, r5, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+ @when wd=16
+ vmov.u16 q14, #0x14 @ Filter coeff 20 into Q14
+ vmov.u16 q15, #0x5 @ Filter coeff 5 into Q15
+ add r14, r2, #0
+ sub r2, r2, #16
+
+
+loop_16:
+
+ vld1.u32 {q0}, [r0]! @ Vector load from src[0_0]
+ vld1.u32 d12, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0]! @ Vector load from src[1_0]
+ vld1.u32 d13, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0]! @ Vector load from src[2_0]
+ vld1.u32 d14, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0]! @ Vector load from src[3_0]
+ vld1.u32 d15, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0]! @ Vector load from src[4_0]
+ vld1.u32 d16, [r0], r2 @ Vector load from src[4_0]
+
+ vld1.u32 {q5}, [r0]! @ Vector load from src[5_0]
+ vld1.u32 d17, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q10, d4, d6
+ vaddl.u8 q9, d0, d10
+ vaddl.u8 q11, d2, d8
+ vmla.u16 q9, q10, q14
+ vaddl.u8 q12, d5, d7
+ vaddl.u8 q10, d1, d11
+ vaddl.u8 q13, d3, d9
+ vmla.u16 q10, q12, q14
+ vaddl.u8 q12, d14, d15
+ vmls.u16 q9, q11, q15
+ vaddl.u8 q11, d12, d17
+ vmls.u16 q10, q13, q15
+ vaddl.u8 q13, d13, d16
+ vmla.u16 q11, q12, q14
+ vmls.u16 q11, q13, q15
+ vst1.32 {q9}, [r9]!
+ vst1.32 {q10}, [r9]!
+ vext.16 q12, q9, q10, #2
+ vext.16 q13, q9, q10, #3
+ vst1.32 {q11}, [r9]
+ vext.16 q11, q9, q10, #5
+ vadd.s16 q0, q12, q13
+ vext.16 q12, q9, q10, #1
+ vext.16 q13, q9, q10, #4
+ vadd.s16 q12, q12, q13
+
+ vaddl.s16 q13, d18, d22
+ vmlal.s16 q13, d0, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d19, d23
+ vmlal.s16 q11, d1, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10
+ vqrshrun.s32 d19, q11, #10
+ vld1.32 {q11}, [r9]!
+ vqmovn.u16 d18, q9
+
+ vext.16 q12, q10, q11, #2
+ vext.16 q13, q10, q11, #3
+ vext.16 q0, q10, q11, #5
+ vst1.32 d18, [r1]
+ vadd.s16 q9, q12, q13
+ vext.16 q12, q10, q11, #1
+ vext.16 q13, q10, q11, #4
+ vadd.s16 q12, q12, q13
+
+ vaddl.s16 q13, d0, d20
+ vmlal.s16 q13, d18, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d1, d21
+ vmlal.s16 q11, d19, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10
+ vqrshrun.s32 d19, q11, #10
+
+ vaddl.u8 q12, d7, d9
+ vld1.32 {q10}, [r6]!
+ vld1.32 {q11}, [r6], r7
+
+ vqmovn.u16 d19, q9
+
+ vld1.32 d18, [r1]
+ vqrshrun.s16 d20, q10, #5
+ vqrshrun.s16 d21, q11, #5
+ vaddl.u8 q11, d4, d10
+ vld1.u32 {q0}, [r0]! @ Vector load from src[6_0]
+ vrhadd.u8 q9, q9, q10
+ vld1.u32 d12, [r0], r2 @ Vector load from src[6_0]
+ vaddl.u8 q10, d6, d8
+ vaddl.u8 q13, d5, d11
+ vst1.32 {q9}, [r1], r3 @ store row 0
+
+@ROW_2
+
+ vaddl.u8 q9, d2, d0
+
+ vmla.u16 q9, q10, q14
+
+ vaddl.u8 q10, d3, d1
+
+ vmla.u16 q10, q12, q14
+ vaddl.u8 q12, d15, d16
+ vmls.u16 q9, q11, q15
+ vaddl.u8 q11, d13, d12
+ vmls.u16 q10, q13, q15
+ vaddl.u8 q13, d14, d17
+ vmla.u16 q11, q12, q14
+ vmls.u16 q11, q13, q15
+ vst1.32 {q9}, [r9]!
+ vst1.32 {q10}, [r9]!
+ vext.16 q12, q9, q10, #2
+ vext.16 q13, q9, q10, #3
+ vst1.32 {q11}, [r9]
+ vext.16 q11, q9, q10, #5
+ vadd.s16 q1, q12, q13
+ vext.16 q12, q9, q10, #1
+ vext.16 q13, q9, q10, #4
+ vadd.s16 q12, q12, q13
+
+ vaddl.s16 q13, d18, d22
+ vmlal.s16 q13, d2, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d19, d23
+ vmlal.s16 q11, d3, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10
+ vqrshrun.s32 d19, q11, #10
+ vld1.32 {q11}, [r9]!
+ vqmovn.u16 d18, q9
+
+ vext.16 q12, q10, q11, #2
+ vext.16 q13, q10, q11, #3
+ vext.16 q1, q10, q11, #5
+ vst1.32 d18, [r1]
+ vadd.s16 q9, q12, q13
+ vext.16 q12, q10, q11, #1
+ vext.16 q13, q10, q11, #4
+ vadd.s16 q12, q12, q13
+
+ vaddl.s16 q13, d2, d20
+ vmlal.s16 q13, d18, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d3, d21
+ vmlal.s16 q11, d19, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10
+ vqrshrun.s32 d19, q11, #10
+ vaddl.u8 q12, d9, d11
+ vld1.32 {q10}, [r6]!
+ vld1.32 {q11}, [r6], r7
+ vqmovn.u16 d19, q9
+ vld1.32 d18, [r1]
+ vqrshrun.s16 d20, q10, #5
+ vqrshrun.s16 d21, q11, #5
+
+ vrhadd.u8 q9, q9, q10
+
+ vst1.32 {q9}, [r1], r3 @ store row 1
+
+ subs r4, r4, #2
+ subne r0, r0 , r14, lsl #2
+ subne r0, r0, r14
+
+ beq end_func @ Branch if height==4
+ b loop_16 @ Loop if height==8
+
+loop_8:
+ vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0]
+
+ vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0]
+ vaddl.u8 q7, d4, d6
+ vaddl.u8 q6, d0, d10
+ vaddl.u8 q8, d2, d8
+ vmla.u16 q6, q7, q13
+ vaddl.u8 q9, d5, d7
+ vaddl.u8 q7, d1, d11
+ vaddl.u8 q11, d3, d9
+ vmla.u16 q7, q9, q13
+ vmls.u16 q6, q8, q12
+ vld1.32 {q0}, [r0], r2 @ Vector load from src[6_0]
+ vaddl.u8 q8, d6, d8
+ vmls.u16 q7, q11, q12
+ vaddl.u8 q14, d2, d0
+ vst1.32 {q6}, [r9]! @ store row 0 to temp buffer: col 0
+ vext.16 q11, q6, q7, #5
+ vaddl.u8 q9, d4, d10
+ vmla.u16 q14, q8, q13
+ vaddl.s16 q15, d12, d22
+ vst1.32 {q7}, [r9], r7 @ store row 0 to temp buffer: col 1
+ vaddl.s16 q11, d13, d23
+ vext.16 q8, q6, q7, #2
+ vmls.u16 q14, q9, q12
+ vext.16 q9, q6, q7, #3
+ vext.16 q10, q6, q7, #4
+ vext.16 q7, q6, q7, #1
+ vadd.s16 q8, q8, q9
+ vadd.s16 q9, q7, q10
+ vaddl.u8 q10, d7, d9
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vaddl.u8 q7, d3, d1
+ vst1.32 {q14}, [r9]! @ store row 1 to temp buffer: col 0
+ vmla.u16 q7, q10, q13
+ vqrshrun.s32 d12, q15, #10
+ vaddl.u8 q8, d5, d11
+ vqrshrun.s32 d13, q11, #10
+ vmls.u16 q7, q8, q12
+@ vld1.32 {q1},[r0],r2 ; Vector load from src[7_0]
+ vqmovn.u16 d25, q6
+ vaddl.u8 q8, d8, d10
+
+
+ vext.16 q11, q14, q7, #5
+ vaddl.u8 q10, d4, d2
+ vaddl.s16 q15, d28, d22
+ vmla.u16 q10, q8, q13
+ vst1.32 {q7}, [r9], r7 @ store row 1 to temp buffer: col 1
+ vaddl.s16 q11, d29, d23
+ vext.16 q8, q14, q7, #2
+ vext.16 q9, q14, q7, #3
+ vext.16 q6, q14, q7, #4
+ vext.16 q7, q14, q7, #1
+ vadd.s16 q8, q8, q9
+ vadd.s16 q9, q6, q7
+ vld1.32 {q7}, [r6], r8 @ load row 0 from temp buffer
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vqrshrun.s16 d14, q7, #0x5
+ vld1.32 {q14}, [r6], r8 @ load row 1 from temp buffer
+ vaddl.u8 q9, d6, d0
+ vqrshrun.s32 d16, q15, #10
+ vqrshrun.s16 d15, q14, #0x5
+ vqrshrun.s32 d17, q11, #10
+ vmov d12, d25
+ vmov d25, d24
+
+ vqmovn.u16 d13, q8
+ vrhadd.u8 q6, q6, q7
+
+ vst1.32 d12, [r1], r3 @ store row 0
+ vst1.32 d13, [r1], r3 @ store row 1
+
+ subs r4, r4, #2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+
+ beq end_func @ Branch if height==4
+ b loop_8 @ Loop if height==8
+
+loop_4:
+ vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q7, d4, d6 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q6, d0, d10 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q6, q7, q13 @ temp += temp1 * 20
+ vaddl.u8 q9, d5, d7 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q7, d1, d11 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q11, d3, d9 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q7, q9, q13 @ temp += temp1 * 20
+ vmls.u16 q6, q8, q12 @ temp -= temp2 * 5
+ vld1.32 {q0}, [r0], r2 @ Vector load from src[6_0]
+ vaddl.u8 q8, d6, d8
+ vmls.u16 q7, q11, q12 @ temp -= temp2 * 5
+ @Q6 and Q7 have filtered values
+ vaddl.u8 q14, d2, d0
+ vst1.32 {q6}, [r9]! @ store row 0 to temp buffer: col 0
+ vext.16 q11, q6, q7, #5
+ vaddl.u8 q9, d4, d10
+ vmla.u16 q14, q8, q13
+ vaddl.s16 q15, d12, d22
+ vst1.32 {q7}, [r9], r7 @ store row 0 to temp buffer: col 1
+ vaddl.s16 q11, d13, d23
+ vext.16 q8, q6, q7, #2
+ vmls.u16 q14, q9, q12
+ vext.16 q9, q6, q7, #3
+ vext.16 q10, q6, q7, #4
+ vext.16 q7, q6, q7, #1
+ vadd.s16 q8, q8, q9
+ vadd.s16 q9, q7, q10
+ vaddl.u8 q10, d7, d9
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vaddl.u8 q7, d3, d1
+ vst1.32 {q14}, [r9]! @ store row 1 to temp buffer: col 0
+ vmla.u16 q7, q10, q13
+ vqrshrun.s32 d12, q15, #10
+ vaddl.u8 q8, d5, d11
+ vqrshrun.s32 d13, q11, #10
+ vmls.u16 q7, q8, q12
+ vqmovn.u16 d25, q6
+ vaddl.u8 q8, d8, d10
+
+ vext.16 q11, q14, q7, #5
+ vaddl.u8 q10, d4, d2
+ vaddl.s16 q15, d28, d22
+ vmla.u16 q10, q8, q13
+ vst1.32 {q7}, [r9], r7 @ store row 1 to temp buffer: col 1
+ vaddl.s16 q11, d29, d23
+ vext.16 q8, q14, q7, #2
+ vext.16 q9, q14, q7, #3
+ vext.16 q6, q14, q7, #4
+ vext.16 q7, q14, q7, #1
+ vadd.s16 q8, q8, q9
+ vadd.s16 q9, q6, q7
+ vld1.32 d14, [r6], r8 @load row 0 from temp buffer
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vqrshrun.s16 d14, q7, #0x5
+ vld1.32 d28, [r6], r8 @load row 1 from temp buffer
+ vaddl.u8 q9, d6, d0
+ vqrshrun.s32 d16, q15, #10
+ vqrshrun.s16 d15, q14, #0x5
+ vqrshrun.s32 d17, q11, #10
+ vmov d12, d25
+ vmov d25, d24
+
+ vqmovn.u16 d13, q8
+ vrhadd.u8 q6, q6, q7
+ vst1.32 d12[0], [r1], r3 @ store row 0
+ vst1.32 d13[0], [r1], r3 @store row 1
+
+ subs r4, r4, #2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+
+ beq end_func @ Branch if height==4
+ b loop_4 @ Loop if height==8
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
new file mode 100755
index 0000000..3c8b60a
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
@@ -0,0 +1,355 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements two six tap filters. It
+@* applies the six tap filter in the horizontal direction on the
+@* predictor values, then applies the same filter in the
+@* vertical direction on the predictor values. It then averages these
+@* two outputs to obtain quarter pel values in horizontal and vertical direction.
+@* The six tap filtering operation is described in sec 8.4.2.2.1 titled
+@* "Luma sample interpolation process"
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/4,1/4) or (3/4,1/4) or (1/4,3/4) or (3/4,3/4).
+@* The function interpolates the predictors first in the horizontal direction
+@* and then in the vertical direction, and then averages these two
+@* values.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ht
+@ r5 => wd
+@ r6 => dydx
+
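+@ Illustrative per-pixel C sketch of the computation described above (a
+@ simplified outline, not the reference code in ih264_inter_pred_filters.c;
+@ names are made up, <stdint.h> assumed): both half-pel values are computed
+@ independently from the source and then averaged.
+@
+@ static int clip_u8(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }
+@
+@ void horz_qpel_vert_qpel_sketch(const uint8_t *src, uint8_t *dst,
+@                                 int src_strd, int dst_strd,
+@                                 int ht, int wd, uint32_t dydx)
+@ {
+@     int x_bit = (dydx & 3) >> 1;         /* column of the vertical half-pel */
+@     int y_bit = ((dydx >> 2) & 3) >> 1;  /* row of the horizontal half-pel  */
+@     for (int y = 0; y < ht; y++)
+@         for (int x = 0; x < wd; x++)     /* caller guarantees margins around the block */
+@         {
+@             const uint8_t *pv = src + y * src_strd + x + x_bit;
+@             int sv = pv[-2 * src_strd] - 5 * pv[-src_strd] + 20 * pv[0]
+@                    + 20 * pv[src_strd] - 5 * pv[2 * src_strd]
+@                    + pv[3 * src_strd];
+@             const uint8_t *ph = src + (y + y_bit) * src_strd + x;
+@             int sh = ph[-2] - 5 * ph[-1] + 20 * ph[0]
+@                    + 20 * ph[1] - 5 * ph[2] + ph[3];
+@             int half_v = clip_u8((sv + 16) >> 5);  /* vertical half-pel   */
+@             int half_h = clip_u8((sh + 16) >> 5);  /* horizontal half-pel */
+@             dst[y * dst_strd + x] = (half_v + half_h + 1) >> 1;
+@         }
+@ }
+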
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q
+
+ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104] @ loads ht
+ ldr r5, [sp, #108] @ loads wd
+ ldr r6, [sp, #116] @dydx
+ and r7, r6, #3
+ add r7, r0, r7, lsr #1 @pu1_pred_vert = pu1_src + (x_offset>>1)
+
+ and r6, r6, #12 @Finds y-offset
+ lsr r6, r6, #3 @dydx>>3
+ mul r6, r2, r6
+ add r6, r0, r6 @pu1_pred_horz = pu1_src + (y_offset>>1)*src_strd
+ sub r7, r7, r2, lsl #1 @pu1_pred_vert-2*src_strd
+ sub r6, r6, #2 @pu1_pred_horz-2
+ vmov.u8 d30, #20 @ Filter coeff 20
+ vmov.u8 d31, #5 @ Filter coeff 5
+
+ subs r12, r5, #4 @if wd=4 branch to loop_4
+ beq loop_4
+ subs r12, r5, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+loop_16:
+ vld1.32 {q0}, [r7], r2 @ Vector load from src[0_0]
+ vld1.32 {q1}, [r7], r2 @ Vector load from src[1_0]
+ vld1.32 {q2}, [r7], r2 @ Vector load from src[2_0]
+ vld1.32 {q3}, [r7], r2 @ Vector load from src[3_0]
+ vld1.32 {q4}, [r7], r2 @ Vector load from src[4_0]
+ add r11, r6, #8
+ vld1.32 {q5}, [r7], r2 @ Vector load from src[5_0]
+ vld1.32 {q9}, [r6], r2 @ horz row0, col 0
+ vaddl.u8 q12, d0, d10
+ vmlal.u8 q12, d4, d30
+ vmlal.u8 q12, d6, d30
+ vmlsl.u8 q12, d2, d31
+ vmlsl.u8 q12, d8, d31
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vext.8 d19, d18, d19, #1
+ vqrshrun.s16 d26, q12, #5
+ vaddl.u8 q14, d18, d23
+ vmlal.u8 q14, d20, d30
+ vmlal.u8 q14, d21, d30
+ vmlsl.u8 q14, d19, d31
+ vmlsl.u8 q14, d22, d31
+ vld1.32 {q9}, [r11], r2 @ horz row 0, col 1
+ vaddl.u8 q12, d1, d11
+ vmlal.u8 q12, d5, d30
+ vmlal.u8 q12, d7, d30
+ vmlsl.u8 q12, d3, d31
+ vmlsl.u8 q12, d9, d31
+ vqrshrun.s16 d28, q14, #5
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vext.8 d19, d18, d19, #1
+ vqrshrun.s16 d27, q12, #5
+ vld1.32 {q6}, [r7], r2 @ src[6_0]
+
+ vaddl.u8 q12, d18, d23
+ vmlal.u8 q12, d20, d30
+ vmlal.u8 q12, d21, d30
+ vmlsl.u8 q12, d19, d31
+ vmlsl.u8 q12, d22, d31
+
+ vaddl.u8 q8, d2, d12
+ vmlal.u8 q8, d6, d30
+ vmlal.u8 q8, d8, d30
+ vmlsl.u8 q8, d4, d31
+ vmlsl.u8 q8, d10, d31
+ vqrshrun.s16 d29, q12, #5
+ vld1.32 {q9}, [r6], r2 @ horz row 1, col 0
+
+ vaddl.u8 q12, d3, d13
+ vmlal.u8 q12, d7, d30
+ vmlal.u8 q12, d9, d30
+ vmlsl.u8 q12, d5, d31
+ vmlsl.u8 q12, d11, d31
+ vrhadd.u8 q14, q14, q13
+ vqrshrun.s16 d26, q8, #5
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vst1.32 {q14}, [r1], r3 @ store row 0
+ vext.8 d19, d18, d19, #1
+ vqrshrun.s16 d27, q12, #5
+
+ vaddl.u8 q14, d18, d23
+ vmlal.u8 q14, d20, d30
+ vmlal.u8 q14, d21, d30
+ vmlsl.u8 q14, d19, d31
+ vmlsl.u8 q14, d22, d31
+
+ vld1.32 {q9}, [r11], r2 @ horz row 1, col 1
+
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vext.8 d19, d18, d19, #1
+
+ vqrshrun.s16 d28, q14, #5
+ vaddl.u8 q12, d18, d23
+ vmlal.u8 q12, d20, d30
+ vmlal.u8 q12, d21, d30
+ vmlsl.u8 q12, d19, d31
+ vmlsl.u8 q12, d22, d31
+
+ vqrshrun.s16 d29, q12, #5
+ vrhadd.u8 q14, q14, q13
+ vst1.32 {q14}, [r1], r3 @ store row 1
+
+ subs r4, r4, #2 @ 2 rows processed, decrement by 2
+ subne r7, r7 , r2, lsl #2
+ subne r7, r7, r2
+ beq end_func @ Branch if height==4
+
+ b loop_16 @ looping if height = 8 or 16
+
+
+loop_8:
+ vld1.32 d0, [r7], r2 @ Vector load from src[0_0]
+ vld1.32 d1, [r7], r2 @ Vector load from src[1_0]
+ vld1.32 d2, [r7], r2 @ Vector load from src[2_0]
+ vld1.32 d3, [r7], r2 @ Vector load from src[3_0]
+ vld1.32 d4, [r7], r2 @ Vector load from src[4_0]
+ vld1.32 d5, [r7], r2 @ Vector load from src[5_0]
+ vaddl.u8 q5, d0, d5
+ vmlal.u8 q5, d2, d30
+ vmlal.u8 q5, d3, d30
+ vmlsl.u8 q5, d1, d31
+ vmlsl.u8 q5, d4, d31
+ vld1.32 {q6}, [r6], r2 @horz row 0
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d26, q5, #5
+ vld1.32 d6, [r7], r2 @ src[6_0]
+ vaddl.u8 q5, d12, d17
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vld1.32 {q6}, [r6], r2 @ horz row 1
+ vaddl.u8 q9, d1, d6
+ vmlal.u8 q9, d3, d30
+ vmlal.u8 q9, d4, d30
+ vmlsl.u8 q9, d2, d31
+ vmlsl.u8 q9, d5, d31
+ vqrshrun.s16 d28, q5, #5
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d27, q9, #5
+ vaddl.u8 q5, d12, d17
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vqrshrun.s16 d29, q5, #5
+ vrhadd.u8 q13, q13, q14
+ vst1.32 d26, [r1], r3
+ vst1.32 d27, [r1], r3
+
+ subs r4, r4, #2 @ 2 rows processed, decrement by 2
+ subne r7, r7 , r2, lsl #2
+ subne r7, r7, r2
+ beq end_func @ Branch if height==4
+ b loop_8 @looping if height == 8 or 16
+
+loop_4:
+ vld1.32 d0[0], [r7], r2 @ Vector load from src[0_0]
+ vld1.32 d1[0], [r7], r2 @ Vector load from src[1_0]
+ vld1.32 d2[0], [r7], r2 @ Vector load from src[2_0]
+ vld1.32 d3[0], [r7], r2 @ Vector load from src[3_0]
+ vld1.32 d4[0], [r7], r2 @ Vector load from src[4_0]
+ vld1.32 d5[0], [r7], r2 @ Vector load from src[5_0]
+ vaddl.u8 q5, d0, d5
+ vmlal.u8 q5, d2, d30
+ vmlal.u8 q5, d3, d30
+ vmlsl.u8 q5, d1, d31
+ vmlsl.u8 q5, d4, d31
+ vld1.32 {q6}, [r6], r2 @load for horz filter row 0
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d26, q5, #5
+ vld1.32 d6[0], [r7], r2 @ Vector load from src[6_0]
+ vaddl.u8 q5, d12, d17
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vld1.32 {q6}, [r6], r2 @horz row 1
+ vaddl.u8 q9, d1, d6
+ vmlal.u8 q9, d3, d30
+ vmlal.u8 q9, d4, d30
+ vmlsl.u8 q9, d2, d31
+ vmlsl.u8 q9, d5, d31
+ vqrshrun.s16 d28, q5, #5
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d27, q9, #5
+ vaddl.u8 q5, d12, d17
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vqrshrun.s16 d29, q5, #5
+ vrhadd.u8 q13, q13, q14
+ vst1.32 d26[0], [r1], r3
+ vst1.32 d27[0], [r1], r3
+
+ subs r4, r4, #2 @ 2 rows processed, decrement by 2
+ subne r7, r7 , r2, lsl #2
+ subne r7, r7, r2
+ beq end_func @ Branch if height==4
+ b loop_4 @ Loop if height==8
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
new file mode 100755
index 0000000..d45055e
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
@@ -0,0 +1,330 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_vert_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction vertical quarter pel interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_vert_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Quarter pel inter prediction luma filter for vertical input
+@*
+@* @par Description:
+@* Applies a 6 tap vertical filter. The output is clipped to 8 bits as
+@* described in sec 8.4.2.2.1 titled "Luma sample interpolation process".
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations.
+@* @returns
+@*
+@ @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_vert (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+@ r7 => dydx
+
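+@ Outline of the per-pixel arithmetic implemented below (a sketch derived from the
+@ NEON code; the names A..F and half_pel are illustrative only):
+@   temp     = (A + F) + 20*(C + D) - 5*(B + E), for six vertically adjacent samples A..F
+@   half_pel = CLIP_U8((temp + 16) >> 5)                  @ vqrshrun #5
+@   dst      = (half_pel + nearest_full_pel + 1) >> 1     @ vrhadd.u8, row picked via dydx
+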
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_vert_qpel_a9q
+
+ih264_inter_pred_luma_vert_qpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r5, [sp, #104] @Loads ht
+
+ ldr r6, [sp, #108] @Loads wd
+ ldr r7, [sp, #116] @Loads dydx
+ and r7, r7, #12 @Finds y-offset
+ lsr r7, r7, #3 @dydx>>3
+ mul r7, r2, r7
+ add r7, r0, r7 @pu1_src + (y_offset>>1)*src_strd
+ vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11
+ sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd
+ subs r12, r6, #8 @if wd=8 branch to loop_8
+ vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12
+ beq loop_8
+
+ subs r12, r6, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_16: @when wd=16
+
+ vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0]
+ vaddl.u8 q6, d4, d6 @ temp1 = src[2_0] + src[3_0]
+ vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0]
+ vaddl.u8 q7, d0, d10 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q7, q6, q11 @ temp += temp1 * 20
+ vaddl.u8 q10, d1, d11 @ temp4 = src[0_8] + src[5_8]
+ vaddl.u8 q9, d5, d7 @ temp3 = src[2_8] + src[3_8]
+ vmla.u16 q10, q9, q11 @ temp4 += temp3 * 20
+ vld1.u32 {q0}, [r0], r2
+ vaddl.u8 q13, d3, d9 @ temp5 = src[1_8] + src[4_8]
+ vaddl.u8 q6, d6, d8
+ vmls.u16 q7, q8, q12 @ temp -= temp2 * 5
+ vaddl.u8 q8, d2, d0
+ vaddl.u8 q9, d4, d10
+ vmla.u16 q8, q6, q11
+ vmls.u16 q10, q13, q12 @ temp4 -= temp5 * 5
+ vaddl.u8 q13, d5, d11
+ vaddl.u8 q6, d7, d9
+ vqrshrun.s16 d30, q7, #5 @ dst[0_0] = CLIP_U8((temp +16) >> 5)
+ vaddl.u8 q7, d3, d1
+ vld1.u32 {q1}, [r0], r2
+ vmla.u16 q7, q6, q11
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d31, q10, #5 @ dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+ vld1.u32 {q10}, [r7], r2 @ Load for interpolation row 0
+ vrhadd.u8 q15, q10, q15 @ Interpolation to obtain qpel value
+ vaddl.u8 q9, d4, d2
+ vaddl.u8 q6, d8, d10
+
+ vst1.u32 {q15}, [r1], r3 @ Vector store to dst[0_0]
+ vmla.u16 q9, q6, q11
+ vaddl.u8 q10, d6, d0
+ vmls.u16 q7, q13, q12
+ vqrshrun.s16 d30, q8, #5
+ vaddl.u8 q6, d9, d11
+ vaddl.u8 q8, d5, d3
+ vaddl.u8 q13, d7, d1
+ vmla.u16 q8, q6, q11
+ vmls.u16 q9, q10, q12
+ vld1.u32 {q2}, [r0], r2
+
+ vqrshrun.s16 d31, q7, #5
+ vld1.u32 {q7}, [r7], r2 @ Load for interpolation row 1
+ vaddl.u8 q6, d10, d0
+ vrhadd.u8 q15, q7, q15 @ Interpolation to obtain qpel value
+ vaddl.u8 q7, d6, d4
+ vaddl.u8 q10, d8, d2
+ vmla.u16 q7, q6, q11
+ vmls.u16 q8, q13, q12
+ vst1.u32 {q15}, [r1], r3 @store row 1
+ vqrshrun.s16 d30, q9, #5
+ vaddl.u8 q9, d7, d5
+ vaddl.u8 q6, d11, d1
+ vmla.u16 q9, q6, q11
+ vaddl.u8 q13, d9, d3
+ vmls.u16 q7, q10, q12
+ vqrshrun.s16 d31, q8, #5
+ vld1.u32 {q8}, [r7], r2 @ Load for interpolation row 2
+ vmls.u16 q9, q13, q12
+ vrhadd.u8 q15, q8, q15 @ Interpolation to obtain qpel value
+ vaddl.u8 q6, d0, d2 @ temp1 = src[2_0] + src[3_0]
+ vst1.u32 {q15}, [r1], r3 @store row 2
+ vaddl.u8 q8, d10, d4 @ temp2 = src[1_0] + src[4_0]
+ vaddl.u8 q10, d9, d7 @ temp4 = src[0_8] + src[5_8]
+ vqrshrun.s16 d30, q7, #5
+ vaddl.u8 q13, d5, d11 @ temp5 = src[1_8] + src[4_8]
+ vaddl.u8 q7, d8, d6 @ temp = src[0_0] + src[5_0]
+ vqrshrun.s16 d31, q9, #5
+ vld1.u32 {q9}, [r7], r2 @ Load for interpolation row 3
+ vmla.u16 q7, q6, q11 @ temp += temp1 * 20
+ vrhadd.u8 q15, q9, q15 @ Interpolation to obtain qpel value
+ vaddl.u8 q9, d1, d3 @ temp3 = src[2_8] + src[3_8]
+ vst1.u32 {q15}, [r1], r3 @store row 3
+ subs r5, r5, #4 @ 4 rows processed, decrement by 4
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_16 @ looping if height = 8 or 16
+
+
+loop_8:
+
+ @// Processing row0 and row1
+ vld1.u32 d0, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 d1, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 d2, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 d3, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 d4, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 d5, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q4, q3, q11 @ temp += temp1 * 20
+ vld1.u32 d6, [r0], r2
+ vaddl.u8 q7, d3, d4
+ vaddl.u8 q8, d1, d6
+ vaddl.u8 q9, d2, d5
+ vmls.u16 q4, q5, q12 @ temp -= temp2 * 5
+ vmla.u16 q8, q7, q11
+ vld1.u32 d7, [r0], r2
+ vaddl.u8 q10, d4, d5
+ vaddl.u8 q6, d2, d7
+ vaddl.u8 q5, d3, d6
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+ vmla.u16 q6, q10, q11
+ vld1.32 d8, [r7], r2 @Load value for interpolation (row0)
+ vld1.32 d9, [r7], r2 @Load value for interpolation (row1)
+ vld1.u32 d0, [r0], r2
+ vaddl.u8 q7, d5, d6
+ vqrshrun.s16 d27, q8, #5
+ vrhadd.u8 q13, q4, q13 @ Interpolation step for qpel calculation
+ vaddl.u8 q10, d3, d0
+ vmls.u16 q6, q5, q12
+ vst1.u32 d26, [r1], r3 @ Vector store to dst[0_0]
+ vaddl.u8 q9, d4, d7
+ vmla.u16 q10, q7, q11
+ vst1.u32 d27, [r1], r3 @ Vector store to dst[1_0]
+ vqrshrun.s16 d28, q6, #5
+ vmls.u16 q10, q9, q12
+ vld1.32 d12, [r7], r2 @Load value for interpolation (row2)
+ vld1.32 d13, [r7], r2 @Load value for interpolation (row3)
+ vqrshrun.s16 d29, q10, #5
+ subs r9, r5, #4
+ vrhadd.u8 q14, q6, q14
+ vst1.u32 d28, [r1], r3 @store row 2
+ vst1.u32 d29, [r1], r3 @store row 3
+
+ subs r5, r5, #4 @ 4 rows processed, decrement by 4
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+ b loop_8 @looping if height == 8 or 16
+
+loop_4:
+@// Processing row0 and row1
+
+ vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 d2[0], [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 d3[0], [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 d4[0], [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 d5[0], [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q4, q3, q11 @ temp += temp1 * 20
+ vld1.u32 d6, [r0], r2
+ vaddl.u8 q7, d3, d4
+ vaddl.u8 q8, d1, d6
+ vaddl.u8 q9, d2, d5
+ vmls.u16 q4, q5, q12 @ temp -= temp2 * 5
+ vld1.u32 d7[0], [r0], r2
+ vmla.u16 q8, q7, q11
+ vaddl.u8 q10, d4, d5
+ vaddl.u8 q6, d2, d7
+ vaddl.u8 q5, d3, d6
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+ vld1.u32 d8[0], [r7], r2 @Load value for interpolation - row 0
+ vld1.u32 d9[0], [r7], r2 @Load value for interpolation - row 1
+ vmla.u16 q6, q10, q11
+ vld1.u32 d0[0], [r0], r2
+ vaddl.u8 q7, d5, d6
+ vqrshrun.s16 d27, q8, #5
+ vaddl.u8 q10, d3, d0
+ vrhadd.u8 q13, q13, q4 @Interpolation step for qpel calculation
+ vmls.u16 q6, q5, q12
+ vst1.u32 d26[0], [r1], r3 @ Vector store to dst[0_0]
+ vaddl.u8 q9, d4, d7
+ vmla.u16 q10, q7, q11
+ vst1.u32 d27[0], [r1], r3 @ store row 1
+ vqrshrun.s16 d28, q6, #5
+ vld1.u32 d12[0], [r7], r2 @Load value for interpolation - row 2
+ vld1.u32 d13[0], [r7], r2 @Load value for interpolation - row 3
+
+ vmls.u16 q10, q9, q12
+ vqrshrun.s16 d29, q10, #5
+ vrhadd.u8 q14, q6, q14 @Interpolation step for qpel calculation
+ vst1.u32 d28[0], [r1], r3 @store row 2
+ vst1.u32 d29[0], [r1], r3 @store row 3
+
+ subs r5, r5, #8
+ subeq r0, r0, r2, lsl #2
+ subeq r0, r0, r2
+ beq loop_4 @ Loop if height==8
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_intra_pred_chroma_a9q.s b/common/arm/ih264_intra_pred_chroma_a9q.s
new file mode 100755
index 0000000..d03fc55
--- /dev/null
+++ b/common/arm/ih264_intra_pred_chroma_a9q.s
@@ -0,0 +1,551 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_chroma_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra chroma prediction.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_intra_pred_chroma_8x8_mode_horz_a9q()
+@* - ih264_intra_pred_chroma_8x8_mode_vert_a9q()
+@* - ih264_intra_pred_chroma_8x8_mode_dc_a9q()
+@* - ih264_intra_pred_chroma_8x8_mode_plane_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+.text
+.p2align 2
+
+ .extern ih264_gai1_intrapred_chroma_plane_coeffs1
+.hidden ih264_gai1_intrapred_chroma_plane_coeffs1
+ .extern ih264_gai1_intrapred_chroma_plane_coeffs2
+.hidden ih264_gai1_intrapred_chroma_plane_coeffs2
+scratch_chroma_intrapred_addr1:
+ .long ih264_gai1_intrapred_chroma_plane_coeffs1 - scrlblc1 - 8
+
+scratch_intrapred_chroma_plane_addr1:
+ .long ih264_gai1_intrapred_chroma_plane_coeffs2 - scrlblc2 - 8
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_dc
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+
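+@ DC rule implemented below (a sketch derived from the code; per 4x4 UV sub-block):
+@   both edges seen by the sub-block : dc = (sum_left4 + sum_top4 + 4) >> 3   @ vqrshrun #3
+@   only one edge seen               : dc = (sum4 + 2) >> 2                   @ vqrshrun #2
+@   neither edge available           : the whole block is filled with 128
+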
+ .global ih264_intra_pred_chroma_8x8_mode_dc_a9q
+
+ih264_intra_pred_chroma_8x8_mode_dc_a9q:
+
+ stmfd sp!, {r4, r14} @store register values to stack
+ ldr r4, [sp, #8] @r4 => ui_neighboravailability
+ vpush {d8-d15}
+
+ ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE
+ beq top_available
+ ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+ beq left_available
+
+ vld1.u8 {q0}, [r0] @BOTH LEFT AND TOP AVAILABLE
+ add r0, r0, #18
+ vld1.u8 {q1}, [r0]
+ vaddl.u8 q2, d1, d2
+ vaddl.u8 q3, d0, d3
+ vmovl.u8 q1, d3
+ vmovl.u8 q0, d0
+
+ vadd.u16 d12, d4, d5
+ vadd.u16 d13, d2, d3
+ vadd.u16 d15, d6, d7
+ vadd.u16 d14, d0, d1
+
+ vpadd.u32 d12, d12, d15
+ vpadd.u32 d14, d13, d14
+ vqrshrun.s16 d12, q6, #3
+ vqrshrun.s16 d14, q7, #2
+ vdup.u16 d8, d12[0]
+ vdup.u16 d9, d14[0]
+ vdup.u16 d10, d14[1]
+ vdup.u16 d11, d12[1]
+ b str_pred
+
+top_available: @ONLY TOP AVAILABLE
+ ands r2, r4, #0x04 @CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+ beq none_available
+
+ add r0, r0, #18
+ vld1.u8 {q0}, [r0]
+ vmovl.u8 q1, d0
+ vmovl.u8 q2, d1
+ vadd.u16 d0, d2, d3
+ vadd.u16 d1, d4, d5
+ vpaddl.u32 q0, q0
+ vqrshrun.s16 d0, q0, #2
+ vdup.u16 d8, d0[0]
+ vdup.u16 d9, d0[2]
+ vmov q5, q4
+ b str_pred
+
+left_available: @ONLY LEFT AVAILABLE
+ vld1.u8 {q0}, [r0]
+ vmovl.u8 q1, d0
+ vmovl.u8 q2, d1
+ vadd.u16 d0, d2, d3
+ vadd.u16 d1, d4, d5
+ vpaddl.u32 q0, q0
+ vqrshrun.s16 d0, q0, #2
+ vdup.u16 q5, d0[0]
+ vdup.u16 q4, d0[2]
+ b str_pred
+
+none_available: @NONE AVAILABLE
+ vmov.u8 q4, #128
+ vmov.u8 q5, #128
+
+str_pred:
+ vst1.8 {q4}, [r1], r3
+ vst1.8 {q4}, [r1], r3
+ vst1.8 {q4}, [r1], r3
+ vst1.8 {q4}, [r1], r3
+ vst1.8 {q5}, [r1], r3
+ vst1.8 {q5}, [r1], r3
+ vst1.8 {q5}, [r1], r3
+ vst1.8 {q5}, [r1], r3
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4, pc} @Restoring registers from stack
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_horz
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:Horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_chroma_8x8_mode_horz_a9q
+
+ih264_intra_pred_chroma_8x8_mode_horz_a9q:
+
+ stmfd sp!, {r14} @store register values to stack
+
+ vld1.u8 {q0}, [r0]
+ mov r2, #6
+
+ vdup.u16 q1, d1[3]
+ vdup.u16 q2, d1[2]
+ vst1.8 {q1}, [r1], r3
+
+loop_8x8_horz:
+ vext.8 q0, q0, q0, #12
+ vst1.8 {q2}, [r1], r3
+ vdup.u16 q1, d1[3]
+ subs r2, #2
+ vdup.u16 q2, d1[2]
+ vst1.8 {q1}, [r1], r3
+ bne loop_8x8_horz
+
+ vext.8 q0, q0, q0, #12
+ vst1.8 {q2}, [r1], r3
+
+ ldmfd sp!, {pc} @restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_vert
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:vertical
+@*
+@* @par Description:
+@* Perform Intra prediction for chroma_8x8 mode:vertical, described in sec 8.3.4.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_chroma_8x8_mode_vert_a9q
+
+ih264_intra_pred_chroma_8x8_mode_vert_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #18
+ vld1.8 {q0}, [r0]
+
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_plane
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:PLANE
+@*
+@* @par Description:
+@* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
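+@ Plane mode arithmetic implemented below (a sketch derived from the code; names illustrative):
+@   H = sum_{i=1..4} i * (top[3+i] - top[3-i]) per U/V component, V likewise from the left column
+@   b = (34*H + 32) >> 6,  c = (34*V + 32) >> 6              @ vmull by #34, vrshrn #6
+@   a = 16 * (left[7] + top[7]) per component
+@   pred[x,y] = CLIP_U8((a + b*(x-3) + c*(y-3) + 16) >> 5)   @ vqrshrun #5
+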
+ .global ih264_intra_pred_chroma_8x8_mode_plane_a9q
+ih264_intra_pred_chroma_8x8_mode_plane_a9q:
+
+ stmfd sp!, {r4-r10, r12, lr}
+ vpush {d8-d15}
+
+
+ vld1.32 d0, [r0]
+ add r10, r0, #10
+ vld1.32 d1, [r10]
+ add r10, r10, #6
+ vrev64.16 d5, d0
+ vld1.32 d2, [r10]!
+ add r10, r10, #2
+ vrev64.16 d7, d2
+ vld1.32 d3, [r10]
+ sub r5, r3, #8
+ ldr r12, scratch_chroma_intrapred_addr1
+scrlblc1:
+ add r12, r12, pc
+ vsubl.u8 q5, d5, d1
+ vld1.64 {q4}, [r12] @ Load plane multiplication factors into Q4
+ vsubl.u8 q6, d3, d7
+ vmul.s16 q7, q5, q4
+ vmul.s16 q8, q6, q4
+ vuzp.16 q7, q8
+
+ vpadd.s16 d14, d14
+ vpadd.s16 d15, d15
+ vpadd.s16 d16, d16
+ vpadd.s16 d17, d17
+ vpadd.s16 d14, d14
+ vpadd.s16 d15, d15
+ vpadd.s16 d16, d16
+ vpadd.s16 d17, d17
+
+ mov r6, #34
+ vdup.16 q9, r6
+
+ vmull.s16 q11, d14, d18
+ vmull.s16 q12, d15, d18
+ vmull.s16 q13, d16, d18
+ vmull.s16 q14, d17, d18
+
+ vrshrn.s32 d10, q11, #6
+ vrshrn.s32 d12, q12, #6
+ vrshrn.s32 d13, q13, #6
+ vrshrn.s32 d14, q14, #6
+
+
+ ldrb r6, [r0], #1
+ add r10, r0, #31
+ ldrb r8, [r0], #1
+ ldrb r7, [r10], #1
+ ldrb r9, [r10], #1
+
+ add r6, r6, r7
+ add r8, r8, r9
+ lsl r6, r6, #4
+ lsl r8, r8, #4
+
+ vdup.16 q0, r6
+ vdup.16 q1, r8
+ vdup.16 q2, d12[0]
+ vdup.16 q3, d10[0]
+
+ vdup.16 q12, d14[0]
+ vdup.16 q13, d13[0]
+ vzip.16 q2, q12
+ vzip.16 q3, q13
+ vzip.16 q0, q1
+
+ ldr r12, scratch_intrapred_chroma_plane_addr1
+scrlblc2:
+ add r12, r12, pc
+ vld1.64 {q4}, [r12]
+ vmov.16 q5, q4
+ vmov q11, q4
+ vzip.16 q4, q5
+
+ vmul.s16 q6, q2, q4
+ vmul.s16 q8, q2, q5
+ vadd.s16 q6, q0, q6
+ vadd.s16 q8, q0, q8
+
+
+ vdup.16 q10, d22[0]
+ vmul.s16 q2, q3, q10
+ vdup.16 q15, d22[1]
+ vmul.s16 q9, q3, q10
+ vmul.s16 q7, q3, q15
+ vmul.s16 q4, q3, q15
+ vadd.s16 q12, q6, q2
+ vadd.s16 q0, q8, q9
+ vadd.s16 q1, q6, q7
+ vqrshrun.s16 d28, q12, #5
+ vadd.s16 q13, q8, q4
+ vqrshrun.s16 d29, q0, #5
+ vdup.16 q10, d22[2]
+ vst1.8 {q14}, [r1], r3
+ vqrshrun.s16 d28, q1, #5
+ vqrshrun.s16 d29, q13, #5
+ vmul.s16 q2, q3, q10
+ vmul.s16 q9, q3, q10
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q12, q6, q2
+ vadd.s16 q0, q8, q9
+ vdup.16 q15, d22[3]
+ vqrshrun.s16 d28, q12, #5
+ vqrshrun.s16 d29, q0, #5
+ vmul.s16 q7, q3, q15
+ vmul.s16 q4, q3, q15
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q1, q6, q7
+ vadd.s16 q13, q8, q4
+ vdup.16 q10, d23[0]
+ vqrshrun.s16 d28, q1, #5
+ vqrshrun.s16 d29, q13, #5
+ vmul.s16 q2, q3, q10
+ vmul.s16 q9, q3, q10
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q12, q6, q2
+ vadd.s16 q0, q8, q9
+ vdup.16 q15, d23[1]
+ vqrshrun.s16 d28, q12, #5
+ vqrshrun.s16 d29, q0, #5
+ vmul.s16 q7, q3, q15
+ vmul.s16 q4, q3, q15
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q1, q6, q7
+ vadd.s16 q13, q8, q4
+ vdup.16 q10, d23[2]
+ vqrshrun.s16 d28, q1, #5
+ vqrshrun.s16 d29, q13, #5
+ vmul.s16 q2, q3, q10
+ vmul.s16 q9, q3, q10
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q12, q6, q2
+ vadd.s16 q0, q8, q9
+ vdup.16 q15, d23[3]
+ vqrshrun.s16 d28, q12, #5
+ vqrshrun.s16 d29, q0, #5
+ vmul.s16 q7, q3, q15
+ vmul.s16 q4, q3, q15
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q1, q6, q7
+ vadd.s16 q13, q8, q4
+ vqrshrun.s16 d28, q1, #5
+ vqrshrun.s16 d29, q13, #5
+ vst1.8 {q14}, [r1], r3
+
+
+
+end_func_plane:
+
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r10, r12, pc}
+
+
+
+
diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
new file mode 100755
index 0000000..e38e203
--- /dev/null
+++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
@@ -0,0 +1,520 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_luma_16x16_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra 16x16 Luma prediction.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_intra_pred_luma_16x16_mode_vert_a9q()
+@* - ih264_intra_pred_luma_16x16_mode_horz_a9q()
+@* - ih264_intra_pred_luma_16x16_mode_dc_a9q()
+@* - ih264_intra_pred_luma_16x16_mode_plane_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+
+.text
+.p2align 2
+
+
+ .extern ih264_gai1_intrapred_luma_plane_coeffs
+.hidden ih264_gai1_intrapred_luma_plane_coeffs
+scratch_intrapred_addr1:
+ .long ih264_gai1_intrapred_luma_plane_coeffs - scrlbl1 - 8
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_vert_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:vertical
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:Vertical ,described in sec 8.3.3.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_16x16_mode_vert_a9q
+
+ih264_intra_pred_luma_16x16_mode_vert_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #17
+ vld1.8 {q0}, [r0]
+
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_horz_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:horizontal ,described in sec 8.3.3.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_16x16_mode_horz_a9q
+
+ih264_intra_pred_luma_16x16_mode_horz_a9q:
+
+ stmfd sp!, {r14} @store register values to stack
+
+ vld1.u8 {q0}, [r0]
+ mov r2, #14
+
+ vdup.u8 q1, d1[7]
+ vdup.u8 q2, d1[6]
+ vst1.8 {q1}, [r1], r3
+
+loop_16x16_horz:
+ vext.8 q0, q0, q0, #14
+ vst1.8 {q2}, [r1], r3
+ vdup.u8 q1, d1[7]
+ subs r2, #2
+ vdup.u8 q2, d1[6]
+ vst1.8 {q1}, [r1], r3
+ bne loop_16x16_horz
+
+ vext.8 q0, q0, q0, #14
+ vst1.8 {q2}, [r1], r3
+
+ ldmfd sp!, {pc} @Restoring registers from stack
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_dc_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:DC ,described in sec 8.3.3.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
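+@ DC value computed below (a sketch derived from the code):
+@   left and top available  : dc = (sum_left16 + sum_top16 + 16) >> 5   @ vqrshrun #5
+@   only one side available : dc = (sum16 + 8) >> 4                     @ vqrshrun #4
+@   none available          : dc = 128
+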
+ .global ih264_intra_pred_luma_16x16_mode_dc_a9q
+
+ih264_intra_pred_luma_16x16_mode_dc_a9q:
+
+ stmfd sp!, {r4, r14} @store register values to stack
+ ldr r4, [sp, #8] @r4 => ui_neighboravailability
+
+ ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE
+ beq top_available
+ ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+ beq left_available
+
+ vld1.u8 {q0}, [r0] @BOTH LEFT AND TOP AVAILABLE
+ add r0, r0, #17
+ vpaddl.u8 q0, q0
+ vld1.u8 {q1}, [r0]
+ vpaddl.u8 q1, q1
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #5
+ vdup.u8 q0, d0[0]
+ b str_pred
+
+top_available: @ONLY TOP AVAILABLE
+ ands r2, r4, #0x04 @CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+ beq none_available
+
+ add r0, r0, #17
+ vld1.u8 {q0}, [r0]
+ vpaddl.u8 q0, q0
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #4
+ vdup.u8 q0, d0[0]
+ b str_pred
+
+left_available: @ONLY LEFT AVAILABLE
+ vld1.u8 {q0}, [r0]
+ vpaddl.u8 q0, q0
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #4
+ vdup.u8 q0, d0[0]
+ b str_pred
+
+none_available: @NONE AVAILABLE
+ vmov.u8 q0, #128
+
+str_pred:
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+
+ ldmfd sp!, {r4, pc} @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_plane_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:PLANE
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:PLANE ,described in sec 8.3.3.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
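+@ Plane mode arithmetic implemented below (a sketch derived from the code; names illustrative):
+@   H = sum_{i=1..8} i * (top[7+i] - top[7-i]), V likewise from the left column
+@   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6,  a = 16 * (left[15] + top[15])
+@   pred[x,y] = CLIP_U8((a + b*(x-7) + c*(y-7) + 16) >> 5)   @ vqrshrun #5
+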
+ .global ih264_intra_pred_luma_16x16_mode_plane_a9q
+ih264_intra_pred_luma_16x16_mode_plane_a9q:
+
+ stmfd sp!, {r4-r10, r12, lr}
+
+ mov r2, r1
+ add r1, r0, #17
+ add r0, r0, #15
+
+ mov r8, #9
+ sub r1, r1, #1
+ mov r10, r1 @top_left
+ mov r4, #-1
+ vld1.32 d2, [r1], r8
+ ldr r7, scratch_intrapred_addr1
+scrlbl1:
+ add r7, r7, pc
+
+ vld1.32 d0, [r1]
+ vrev64.8 d2, d2
+ vld1.32 {q3}, [r7]
+ vsubl.u8 q0, d0, d2
+ vmovl.u8 q8, d6
+ vmul.s16 q0, q0, q8
+ vmovl.u8 q9, d7
+
+ add r7, r0, r4, lsl #3
+ sub r0, r7, r4, lsl #1
+ rsb lr, r4, #0x0
+
+ vpadd.s16 d0, d0, d1
+
+ ldrb r8, [r7], r4
+ ldrb r9, [r0], lr
+
+ vpaddl.s16 d0, d0
+ sub r12, r8, r9
+
+ ldrb r8, [r7], r4
+
+ vpaddl.s32 d0, d0
+ ldrb r9, [r0], lr
+ sub r8, r8, r9
+ vshl.s32 d2, d0, #2
+ add r12, r12, r8, lsl #1
+
+ vadd.s32 d0, d0, d2
+ ldrb r8, [r7], r4
+ ldrb r9, [r0], lr
+ vrshr.s32 d0, d0, #6 @ i_b = D0[0]
+ sub r8, r8, r9
+ ldrb r5, [r7], r4
+ add r8, r8, r8, lsl #1
+
+ vdup.16 q2, d0[0]
+ add r12, r12, r8
+ ldrb r9, [r0], lr
+ vmul.s16 q0, q2, q8
+ sub r5, r5, r9
+ vmul.s16 q1, q2, q9
+ add r12, r12, r5, lsl #2
+
+ ldrb r8, [r7], r4
+ ldrb r9, [r0], lr
+ sub r8, r8, r9
+ ldrb r5, [r7], r4
+ add r8, r8, r8, lsl #2
+ ldrb r6, [r0], lr
+ add r12, r12, r8
+ ldrb r8, [r7], r4
+ ldrb r9, [r0], lr
+
+ sub r5, r5, r6
+ sub r8, r8, r9
+ add r5, r5, r5, lsl #1
+ rsb r8, r8, r8, lsl #3
+ add r12, r12, r5, lsl #1
+ ldrb r5, [r7], r4
+ ldrb r6, [r10] @top_left
+ add r12, r12, r8
+ sub r9, r5, r6
+ ldrb r6, [r1, #7]
+ add r12, r12, r9, lsl #3 @ i_c = r12
+ add r8, r5, r6
+
+ add r12, r12, r12, lsl #2
+ lsl r8, r8, #4 @ i_a = r8
+
+ add r12, r12, #0x20
+ lsr r12, r12, #6
+
+ vshl.s16 q14, q2, #3
+ vdup.16 q3, r12
+
+ vdup.16 q15, r8
+ vshl.s16 q13, q3, #3
+ vsub.s16 q15, q15, q14
+ vsub.s16 q15, q15, q13
+ vadd.s16 q14, q15, q3
+
+ mov r0, #14
+ vadd.s16 q13, q14, q0
+ vadd.s16 q14, q14, q1
+ vqrshrun.s16 d20, q13, #5
+ vqrshrun.s16 d21, q14, #5
+
+loop_16x16_plane:
+
+ vadd.s16 q13, q13, q3
+ vadd.s16 q14, q14, q3
+ vqrshrun.s16 d22, q13, #5
+ vst1.32 {q10}, [r2], r3
+ vqrshrun.s16 d23, q14, #5
+
+ vadd.s16 q13, q13, q3
+ subs r0, #2
+ vadd.s16 q14, q14, q3
+ vqrshrun.s16 d20, q13, #5
+ vst1.32 {q11}, [r2], r3
+ vqrshrun.s16 d21, q14, #5
+ bne loop_16x16_plane
+
+ vadd.s16 q13, q13, q3
+ vadd.s16 q14, q14, q3
+ vqrshrun.s16 d22, q13, #5
+ vst1.32 {q10}, [r2], r3
+ vqrshrun.s16 d23, q14, #5
+ vst1.32 {q11}, [r2], r3
+
+ ldmfd sp!, {r4-r10, r12, pc}
+
+
+
diff --git a/common/arm/ih264_intra_pred_luma_4x4_a9q.s b/common/arm/ih264_intra_pred_luma_4x4_a9q.s
new file mode 100755
index 0000000..cb386ea
--- /dev/null
+++ b/common/arm/ih264_intra_pred_luma_4x4_a9q.s
@@ -0,0 +1,842 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_luma_4x4_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra 4x4 Luma prediction.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* -ih264_intra_pred_luma_4x4_mode_vert_a9q
+@* -ih264_intra_pred_luma_4x4_mode_horz_a9q
+@* -ih264_intra_pred_luma_4x4_mode_dc_a9q
+@* -ih264_intra_pred_luma_4x4_mode_diag_dl_a9q
+@* -ih264_intra_pred_luma_4x4_mode_diag_dr_a9q
+@* -ih264_intra_pred_luma_4x4_mode_vert_r_a9q
+@* -ih264_intra_pred_luma_4x4_mode_horz_d_a9q
+@* -ih264_intra_pred_luma_4x4_mode_vert_l_a9q
+@* -ih264_intra_pred_luma_4x4_mode_horz_u_a9q
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+
+.text
+.p2align 2
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_vert
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:vertical
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_luma_4x4_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_4x4_mode_vert_a9q
+
+ih264_intra_pred_luma_4x4_mode_vert_a9q:
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #5
+
+ vld1.32 d0[0], [r0]
+
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+
+
+
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_horz
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+
+ .global ih264_intra_pred_luma_4x4_mode_horz_a9q
+
+ih264_intra_pred_luma_4x4_mode_horz_a9q:
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ add r0, r0, #3
+ mov r2 , #-1
+
+ ldrb r5, [r0], r2
+ vdup.u8 d0, r5
+ ldrb r6, [r0], r2
+ vst1.32 d0[0], [r1], r3
+ vdup.u8 d1, r6
+ ldrb r7, [r0], r2
+ vst1.32 d1[0], [r1], r3
+ vdup.u8 d2, r7
+ ldrb r8, [r0], r2
+ vst1.32 d2[0], [r1], r3
+ vdup.u8 d3, r8
+ vst1.32 d3[0], [r1], r3
+
+
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_dc
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+
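+@ DC value computed below (a sketch derived from the code):
+@   left and top available  : dc = (sum_left4 + sum_top4 + 4) >> 3
+@   only one side available : dc = (sum4 + 2) >> 2
+@   none available          : dc = 128
+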
+ .global ih264_intra_pred_luma_4x4_mode_dc_a9q
+
+ih264_intra_pred_luma_4x4_mode_dc_a9q:
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ ldr r4, [sp, #40] @ r4 => ui_neighboravailability
+
+ ands r5, r4, #0x01
+ beq top_available @LEFT NOT AVAILABLE
+
+ add r10, r0, #3
+ mov r2, #-1
+ ldrb r5, [r10], r2
+ ldrb r6, [r10], r2
+ ldrb r7, [r10], r2
+ add r5, r5, r6
+ ldrb r8, [r10], r2
+ add r5, r5, r7
+ ands r11, r4, #0x04 @ CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+ add r5, r5, r8
+ beq left_available
+ add r10, r0, #5
+ @ BOTH LEFT AND TOP AVAILABLE
+ ldrb r6, [r10], #1
+ ldrb r7, [r10], #1
+ add r5, r5, r6
+ ldrb r8, [r10], #1
+ add r5, r5, r7
+ ldrb r9, [r10], #1
+ add r5, r5, r8
+ add r5, r5, r9
+ add r5, r5, #4
+ lsr r5, r5, #3
+ vdup.u8 d0, r5
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ b end_func
+
+top_available: @ ONLY TOP AVAILABLE
+ ands r11, r4, #0x04 @ CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+ beq none_available
+
+ add r10, r0, #5
+ ldrb r6, [r10], #1
+ ldrb r7, [r10], #1
+ ldrb r8, [r10], #1
+ add r5, r6, r7
+ ldrb r9, [r10], #1
+ add r5, r5, r8
+ add r5, r5, r9
+ add r5, r5, #2
+ lsr r5, r5, #2
+ vdup.u8 d0, r5
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ b end_func
+
+left_available: @ONLY LEFT AVAILABLE
+ add r5, r5, #2
+ lsr r5, r5, #2
+ vdup.u8 d0, r5
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ b end_func
+
+none_available: @NONE AVAILABLE
+ mov r5, #128
+ vdup.u8 d0, r5
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ b end_func
+
+
+end_func:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_diag_dl
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
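+@ Core filter used below (a sketch derived from the code); the remaining 4x4 angular
+@ modes reuse the same 3-tap smoothing:
+@   pred = (p[i] + 2*p[i+1] + p[i+2] + 2) >> 2   @ vaddl pairs + vqrshrun #2
+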
+ .global ih264_intra_pred_luma_4x4_mode_diag_dl_a9q
+
+ih264_intra_pred_luma_4x4_mode_diag_dl_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #5
+ sub r5, r3, #2
+ add r6, r0, #7
+ vld1.8 {d0}, [r0]
+ vext.8 d1, d0, d0, #1
+ vext.8 d2, d0, d0, #2
+ vld1.8 {d2[6]}, [r6]
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d3, q12, #2
+ vst1.32 {d3[0]}, [r1], r3
+ vext.8 d4, d3, d3, #1
+ vst1.32 {d4[0]}, [r1], r3
+ vst1.16 {d3[1]}, [r1]!
+ vst1.16 {d3[2]}, [r1], r5
+ vst1.16 {d4[1]}, [r1]!
+ vst1.16 {d4[2]}, [r1]
+
+end_func_diag_dl:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_diag_dr
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_diag_dr_a9q
+
+ih264_intra_pred_luma_4x4_mode_diag_dr_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+
+ vld1.u8 {d0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {d1}, [r0]
+ vext.8 d2, d1, d1, #1
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d3, q12, #2
+
+ vext.8 d4, d3, d3, #1
+ sub r5, r3, #2
+ vst1.16 {d4[1]}, [r1]!
+ vst1.16 {d4[2]}, [r1], r5
+ vst1.16 {d3[1]}, [r1]!
+ vst1.16 {d3[2]}, [r1], r5
+ vst1.32 {d4[0]}, [r1], r3
+ vst1.32 {d3[0]}, [r1], r3
+
+end_func_diag_dr:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_vert_r
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
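+@ Vertical-Right builds two filtered sets of the reference samples (a sketch derived from the code):
+@   2-tap: (a + b + 1) >> 1         @ vqrshrun #1
+@   3-tap: (a + 2*b + c + 2) >> 2   @ vqrshrun #2
+@ Output rows alternate between the two sets, each pair of rows shifted by one sample.
+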
+ .global ih264_intra_pred_luma_4x4_mode_vert_r_a9q
+
+ih264_intra_pred_luma_4x4_mode_vert_r_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+
+ vld1.u8 {d0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {d1}, [r0]
+ vext.8 d2, d1, d1, #1
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d3, q12, #2
+ sub r5, r3, #2
+ vext.8 d5, d3, d3, #3
+ vst1.32 {d4[1]}, [r1], r3
+ vst1.32 {d5[0]}, [r1], r3
+ sub r8, r3, #3
+ vst1.u8 {d3[2]}, [r1]!
+ vst1.16 {d4[2]}, [r1]!
+ vst1.u8 {d4[6]}, [r1], r8
+ vst1.u8 {d3[1]}, [r1]!
+ vst1.16 {d5[0]}, [r1]!
+ vst1.u8 {d5[2]}, [r1]
+
+
+end_func_vert_r:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_horz_d
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Down
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_horz_d_a9q
+
+ih264_intra_pred_luma_4x4_mode_horz_d_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ vld1.u8 {d0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {d1}, [r0]
+ vext.8 d2, d1, d0, #1
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q12, #2
+ sub r5, r3, #2
+ vmov.8 d6, d5
+ vtrn.8 d4, d5 @
+ vst1.u16 {d5[1]}, [r1]!
+ vst1.16 {d6[2]}, [r1], r5
+ vst1.u16 {d4[1]}, [r1]!
+ vst1.16 {d5[1]}, [r1], r5
+ vst1.u16 {d5[0]}, [r1]!
+ vst1.16 {d4[1]}, [r1], r5
+ vst1.u16 {d4[0]}, [r1]!
+ vst1.16 {d5[0]}, [r1], r5
+
+end_func_horz_d:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_vert_l
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_vert_l_a9q
+
+ih264_intra_pred_luma_4x4_mode_vert_l_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ add r0, r0, #4
+ vld1.u8 {d0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {d1}, [r0]
+ vext.8 d2, d1, d0, #1
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q12, #2
+ vext.8 d6, d4, d4, #1
+ vext.8 d7, d5, d5, #1
+ vst1.32 {d6[0]}, [r1], r3
+ vext.8 d16, d4, d4, #2
+ vext.8 d17, d5, d5, #2
+ vst1.32 {d7[0]}, [r1], r3
+ vst1.32 {d16[0]}, [r1], r3
+ vst1.32 {d17[0]}, [r1], r3
+
+
+
+end_func_vert_l:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_horz_u
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Up
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
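+@ Horizontal-Up filters the left column with the same 2-tap/3-tap pair used above;
+@ positions past the last reference sample are padded with the bottom-left value
+@ (a sketch derived from the code; see the vdup of r9 below).
+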
+ .global ih264_intra_pred_luma_4x4_mode_horz_u_a9q
+
+ih264_intra_pred_luma_4x4_mode_horz_u_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ mov r10, r0
+ vld1.u8 {d0}, [r0]
+ ldrb r9, [r0], #1
+ vext.8 d1, d0, d0, #1
+ vld1.u8 {d0[7]}, [r10]
+ vext.8 d2, d1, d1, #1
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q12, #2
+ vmov d6, d4
+ vext.8 d6, d5, d4, #1
+ vst1.8 {d4[2]}, [r1]!
+ vst1.8 {d6[0]}, [r1]!
+ vtrn.8 d6, d5 @
+ sub r5, r3, #2
+ vtrn.8 d4, d6 @
+ vdup.8 d7, r9
+ vst1.16 {d6[0]}, [r1], r5
+ vst1.16 {d6[0]}, [r1]!
+ vst1.16 {d5[3]}, [r1], r5
+ vst1.16 {d5[3]}, [r1]!
+ vst1.16 {d7[3]}, [r1], r5
+ vst1.32 {d7[0]}, [r1], r3
+
+end_func_horz_u:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_intra_pred_luma_8x8_a9q.s b/common/arm/ih264_intra_pred_luma_8x8_a9q.s
new file mode 100755
index 0000000..6da1c95
--- /dev/null
+++ b/common/arm/ih264_intra_pred_luma_8x8_a9q.s
@@ -0,0 +1,1037 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_luma_8x8_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra 8x8 Luma prediction.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* -ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q
+@* -ih264_intra_pred_luma_8x8_mode_vert_a9q
+@* -ih264_intra_pred_luma_8x8_mode_horz_a9q
+@* -ih264_intra_pred_luma_8x8_mode_dc_a9q
+@* -ih264_intra_pred_luma_8x8_mode_diag_dl_a9q
+@* -ih264_intra_pred_luma_8x8_mode_diag_dr_a9q
+@* -ih264_intra_pred_luma_8x8_mode_vert_r_a9q
+@* -ih264_intra_pred_luma_8x8_mode_horz_d_a9q
+@* -ih264_intra_pred_luma_8x8_mode_vert_l_a9q
+@* -ih264_intra_pred_luma_8x8_mode_horz_u_a9q
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+
+
+.text
+.p2align 2
+
+ .extern ih264_gai1_intrapred_luma_8x8_horz_u
+.hidden ih264_gai1_intrapred_luma_8x8_horz_u
+scratch_intrapred_addr_8x8:
+ .long ih264_gai1_intrapred_luma_8x8_horz_u - scrlb8x8l2 - 8
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_ref_filtering
+@*
+@* @brief
+@* Reference sample filtering process for Intra_8x8 sample prediction
+@*
+@* @par Description:
+@* Perform Reference sample filtering process for Intra_8x8 sample prediction ,described in sec 8.3.2.2.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride [Not used]
+@*
+@* @param[in] dst_strd
+@* integer destination stride[Not used]
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels[Not used]
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_ref_filtering(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+
+
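+@ Reference smoothing applied below (a sketch derived from the code):
+@   interior samples   : p'[i]   = (p[i-1] + 2*p[i] + p[i+1] + 2) >> 2
+@   first/last samples : p'[end] = (3*p[end] + p[neighbour] + 2) >> 2
+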
+ .global ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q
+
+ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vpush {d8-d15}
+
+
+ vld1.u8 {q0}, [r0]! @
+ vld1.u8 {q1}, [r0]
+ add r0, r0, #8 @
+ vext.8 q2, q0, q1, #1
+ vext.8 q3, q1, q1, #1
+ vext.8 q4, q2, q3, #1
+ vext.8 q5, q3, q3, #1
+ vld1.8 {d10[7]}, [r0] @ LOADING SRC[24] AGAIN TO THE END FOR p'[ 15, -1 ] = ( p[ 14, -1 ] + 3 * p[ 15, -1 ] + 2 ) >> 2
+ vaddl.u8 q10, d0, d4
+ vaddl.u8 q7, d0, d0 @ SPECIAL CASE FOR p'[ -1 ,7 ] = ( p[ -1, 6 ] + 3 * p[ -1, 7 ] + 2 ) >> 2
+ vadd.u16 q7, q10, q7
+ vaddl.u8 q11, d1, d5
+ vqrshrun.s16 d14, q7, #2
+ vaddl.u8 q12, d4, d8
+ vaddl.u8 q13, d5, d9
+ vst1.8 {d14[0]}, [r1]!
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+ vaddl.u8 q9, d2, d6
+ vaddl.u8 q8, d6, d10
+ vqrshrun.s16 d4, q12, #2
+ vqrshrun.s16 d5, q13, #2
+ vadd.u16 q6, q8, q9
+ vst1.8 {q2}, [r1]!
+ vqrshrun.s16 d6, q6, #2
+ vst1.8 {d6}, [r1]
+
+
+end_func_ref_filt:
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_vert
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:vertical
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:vertical, as described in sec 8.3.2.2.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_luma_8x8_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
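+@
+@ A minimal C sketch of this mode (assuming pu1_src + 9 points at the 8 top
+@ neighbours, as the "add r0, r0, #9" below suggests); the name is hypothetical:
+@
+@     static void vert_8x8_sketch(const UWORD8 *pu1_src, UWORD8 *pu1_dst, WORD32 dst_strd)
+@     {
+@         WORD32 r, c;
+@         for(r = 0; r < 8; r++)
+@             for(c = 0; c < 8; c++)
+@                 pu1_dst[r * dst_strd + c] = pu1_src[9 + c];
+@     }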
+
+
+ .global ih264_intra_pred_luma_8x8_mode_vert_a9q
+
+ih264_intra_pred_luma_8x8_mode_vert_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #9
+ vld1.8 d0, [r0]
+
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_horz
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:horizontal, as described in sec 8.3.2.2.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
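+@
+@ A minimal C sketch of this mode (assuming pu1_src[0..7] holds the left column
+@ bottom to top, which is what the d0[7], d0[6], ... duplication below suggests);
+@ the name is hypothetical:
+@
+@     static void horz_8x8_sketch(const UWORD8 *pu1_src, UWORD8 *pu1_dst, WORD32 dst_strd)
+@     {
+@         WORD32 r, c;
+@         for(r = 0; r < 8; r++)
+@             for(c = 0; c < 8; c++)
+@                 pu1_dst[r * dst_strd + c] = pu1_src[7 - r];
+@     }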
+
+
+ .global ih264_intra_pred_luma_8x8_mode_horz_a9q
+
+ih264_intra_pred_luma_8x8_mode_horz_a9q:
+
+ stmfd sp!, {r14} @store register values to stack
+
+ vld1.u8 {d0}, [r0]
+ mov r2, #6
+
+ vdup.u8 d1, d0[7]
+ vdup.u8 d2, d0[6]
+ vst1.8 {d1}, [r1], r3
+
+loop_8x8_horz:
+ vext.8 d0, d0, d0, #6
+ vst1.8 {d2}, [r1], r3
+ vdup.u8 d1, d0[7]
+ subs r2, #2
+ vdup.u8 d2, d0[6]
+ vst1.8 {d1}, [r1], r3
+ bne loop_8x8_horz
+
+ vext.8 d0, d0, d0, #6
+ vst1.8 {d2}, [r1], r3
+
+ ldmfd sp!, {pc} @restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_dc
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:DC, as described in sec 8.3.2.2.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
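+@
+@ A minimal C sketch of the DC rule implemented below; the bit tests in the code
+@ suggest bit 0 of ui_neighboravailability means "left available" and bit 2 means
+@ "top available" (an assumption), with left samples at pu1_src[0..7] and top
+@ samples at pu1_src[9..16]; left_avail/top_avail stand for those two bits:
+@
+@     WORD32 i, sum = 0, dc = 128;                 /* 128 when nothing is available */
+@     if(left_avail && top_avail)
+@     {
+@         for(i = 0; i < 8; i++)
+@             sum += pu1_src[i] + pu1_src[9 + i];
+@         dc = (sum + 8) >> 4;
+@     }
+@     else if(left_avail || top_avail)
+@     {
+@         for(i = 0; i < 8; i++)
+@             sum += left_avail ? pu1_src[i] : pu1_src[9 + i];
+@         dc = (sum + 4) >> 3;
+@     }
+@     /* every pixel of the 8x8 block is then set to dc */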
+
+
+ .global ih264_intra_pred_luma_8x8_mode_dc_a9q
+
+ih264_intra_pred_luma_8x8_mode_dc_a9q:
+
+ stmfd sp!, {r4, r14} @store register values to stack
+ ldr r4, [sp, #8] @r4 => ui_neighboravailability
+
+ ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE
+ beq top_available
+ ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+ beq left_available
+
+ vld1.u8 {d0}, [r0] @BOTH LEFT AND TOP AVAILABLE
+ add r0, r0, #9
+ vld1.u8 {d1}, [r0]
+ vpaddl.u8 q0, q0
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #4
+ vdup.u8 d0, d0[0]
+ b str_pred
+
+top_available: @ONLY TOP AVAILABLE
+    ands        r2, r4, #0x04               @CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+ beq none_available
+
+ add r0, r0, #9
+ vld1.u8 {d0}, [r0]
+ vpaddl.u8 d0, d0
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #3
+ vdup.u8 d0, d0[0]
+ b str_pred
+
+left_available: @ONLY LEFT AVAILABLE
+ vld1.u8 {d0}, [r0]
+ vpaddl.u8 d0, d0
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #3
+ vdup.u8 d0, d0[0]
+ b str_pred
+
+none_available: @NONE AVAILABLE
+ vmov.u8 q0, #128
+
+str_pred:
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+
+ ldmfd sp!, {r4, pc} @Restoring registers from stack
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_diag_dl
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left, as described in sec 8.3.2.2.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
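+@
+@ In the comments below, FILT121(a, b, c) = (a + 2*b + c + 2) >> 2 (the vqrshrun
+@ by #2 of the a+2b+c sums) and, in the later modes, FILT11(a, b) = (a + b + 1) >> 1.
+@ A minimal C sketch of this mode (diag_dl_sketch and t are hypothetical names;
+@ t = pu1_src + 9 is assumed to be the 16 top/top-right samples, with the last
+@ sample repeated for the edge case, matching the d5[6] reload below):
+@
+@     static void diag_dl_sketch(const UWORD8 *pu1_src, UWORD8 *pu1_dst, WORD32 dst_strd)
+@     {
+@         const UWORD8 *t = pu1_src + 9;
+@         UWORD8 filt121[16];
+@         WORD32 i, r, c;
+@         for(i = 0; i < 16; i++)
+@         {
+@             WORD32 p1 = (i + 1 < 16) ? t[i + 1] : t[15];
+@             WORD32 p2 = (i + 2 < 16) ? t[i + 2] : t[15];
+@             filt121[i] = (t[i] + 2 * p1 + p2 + 2) >> 2;
+@         }
+@         for(r = 0; r < 8; r++)
+@             for(c = 0; c < 8; c++)
+@                 pu1_dst[r * dst_strd + c] = filt121[r + c];
+@     }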
+
+ .global ih264_intra_pred_luma_8x8_mode_diag_dl_a9q
+
+ih264_intra_pred_luma_8x8_mode_diag_dl_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #9
+ sub r5, r3, #4
+ add r6, r0, #15
+ vld1.8 {q0}, [r0]
+ vext.8 q2, q0, q0, #2
+ vext.8 q1, q0, q0, #1
+ vld1.8 {d5[6]}, [r6]
+ @ q1 = q0 shifted to left once
+ @ q2 = q1 shifted to left once
+ vaddl.u8 q10, d0, d2 @Adding for FILT121
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+
+ vqrshrun.s16 d4, q12, #2
+ vqrshrun.s16 d5, q13, #2
+ @Q2 has all FILT121 values
+ vst1.8 {d4}, [r1], r3
+ vext.8 q9, q2, q2, #1
+ vext.8 q8, q9, q9, #1
+ vst1.8 {d18}, [r1], r3
+ vext.8 q15, q8, q8, #1
+ vst1.8 {d16}, [r1], r3
+ vst1.8 {d30}, [r1], r3
+ vst1.32 {d4[1]}, [r1]!
+ vst1.32 {d5[0]}, [r1], r5
+ vst1.32 {d18[1]}, [r1]!
+ vst1.32 {d19[0]}, [r1], r5
+ vst1.32 {d16[1]}, [r1]!
+ vst1.32 {d17[0]}, [r1], r5
+ vst1.32 {d30[1]}, [r1]!
+ vst1.32 {d31[0]}, [r1], r5
+
+
+end_func_diag_dl:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_diag_dr
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right, as described in sec 8.3.2.2.5
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
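+@
+@ A minimal C sketch: the same FILT121 smoothing runs across the combined
+@ left + top-left + top reference, and each output row then reads it through a
+@ window shifted one sample toward the top-left corner (which is what the
+@ vext #15 shifts below implement). The flat ref[] indexing is an assumption
+@ about the pu1_src layout (left, top-left, top in one array):
+@
+@     UWORD8 filt121[16];
+@     WORD32 i, r, c;
+@     for(i = 0; i < 16; i++)
+@         filt121[i] = (ref[i] + 2 * ref[i + 1] + ref[i + 2] + 2) >> 2;
+@     for(r = 0; r < 8; r++)
+@         for(c = 0; c < 8; c++)
+@             pu1_dst[r * dst_strd + c] = filt121[7 - r + c];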
+
+
+ .global ih264_intra_pred_luma_8x8_mode_diag_dr_a9q
+
+ih264_intra_pred_luma_8x8_mode_diag_dr_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+
+ vld1.u8 {q0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {q1}, [r0]
+ vext.8 q2, q1, q1, #1
+ @ q1 = q0 shifted to left once
+ @ q2 = q1 shifted to left once
+ vaddl.u8 q10, d0, d2 @Adding for FILT121
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+ vqrshrun.s16 d4, q12, #2
+ vqrshrun.s16 d5, q13, #2
+ @Q2 has all FILT121 values
+ sub r5, r3, #4
+ vext.8 q9, q2, q2, #15
+ vst1.8 {d19}, [r1], r3
+ vext.8 q8, q9, q9, #15
+ vst1.8 {d17}, [r1], r3
+ vext.8 q15, q8, q8, #15
+ vst1.8 {d31}, [r1], r3
+ vst1.32 {d4[1]}, [r1]!
+ vst1.32 {d5[0]}, [r1], r5
+ vst1.32 {d18[1]}, [r1]!
+ vst1.32 {d19[0]}, [r1], r5
+ vst1.32 {d16[1]}, [r1]!
+ vst1.32 {d17[0]}, [r1], r5
+ vst1.32 {d30[1]}, [r1]!
+ vst1.32 {d31[0]}, [r1], r5
+ vst1.8 {d4}, [r1], r3
+
+end_func_diag_dr:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_vert_r
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Right, as described in sec 8.3.2.2.6
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_8x8_mode_vert_r_a9q
+
+ih264_intra_pred_luma_8x8_mode_vert_r_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ vld1.u8 {q0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {q1}, [r0]
+ vext.8 q2, q1, q1, #1
+ @ q1 = q0 shifted to left once
+ @ q2 = q1 shifted to left once
+ vaddl.u8 q10, d0, d2
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q11, #1
+ vqrshrun.s16 d6, q12, #2
+ vqrshrun.s16 d7, q13, #2
+ @Q2 has all FILT11 values
+ @Q3 has all FILT121 values
+ sub r5, r3, #6
+ sub r6, r3, #4
+ vst1.8 {d5}, [r1], r3 @ row 0
+ vext.8 q9, q3, q3, #15
+ vmov.8 q11, q9
+ vext.8 q8, q2, q2, #1
+ vst1.8 {d19}, [r1], r3 @row 1
+
+ vmov.8 q15, q8
+ vext.8 q10, q2, q2, #15
+ vuzp.8 q8, q9
+ @row 2
+ vext.8 q14, q8, q8, #1
+ vst1.8 {d21}, [r1]
+ vst1.8 {d6[6]}, [r1], r3
+ @row 3
+
+ vst1.16 {d29[1]}, [r1]!
+ vst1.32 {d7[0]}, [r1]!
+ vst1.16 {d7[2]}, [r1], r5
+@row 4
+ vst1.16 {d19[1]}, [r1]!
+ vst1.32 {d5[0]}, [r1]!
+ vst1.16 {d5[2]}, [r1], r5
+
+@row 5
+ vext.8 q13, q9, q9, #1
+ vst1.16 {d17[1]}, [r1]!
+ vst1.32 {d23[0]}, [r1]!
+ vst1.16 {d23[2]}, [r1], r5
+
+
+@row 6
+ vst1.16 {d27[0]}, [r1]!
+ vst1.8 {d27[2]}, [r1]!
+ vst1.8 {d5[0]}, [r1]!
+ vst1.32 {d31[0]}, [r1], r6
+@row 7
+ vst1.32 {d29[0]}, [r1]!
+ vst1.32 {d7[0]}, [r1]!
+
+
+
+end_func_vert_r:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_horz_d
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Down
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Down, as described in sec 8.3.2.2.7
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_8x8_mode_horz_d_a9q
+
+ih264_intra_pred_luma_8x8_mode_horz_d_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vpush {d8-d15}
+
+ vld1.u8 {q0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {q1}, [r0]
+ vext.8 q2, q1, q1, #1
+ @ q1 = q0 shifted to left once
+ @ q2 = q1 shifted to left once
+ vaddl.u8 q10, d0, d2
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q11, #1
+ vqrshrun.s16 d6, q12, #2
+ vqrshrun.s16 d7, q13, #2
+ @Q2 has all FILT11 values
+ @Q3 has all FILT121 values
+ vmov.8 q4, q2
+ vmov.8 q5, q3
+ sub r6, r3, #6
+ vtrn.8 q4, q5 @
+ vmov.8 q6, q4
+ vmov.8 q7, q5
+ sub r5, r3, #4
+ vtrn.16 q6, q7
+ vext.8 q8, q3, q3, #14
+ @ROW 0
+ vst1.8 {d17}, [r1]
+ vst1.16 {d10[3]}, [r1], r3
+
+ @ROW 1
+ vst1.32 {d14[1]}, [r1]!
+ vst1.32 {d7[0]}, [r1], r5
+ @ROW 2
+ vst1.16 {d10[2]}, [r1]!
+ vst1.32 {d14[1]}, [r1]!
+ vst1.16 {d7[0]}, [r1], r6
+ @ROW 3
+ vst1.32 {d12[1]}, [r1]!
+ vst1.32 {d14[1]}, [r1], r5
+ @ROW 4
+ vst1.16 {d14[1]}, [r1]!
+ vst1.32 {d12[1]}, [r1]!
+ vst1.16 {d14[2]}, [r1], r6
+ @ROW 5
+ vst1.32 {d14[0]}, [r1]!
+ vst1.32 {d12[1]}, [r1], r5
+ @ROW 6
+ vst1.16 {d10[0]}, [r1]!
+ vst1.16 {d8[1]}, [r1]!
+ vst1.16 {d14[1]}, [r1]!
+ vst1.16 {d12[2]}, [r1], r6
+ @ROW 7
+ vst1.32 {d12[0]}, [r1]!
+ vst1.32 {d14[0]}, [r1], r5
+
+end_func_horz_d:
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_vert_l
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Left, as described in sec 8.3.2.2.8
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
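+@
+@ A minimal C sketch of the row pattern used below: even rows take the half-sample
+@ filter FILT11, odd rows the FILT121 filter, and every pair of rows reads the top
+@ reference one sample further along. t = pu1_src + 9 (top + top-right) is an
+@ assumption matching the "add r0, r0, #9" below:
+@
+@     WORD32 r, c, off;
+@     for(r = 0; r < 8; r++)
+@     {
+@         off = r >> 1;
+@         for(c = 0; c < 8; c++)
+@             pu1_dst[r * dst_strd + c] = (r & 1)
+@                 ? (t[off + c] + 2 * t[off + c + 1] + t[off + c + 2] + 2) >> 2
+@                 : (t[off + c] + t[off + c + 1] + 1) >> 1;
+@     }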
+
+
+ .global ih264_intra_pred_luma_8x8_mode_vert_l_a9q
+
+ih264_intra_pred_luma_8x8_mode_vert_l_a9q:
+
+    stmfd sp!, {r4-r12, r14}          @store register values to stack
+ vpush {d8-d15}
+ add r0, r0, #9
+ vld1.u8 {q0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {q1}, [r0]
+ vext.8 q2, q1, q1, #1
+ vaddl.u8 q10, d0, d2
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q11, #1
+ vqrshrun.s16 d6, q12, #2
+ vext.8 q4, q2, q2, #1
+ vqrshrun.s16 d7, q13, #2
+ @Q2 has all FILT11 values
+ @Q3 has all FILT121 values
+
+ vext.8 q5, q3, q3, #1
+ @ROW 0,1
+ vst1.8 {d4}, [r1], r3
+ vst1.8 {d6}, [r1], r3
+
+ vext.8 q6, q4, q4, #1
+ vext.8 q7, q5, q5, #1
+ @ROW 2,3
+ vst1.8 {d8}, [r1], r3
+ vst1.8 {d10}, [r1], r3
+
+ vext.8 q8, q6, q6, #1
+ vext.8 q9, q7, q7, #1
+ @ROW 4,5
+ vst1.8 {d12}, [r1], r3
+ vst1.8 {d14}, [r1], r3
+ @ROW 6,7
+ vst1.8 {d16}, [r1], r3
+ vst1.8 {d18}, [r1], r3
+
+end_func_vert_l:
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_horz_u
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Up
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Up, as described in sec 8.3.2.2.9
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_8x8_mode_horz_u_a9q
+
+ih264_intra_pred_luma_8x8_mode_horz_u_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vpush {d8-d15}
+
+ vld1.u8 {q0}, [r0]
+ vld1.u8 {d1[7]}, [r0]
+ vext.8 q1, q0, q0, #1
+ vext.8 q2, q1, q1, #1
+ @ LOADING V TABLE
+ ldr r12, scratch_intrapred_addr_8x8
+scrlb8x8l2:
+ add r12, r12, pc
+ vaddl.u8 q10, d0, d2
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+ vld1.u8 {q5}, [r12]
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q11, #1
+ vqrshrun.s16 d6, q12, #2
+ vqrshrun.s16 d7, q13, #2
+ @Q2 has all FILT11 values
+ @Q3 has all FILT121 values
+ vtbl.u8 d12, {q2, q3}, d10
+ vdup.u8 q7, d5[7] @
+ vtbl.u8 d13, {q2, q3}, d11
+ vext.8 q8, q6, q7, #2
+ vext.8 q9, q8, q7, #2
+ vst1.8 {d12}, [r1], r3
+ vext.8 q10, q9, q7, #2
+ vst1.8 {d16}, [r1], r3
+ vst1.8 {d18}, [r1], r3
+ vst1.8 {d20}, [r1], r3
+ vst1.8 {d13}, [r1], r3
+ vst1.8 {d17}, [r1], r3
+ vst1.8 {d19}, [r1], r3
+ vst1.8 {d21}, [r1], r3
+
+
+end_func_horz_u:
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+
diff --git a/common/arm/ih264_iquant_itrans_recon_a9.s b/common/arm/ih264_iquant_itrans_recon_a9.s
new file mode 100755
index 0000000..f71ca69
--- /dev/null
+++ b/common/arm/ih264_iquant_itrans_recon_a9.s
@@ -0,0 +1,871 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_iquant_itrans_recon_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for single stage inverse transform
+@ *
+@ * @author
+@ * Mohit
+@ * Harinarayanaan
+@ *
+@ * @par List of Functions:
+@ * - ih264_iquant_itrans_recon_4x4_a9()
+@ * - ih264_iquant_itrans_recon_8x8_a9()
+@ * - ih264_iquant_itrans_recon_chroma_4x4_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx
+@ WORD16 *pi2_dc_ld_addr)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
+@r8 => iq_start_idx
+@r10=> pi2_dc_ld_addr
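+@
+@ A minimal C sketch of the per-coefficient dequant below and of one pass of the
+@ 4-point inverse core transform (the assembly runs one such pass over rows and
+@ one over columns); c0..c3, o0..o3 and CLIP_U8 are hypothetical names:
+@
+@     /* dequant: scale, shift by qP/6, then round down by 4 bits (vqrshrn #4 below) */
+@     WORD32 q = (pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i]) << u4_qp_div_6;
+@     WORD16 c = (q + 8) >> 4;
+@     /* when iq_start_idx == 1 the dc term c[0] is instead taken from *pi2_dc_ld_addr */
+@
+@     /* one 4-point inverse transform pass on a row of values c0..c3 */
+@     WORD32 x0 = c0 + c2;
+@     WORD32 x1 = c0 - c2;
+@     WORD32 x2 = (c1 >> 1) - c3;
+@     WORD32 x3 = c1 + (c3 >> 1);
+@     o0 = x0 + x3;  o1 = x1 + x2;  o2 = x1 - x2;  o3 = x0 - x3;
+@     /* after both passes: recon = CLIP_U8(pred + ((o + 32) >> 6)) */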
+.text
+.p2align 2
+
+ .global ih264_iquant_itrans_recon_4x4_a9
+
+ih264_iquant_itrans_recon_4x4_a9:
+
+@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
+@If that macro value changes, this instruction must change accordingly.
+@Only one shift is done in the horizontal inverse because:
+@if u4_qp_div_6 is less than 4, the shift amount is negative (an effective right shift) and rnd_factor is non-zero;
+@if u4_qp_div_6 is greater than 4, the shift amount is positive (a left shift) and rnd_factor is 0.
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #52] @Loads u4_qp_div_6
+ ldr r4, [sp, #40] @Loads out_strd
+ vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15
+ ldr r5, [sp, #44] @Loads *pu2_iscal_mat
+
+ ldr r6, [sp, #48] @Loads *pu2_weigh_mat
+
+ ldr r8, [sp, #60] @Loads iq_start_idx
+
+ ldr r10, [sp, #64] @Load alternate dc address
+
+ vpush {d8-d15}
+@=======================DEQUANT FROM HERE===================================
+
+ vld4.s16 {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i =0..15
+ vld4.s16 {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i =0..15
+ vmul.s16 q10, q10, q13 @x[i]=(scale[i] * dequant[i]) where i = 0..7
+ vld4.s16 {d16, d17, d18, d19}, [r0] @pi2_src_tmp[i], i =0..15
+
+ vmul.s16 q11, q11, q14 @x[i]=(scale[i] * dequant[i]) where i = 8..15
+
+    subs r8, r8, #1                    @ if r8 == 1 => intra case, so result of subtraction is zero and Z flag is set
+ ldreqsh r9, [r10] @ Loads signed halfword pi2_dc_ld_addr[0], if r8==1
+
+ vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+ vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+ vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+ vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+ vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+
+ vqrshrn.s32 d0, q0, #0x4 @ D0 = c[i] = ((q[i] + 32) >> 4) where i = 0..3
+ vqrshrn.s32 d1, q1, #0x4 @ D1 = c[i] = ((q[i] + 32) >> 4) where i = 4..7
+ vqrshrn.s32 d2, q2, #0x4 @ D2 = c[i] = ((q[i] + 32) >> 4) where i = 8..11
+ vqrshrn.s32 d3, q3, #0x4 @ D3 = c[i] = ((q[i] + 32) >> 4) where i = 12..15
+
+ vmoveq.16 d0[0], r9 @ Restore dc value in case of intra, i.e. r8 == 1
+
+@========= PROCESS IDCT FROM HERE =======
+@Steps for Stage 1:
+@------------------
+ vld1.32 d30[0], [r1], r3 @I row Load pu1_pred buffer
+ vadd.s16 d4, d0, d2 @x0 = q0 + q1;
+
+ vsub.s16 d5, d0, d2 @x1 = q0 - q1;
+
+ vshr.s16 d8, d1, #1 @q0>>1
+ vshr.s16 d9, d3, #1 @q1>>1
+
+ vsub.s16 d6, d8, d3 @x2 = (q0 >> 1) - q1;
+ vadd.s16 d7, d1, d9 @x3 = q0+ (q1 >> 1);
+ vld1.32 d30[1], [r1], r3 @II row Load pu1_pred buffer
+
+ vswp d6, d7 @Reverse positions of x2 and x3
+
+ vsub.s16 q6, q2, q3 @x0-x3 and x1-x2 combined
+ vadd.s16 q5, q2, q3 @x0 + x3 and x1+x2 combined
+
+ vld1.32 d31[0], [r1], r3 @III row Load pu1_pred buf
+
+ vswp d12, d13
+@Steps for Stage 2:
+@------------------
+ vtrn.16 d10, d11
+ vtrn.16 d12, d13
+ vtrn.32 d10, d12
+ vtrn.32 d11, d13
+ vadd.s16 d14, d10, d12 @x0 = q0 + q1;
+
+ vsub.s16 d15, d10, d12 @x1 = q0 - q1;
+
+ vshr.s16 d18, d11, #1 @q0>>1
+ vshr.s16 d19, d13, #1 @q1>>1
+
+ vsub.s16 d16, d18, d13 @x2 = (q0 >> 1) - q1;
+ vadd.s16 d17, d11, d19 @x3 = q0+ (q1 >> 1);
+
+ vld1.32 d31[1], [r1], r3 @IV row Load pu1_pred buffer
+ vswp d16, d17 @Reverse positions of x2 and x3
+
+ vsub.s16 q11, q7, q8 @x0-x3 and x1-x2 combined
+ vadd.s16 q10, q7, q8 @x0 + x3 and x1+x2 combined
+
+ vswp d22, d23
+
+ vrshr.s16 q10, q10, #6 @
+ vrshr.s16 q11, q11, #6
+
+ vaddw.u8 q10, q10, d30
+ vaddw.u8 q11, q11, d31
+
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+
+ vst1.32 d0[0], [r2], r4 @I row store the value
+ vst1.32 d0[1], [r2], r4 @II row store the value
+ vst1.32 d1[0], [r2], r4 @III row store the value
+ vst1.32 d1[1], [r2] @IV row store the value
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+ @/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp
+@ WORD16 *pi2_dc_src)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
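+@
+@ Chroma differences from the luma routine above: the dc term always comes from
+@ pi2_dc_src[0] (its 2x2 dc transform is done elsewhere), and pred/recon are
+@ interleaved UV, so only alternate bytes are written back (the vbit with the
+@ 0x00ff mask below). A minimal C sketch of that write-back; CLIP_U8 and res[][]
+@ are hypothetical names for the clip helper and the transformed residual:
+@
+@     WORD32 r, c;
+@     for(r = 0; r < 4; r++)
+@         for(c = 0; c < 4; c++)
+@             pu1_out[r * out_strd + 2 * c] =
+@                 CLIP_U8(pu1_pred[r * pred_strd + 2 * c] + ((res[r][c] + 32) >> 6));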
+
+ .global ih264_iquant_itrans_recon_chroma_4x4_a9
+ih264_iquant_itrans_recon_chroma_4x4_a9:
+
+@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
+@If that macro value changes, this instruction must change accordingly.
+@Only one shift is done in the horizontal inverse because:
+@if u4_qp_div_6 is less than 4, the shift amount is negative (an effective right shift) and rnd_factor is non-zero;
+@if u4_qp_div_6 is greater than 4, the shift amount is positive (a left shift) and rnd_factor is 0.
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #52] @Loads u4_qp_div_6
+ ldr r4, [sp, #40] @Loads out_strd
+ vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15
+ ldr r5, [sp, #44] @Loads *pu2_iscal_mat
+ ldr r6, [sp, #48] @Loads *pu2_weigh_mat
+ ldr r8, [sp, #60] @loads *pi2_dc_src
+
+ vpush {d8-d15}
+@=======================DEQUANT FROM HERE===================================
+
+ vld4.s16 {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i =0..15
+ vld4.s16 {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i =0..15
+ vmul.s16 q10, q10, q13 @x[i]=(scale[i] * dequant[i]) where i = 0..7
+ vld4.s16 {d16, d17, d18, d19}, [r0] @pi2_src_tmp[i], i =0..15
+
+ vmul.s16 q11, q11, q14 @x[i]=(scale[i] * dequant[i]) where i = 8..15
+
+ vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+ vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+ vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+ vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+ vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+
+ vqrshrn.s32 d0, q0, #0x4 @ D0 = c[i] = ((q[i] + 32) >> 4) where i = 0..3
+ vqrshrn.s32 d1, q1, #0x4 @ D1 = c[i] = ((q[i] + 32) >> 4) where i = 4..7
+ vqrshrn.s32 d2, q2, #0x4 @ D2 = c[i] = ((q[i] + 32) >> 4) where i = 8..11
+ vqrshrn.s32 d3, q3, #0x4 @ D3 = c[i] = ((q[i] + 32) >> 4) where i = 12..15
+
+ ldrsh r9, [r8] @ Loads signed halfword pi2_dc_src[0]
+ vmov.16 d0[0], r9 @ Restore dc value since its chroma iq-it
+
+@========= PROCESS IDCT FROM HERE =======
+@Steps for Stage 1:
+@------------------
+ vld2.8 {d28, d29}, [r1], r3 @I row Load pu1_pred buffer
+ vadd.s16 d4, d0, d2 @x0 = q0 + q1;
+
+ vsub.s16 d5, d0, d2 @x1 = q0 - q1;
+
+ vshr.s16 d8, d1, #1 @q0>>1
+ vshr.s16 d9, d3, #1 @q1>>1
+
+ vsub.s16 d6, d8, d3 @x2 = (q0 >> 1) - q1;
+ vadd.s16 d7, d1, d9 @x3 = q0+ (q1 >> 1);
+ vld2.8 {d29, d30}, [r1], r3 @II row Load pu1_pred buffer
+
+ vswp d6, d7 @Reverse positions of x2 and x3
+
+ vsub.s16 q6, q2, q3 @x0-x3 and x1-x2 combined
+ vtrn.32 d28, d29 @ D28 -- row I and II of pu1_pred_buffer
+ vadd.s16 q5, q2, q3 @x0 + x3 and x1+x2 combined
+
+ vld2.8 {d29, d30}, [r1], r3 @III row Load pu1_pred buf
+
+ vswp d12, d13
+@Steps for Stage 2:
+@------------------
+ vtrn.16 d10, d11
+ vtrn.16 d12, d13
+ vtrn.32 d10, d12
+ vtrn.32 d11, d13
+ vadd.s16 d14, d10, d12 @x0 = q0 + q1;
+
+ vsub.s16 d15, d10, d12 @x1 = q0 - q1;
+
+ vshr.s16 d18, d11, #1 @q0>>1
+ vshr.s16 d19, d13, #1 @q1>>1
+
+ vsub.s16 d16, d18, d13 @x2 = (q0 >> 1) - q1;
+ vadd.s16 d17, d11, d19 @x3 = q0+ (q1 >> 1);
+
+ vld2.8 {d30, d31}, [r1], r3 @IV row Load pu1_pred buffer
+ vswp d16, d17 @Reverse positions of x2 and x3
+
+ vsub.s16 q11, q7, q8 @x0-x3 and x1-x2 combined
+ vtrn.32 d29, d30 @ D29 -- row III and IV of pu1_pred_buf
+ vadd.s16 q10, q7, q8 @x0 + x3 and x1+x2 combined
+
+ vswp d22, d23
+
+ vrshr.s16 q10, q10, #6 @
+ vrshr.s16 q11, q11, #6
+
+ vaddw.u8 q10, q10, d28
+ vaddw.u8 q11, q11, d29
+
+ vld1.u8 d0, [r2], r4 @Loading out buffer 16 coeffs
+ vld1.u8 d1, [r2], r4
+ vld1.u8 d2, [r2], r4
+ vld1.u8 d3, [r2], r4
+
+ sub r2, r2, r4, lsl #2
+
+    vqmovun.s16 d20, q10            @Saturate recon values to 8 bits
+ vqmovun.s16 d22, q11
+
+    vmovl.u8  q10, d20                @Move the coeffs into 16 bit
+ vmovl.u8 q11, d22 @so that we can use vbit to copy
+
+    vmov.u16  q14, #0x00ff            @Copy lsb from quantized (long) coeffs
+
+ vbit.u8 q0, q10, q14
+ vbit.u8 q1, q11, q14
+
+ vst1.u8 d0, [r2], r4
+ vst1.u8 d1, [r2], r4
+ vst1.u8 d2, [r2], r4
+ vst1.u8 d3, [r2]
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+@/*
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci8 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*64
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
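+@
+@ A minimal C sketch of the 8x8 dequant below (matching the "(q + 32) >> 6"
+@ comments); the 8-point inverse transform that follows is applied once across
+@ rows and once across columns, and the result is added to pu1_pred with a
+@ final (val + 32) >> 6 rounding and an unsigned 8-bit clamp:
+@
+@     WORD32 q = (pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i]) << u4_qp_div_6;
+@     WORD16 c = (q + 32) >> 6;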
+
+
+ .global ih264_iquant_itrans_recon_8x8_a9
+ih264_iquant_itrans_recon_8x8_a9:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #52] @Loads u4_qp_div_6
+ ldr r4, [sp, #40] @Loads out_strd
+
+ ldr r5, [sp, #44] @Loads *pu2_iscal_mat
+ ldr r6, [sp, #48] @Loads *pu2_weigh_mat
+ vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15
+ vpush {d8-d15}
+
+idct_8x8_begin:
+
+@========= DEQUANT FROM HERE ===========
+
+ vld1.32 {q13}, [r5]! @ Q13 = dequant values row 0
+ vld1.32 {q10}, [r6]! @ Q10 = scaling factors row 0
+ vld1.32 {q14}, [r5]! @ Q14 = dequant values row 1
+ vmul.s16 q10, q10, q13 @ Q10 = x[i] = (scale[i] * dequant[i]) where i = 0..7
+ vld1.32 {q11}, [r6]! @ Q11 = scaling factors row 1
+ vld1.32 {q8}, [r0]! @ Q8 = Source row 0
+ vmul.s16 q11, q11, q14 @ Q11 = x[i] = (scale[i] * dequant[i]) where i = 8..15
+ vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ vld1.32 {q9}, [r0]! @ Q8 = Source row 1
+ vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ vld1.32 {q13}, [r6]! @ Scaling factors row 2
+ vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+ vld1.32 {q14}, [r6]! @ Scaling factors row 3
+ vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+ vld1.32 {q10}, [r5]! @ Q10 = Dequant values row 2
+ vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+ vld1.32 {q8}, [r0]! @ Source Row 2
+ vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+ vld1.32 {q11}, [r5]! @ Q11 = Dequant values row 3
+ vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+ vld1.32 {q9}, [r0]! @ Source Row 3
+ vmul.s16 q10, q10, q13 @ Dequant row2*scale matrix row 2
+ vmul.s16 q11, q11, q14 @ Dequant row 3*scale matrix row 3
+ vld1.32 {q4}, [r6]! @ Scaling factors row 4
+ vqrshrn.s32 d0, q0, #0x6 @ D0 = c[i] = ((q[i] + 32) >> 6) where i = 0..3
+ vqrshrn.s32 d1, q1, #0x6 @ D1 = c[i] = ((q[i] + 32) >> 6) where i = 4..7
+ vld1.32 {q5}, [r6]! @ Scaling factors row 5
+ vqrshrn.s32 d2, q2, #0x6 @ D2 = c[i] = ((q[i] + 32) >> 6) where i = 8..11
+ vqrshrn.s32 d3, q3, #0x6 @ D3 = c[i] = ((q[i] + 32) >> 6) where i = 12..15
+ vld1.32 {q13}, [r5]! @ Q13 = Dequant values row 4
+ vmull.s16 q2, d16, d20 @ p[i] = (x[i] * trns_coeff[i]) where i=16..19
+ vmull.s16 q3, d17, d21 @ p[i] = (x[i] * trns_coeff[i]) where i=20..23
+ vld1.32 {q12}, [r5]! @ Q12 = Dequant values row 5
+ vmull.s16 q6, d18, d22 @ p[i] = (x[i] * trns_coeff[i]) where i=24..27
+ vmull.s16 q7, d19, d23 @ p[i] = (x[i] * trns_coeff[i]) where i=28..31
+
+ vld1.32 {q14}, [r0]! @ Source row 4
+ vmul.s16 q10, q4, q13 @ Dequant row4*scale matrix row 4
+ vmul.s16 q11, q5, q12 @ Dequant row5*scale matrix row 5
+ vld1.32 {q9}, [r0]! @ Source row 5
+ vshl.s32 q2, q2, q15 @
+ vshl.s32 q3, q3, q15 @
+ vld1.32 {q13}, [r6]! @ Scaling factors row 6
+ vshl.s32 q6, q6, q15 @
+ vshl.s32 q7, q7, q15 @
+ vmull.s16 q4, d28, d20 @ i = 32..35
+ vqrshrn.s32 d4, q2, #0x6 @ D4 = c[i] = ((q[i] + 32) >> 6) where i = 16..19
+ vqrshrn.s32 d5, q3, #0x6 @ D5 = c[i] = ((q[i] + 32) >> 6) where i = 20..23
+ vmull.s16 q5, d29, d21 @ i =36..39
+ vld1.32 {q10}, [r5]! @ Dequant values row 6
+ vqrshrn.s32 d6, q6, #0x6 @ D6 = c[i] = ((q[i] + 32) >> 6) where i = 24..27
+ vqrshrn.s32 d7, q7, #0x6 @ D7 = c[i] = ((q[i] + 32) >> 6) where i = 28..31
+ vld1.32 {q14}, [r6]! @ Scaling factors row 7
+ vmull.s16 q6, d18, d22 @
+ vld1.32 {q8}, [r0]! @ Source row 6
+ vmull.s16 q7, d19, d23 @
+ vld1.32 {q11}, [r5]! @ Dequant values row 7
+ vshl.s32 q4, q4, q15 @
+ vld1.32 {q9}, [r0]! @ Source row 7
+ vshl.s32 q5, q5, q15 @
+
+ vshl.s32 q6, q6, q15 @
+ vshl.s32 q7, q7, q15 @
+ vmul.s16 q10, q10, q13 @ Dequant*scaling row 6
+ vmul.s16 q11, q11, q14 @ Dequant*scaling row 7
+ vqrshrn.s32 d8, q4, #0x6 @ D8 = c[i] = ((q[i] + 32) >> 6) where i = 32..35
+ vqrshrn.s32 d9, q5, #0x6 @ D9 = c[i] = ((q[i] + 32) >> 6) where i = 36..39
+ vqrshrn.s32 d10, q6, #0x6 @ D10 = c[i] = ((q[i] + 32) >> 6) where i = 40..43
+ vqrshrn.s32 d11, q7, #0x6 @ D11 = c[i] = ((q[i] + 32) >> 6) where i = 44..47
+ vmull.s16 q6, d16, d20 @ i= 48..51
+ vmull.s16 q7, d17, d21 @ i= 52..55
+ vmull.s16 q8, d18, d22 @ i=56..59
+ vmull.s16 q9, d19, d23 @ i=60..63
+ vshl.s32 q6, q6, q15 @
+ vzip.s16 q0, q1 @Transpose
+ vshl.s32 q7, q7, q15 @
+ vshl.s32 q8, q8, q15 @
+ vzip.s16 q2, q3 @
+ vshl.s32 q9, q9, q15 @
+ vqrshrn.s32 d12, q6, #0x6 @ D12 = c[i] = ((q[i] + 32) >> 6) where i = 48..51
+ vzip.s16 q4, q5 @Transpose
+ vqrshrn.s32 d13, q7, #0x6 @ D13 = c[i] = ((q[i] + 32) >> 6) where i = 52..55
+ vqrshrn.s32 d14, q8, #0x6 @ D14 = c[i] = ((q[i] + 32) >> 6) where i = 56..59
+ vzip.s32 q0, q2 @Transpose
+ vqrshrn.s32 d15, q9, #0x6 @ D15 = c[i] = ((q[i] + 32) >> 6) where i = 60..63
+
+@========= PROCESS IDCT FROM HERE =======
+
+@Steps for Stage 2:
+@------------------
+
+@ TRANSPOSE 8x8 coeffs to actual order
+
+ vzip.s16 q6, q7 @
+
+ vzip.s32 q1, q3 @
+ vzip.s32 q4, q6 @
+ vzip.s32 q5, q7 @
+
+ vswp d1, d8 @ Q0/Q1 = Row order x0/x1
+ vswp d3, d10 @ Q2/Q3 = Row order x2/x3
+ vswp d5, d12 @ Q4/Q5 = Row order x4/x5
+ vswp d7, d14 @ Q6/Q7 = Row order x6/x7
+
+ vswp q1, q4 @
+ vshr.s16 q10, q2, #0x1 @
+ vswp q3, q6 @
+
+@Steps for Stage 1:
+@------------------
+
+ vadd.s16 q8, q0, q4 @ Q8 = y0
+ vsub.s16 q9, q0, q4 @ Q9 = y2
+
+ vsra.s16 q2, q6, #0x1 @ Q2 = y6
+ vsub.s16 q6, q10, q6 @ Q6 = y4
+
+ vaddl.s16 q12, d14, d2 @ y3 (0-3) 1+7
+ vaddl.s16 q13, d15, d3 @ y3 (4-7) 1+7
+
+ vsubl.s16 q10, d14, d2 @ y5 (0-3) 7-1
+ vsubl.s16 q11, d15, d3 @ y5 (4-7) 7-1
+
+ vadd.s16 q0, q8, q2 @ Q0 = z0
+ vsub.s16 q4, q8, q2 @ Q4 = z6
+
+ vadd.s16 q8, q9, q6 @ Q8 = z2
+ vsub.s16 q2, q9, q6 @ Q2 = z4
+
+ vsubw.s16 q12, q12, d6 @ y3 (0-3) 1+7-3
+ vsubw.s16 q13, q13, d7 @ y3 (0-7) 1+7-3
+
+ vshr.s16 q6, q3, #0x1 @
+
+ vaddw.s16 q10, q10, d10 @
+ vaddw.s16 q11, q11, d11 @
+
+ vshr.s16 q9, q5, #0x1 @
+
+ vsubw.s16 q12, q12, d12 @
+ vsubw.s16 q13, q13, d13 @
+
+ vaddw.s16 q10, q10, d18 @
+ vaddw.s16 q11, q11, d19 @
+
+ vqmovn.s32 d12, q12 @
+ vaddl.s16 q12, d10, d6 @
+ vqmovn.s32 d13, q13 @ Q6 = y3
+ vaddl.s16 q13, d11, d7 @
+ vqmovn.s32 d18, q10 @
+ vsubl.s16 q10, d10, d6 @
+ vqmovn.s32 d19, q11 @ Q9 = y5
+ vsubl.s16 q11, d11, d7 @
+
+ vshr.s16 q3, q6, #0x2 @
+
+ vsra.s16 q6, q9, #0x2 @ Q6 = z3
+
+ vaddw.s16 q12, q12, d2 @
+ vaddw.s16 q13, q13, d3 @
+
+ vshr.s16 q1, #0x1 @
+
+ vsub.s16 q5, q3, q9 @ Q5 = z5
+
+ vsubw.s16 q10, q10, d14 @
+ vsubw.s16 q11, q11, d15 @
+
+ vshr.s16 q7, #0x1 @
+
+ vaddw.s16 q12, q12, d2 @
+ vaddw.s16 q13, q13, d3 @
+
+ vsubw.s16 q10, q10, d14 @
+ vsubw.s16 q11, q11, d15 @
+
+
+ vqmovn.s32 d14, q12 @
+ vadd.s16 q1, q8, q5 @ Q1 = x1
+ vqmovn.s32 d15, q13 @ Q7 = y7
+ vsub.s16 q3, q8, q5 @ Q3 = x6
+ vqmovn.s32 d18, q10 @
+ vsub.s16 q5, q2, q6 @ Q5 = x5
+ vqmovn.s32 d19, q11 @ Q9 = y1
+ vadd.s16 q2, q2, q6 @ Q2 = x2
+
+ vshr.s16 q12, q9, #0x2 @
+ vsra.s16 q9, q7, #0x2 @ Q9 = z1
+
+ vsub.s16 q11, q7, q12 @ Q11 = z7
+
+ vadd.s16 q6, q4, q9 @ Q6 = x3
+ vsub.s16 q4, q4, q9 @ Q4 = x4
+
+ vsub.s16 q7, q0, q11 @ Q7 = x7
+ vadd.s16 q0, q0, q11 @ Q0 = x0
+
+ vswp.s16 q3, q6 @ Q3 = x3, Q6 = x6
+
+
+@Steps for Stage 2:
+@------------------
+
+@ TRANSPOSE 8x8 coeffs to actual order
+
+ vzip.s16 q0, q1 @
+ vzip.s16 q2, q3 @
+ vzip.s16 q4, q5 @
+ vzip.s16 q6, q7 @
+
+ vzip.s32 q0, q2 @
+ vzip.s32 q1, q3 @
+ vzip.s32 q4, q6 @
+ vzip.s32 q5, q7 @
+
+ vswp d1, d8 @ Q0/Q1 = Row order x0/x1
+ vswp d3, d10 @ Q2/Q3 = Row order x2/x3
+ vswp d5, d12 @ Q4/Q5 = Row order x4/x5
+ vswp d7, d14 @ Q6/Q7 = Row order x6/x7
+
+ vswp q1, q4 @
+ vshr.s16 q10, q2, #0x1 @
+ vswp q3, q6 @
+
+@Steps for Stage 3:
+@------------------
+
+@Repeat stage 1 again for vertical transform
+
+ vadd.s16 q8, q0, q4 @ Q8 = y0
+    vld1.32 d28, [r1], r3      @ load pu1_pred row 0
+ vsub.s16 q9, q0, q4 @ Q9 = y2
+
+ vsra.s16 q2, q6, #0x1 @ Q2 = y6
+ vsub.s16 q6, q10, q6 @ Q6 = y4
+
+ vaddl.s16 q12, d14, d2 @
+    vld1.32 d29, [r1], r3      @ load pu1_pred row 1
+ vaddl.s16 q13, d15, d3 @
+
+ vsubl.s16 q10, d14, d2 @
+    vld1.32 d30, [r1], r3      @ load pu1_pred row 2
+ vsubl.s16 q11, d15, d3 @
+
+ vadd.s16 q0, q8, q2 @ Q0 = z0
+    vld1.32 d31, [r1], r3      @ load pu1_pred row 3
+ vsub.s16 q4, q8, q2 @ Q4 = z6
+
+ vadd.s16 q8, q9, q6 @ Q8 = z2
+ vsub.s16 q2, q9, q6 @ Q2 = z4
+
+ vsubw.s16 q12, q12, d6 @
+ vsubw.s16 q13, q13, d7 @
+
+ vshr.s16 q6, q3, #0x1 @
+
+ vaddw.s16 q10, q10, d10 @
+ vaddw.s16 q11, q11, d11 @
+
+ vshr.s16 q9, q5, #0x1 @
+
+ vsubw.s16 q12, q12, d12 @
+ vsubw.s16 q13, q13, d13 @
+
+ vaddw.s16 q10, q10, d18 @
+ vaddw.s16 q11, q11, d19 @
+
+ vqmovn.s32 d12, q12 @
+ vaddl.s16 q12, d10, d6 @
+ vqmovn.s32 d13, q13 @ Q6 = y3
+ vaddl.s16 q13, d11, d7 @
+ vqmovn.s32 d18, q10 @
+ vsubl.s16 q10, d10, d6 @
+ vqmovn.s32 d19, q11 @ Q9 = y5
+ vsubl.s16 q11, d11, d7 @
+
+ vshr.s16 q3, q6, #0x2 @
+
+ vsra.s16 q6, q9, #0x2 @ Q6 = z3
+
+ vaddw.s16 q12, q12, d2 @
+ vaddw.s16 q13, q13, d3 @
+
+ vshr.s16 q1, #0x1 @
+
+ vsub.s16 q5, q3, q9 @ Q5 = z5
+
+ vsubw.s16 q10, q10, d14 @
+ vsubw.s16 q11, q11, d15 @
+
+ vshr.s16 q7, #0x1 @
+
+ vaddw.s16 q12, q12, d2 @
+ vaddw.s16 q13, q13, d3 @
+
+ vsubw.s16 q10, q10, d14 @
+ vsubw.s16 q11, q11, d15 @
+
+ vqmovn.s32 d14, q12 @
+ vadd.s16 q1, q8, q5 @ Q1 = x1
+ vqmovn.s32 d15, q13 @ Q7 = y7
+ vsub.s16 q3, q8, q5 @ Q3 = x6
+ vqmovn.s32 d18, q10 @
+ vsub.s16 q5, q2, q6 @ Q5 = x5
+ vqmovn.s32 d19, q11 @ Q9 = y1
+ vadd.s16 q2, q2, q6 @ Q2 = x2
+
+ vshr.s16 q12, q9, #0x2 @
+ vsra.s16 q9, q7, #0x2 @ Q9 = z1
+
+ vsub.s16 q11, q7, q12 @ Q11 = z7
+
+ vadd.s16 q6, q4, q9 @ Q6 = x3
+ vsub.s16 q4, q4, q9 @ Q4 = x4
+
+ vsub.s16 q7, q0, q11 @ Q7 = x7
+ vadd.s16 q0, q0, q11 @ Q0 = x0
+
+ vswp.s16 q3, q6 @ Q3 <-> Q6
+
+ vrshr.s16 q1, q1, #6 @
+    vld1.32 d16, [r1], r3      @ load pu1_pred row 4
+ vrshr.s16 q2, q2, #6 @
+ vrshr.s16 q4, q4, #6 @
+    vld1.32 d17, [r1], r3      @ load pu1_pred row 5
+ vrshr.s16 q5, q5, #6 @
+ vrshr.s16 q7, q7, #6 @
+    vld1.32 d18, [r1], r3      @ load pu1_pred row 6
+ vrshr.s16 q0, q0, #6 @
+ vrshr.s16 q3, q3, #6 @
+    vld1.32 d19, [r1], r3      @ load pu1_pred row 7
+ vrshr.s16 q6, q6, #6 @
+
+@ Add the prediction rows and saturate the results to 8 bits
+
+ vaddw.u8 q0, q0, d28
+ vaddw.u8 q1, q1, d29
+ vaddw.u8 q2, q2, d30
+ vaddw.u8 q3, q3, d31
+ vqmovun.s16 d0, q0
+ vaddw.u8 q4, q4, d16
+ vqmovun.s16 d1, q1
+ vaddw.u8 q5, q5, d17
+ vqmovun.s16 d2, q2
+ vaddw.u8 q6, q6, d18
+ vqmovun.s16 d3, q3
+ vaddw.u8 q7, q7, d19
+
+ vqmovun.s16 d4, q4
+    vst1.32 d0, [r2], r4        @ store recon row 0
+    vqmovun.s16 d5, q5
+    vst1.32 d1, [r2], r4        @ store recon row 1
+    vqmovun.s16 d6, q6
+    vst1.32 d2, [r2], r4        @ store recon row 2
+    vqmovun.s16 d7, q7
+    vst1.32 d3, [r2], r4        @ store recon row 3
+    vst1.32 d4, [r2], r4        @ store recon row 4
+
+    vst1.32 d5, [r2], r4        @ store recon row 5
+
+    vst1.32 d6, [r2], r4        @ store recon row 6
+
+    vst1.32 d7, [r2], r4        @ store recon row 7
+
+idct_8x8_end:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15}
+
diff --git a/common/arm/ih264_iquant_itrans_recon_dc_a9.s b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
new file mode 100755
index 0000000..8d71bdb
--- /dev/null
+++ b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
@@ -0,0 +1,399 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_iquant_itrans_recon_dc_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for single stage inverse transform
+@ *
+@ * @author
+@ * Mohit
+@ *
+@ * @par List of Functions:
+@ * - ih264_iquant_itrans_recon_4x4_dc_a9()
+@ * - ih264_iquant_itrans_recon_8x8_dc_a9()
+@ * - ih264_iquant_itrans_recon_chroma_4x4_dc_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+@ * for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is
+@ * non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx
+@ WORD16 *pi2_dc_ld_addr)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
+@r9 => iq_start_idx
+@unused => pi2_dc_ld_addr
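+@
+@ A minimal C sketch of this dc-only fast path, following the scalar arithmetic
+@ done below in r6 (CLIP_U8 is a hypothetical clip-to-8-bit helper):
+@
+@     WORD32 q0 = (((pi2_src[0] * pu2_iscal_mat[0] * pu2_weigh_mat[0]) << u4_qp_div_6) + 8) >> 4;
+@     if(iq_start_idx == 1)
+@         q0 = pi2_src[0];               /* intra: dc already handled separately */
+@     WORD32 dc = (q0 + 32) >> 6;
+@     /* recon[r][c] = CLIP_U8(pred[r][c] + dc) for the whole 4x4 block */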
+
+.text
+.p2align 2
+
+ .global ih264_iquant_itrans_recon_4x4_dc_a9
+
+ih264_iquant_itrans_recon_4x4_dc_a9:
+
+@Only one shift is done in the horizontal inverse because:
+@if u4_qp_div_6 is less than 4, the shift amount is negative (an effective right shift) and rnd_factor is non-zero;
+@if u4_qp_div_6 is greater than 4, the shift amount is positive (a left shift) and rnd_factor is 0.
+
+ stmfd sp!, {r4-r10, r14} @stack stores the values of the arguments
+ ldr r5, [sp, #36] @Loads *pu2_iscal_mat
+ ldr r6, [sp, #40] @Loads *pu2_weigh_mat
+ ldrsh r8, [r0] @load pi2_src[0], SH for signed halfword load
+ ldrh r6, [r6] @load pu2_weight_mat[0] , H for unsigned halfword load
+ ldrh r5, [r5] @load pu2_iscal_mat[0] , H for unsigned halfword load
+@=======================DEQUANT FROM HERE===================================
+ mul r6, r6, r5 @pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r7, [sp, #44] @Loads u4_qp_div_6
+ mul r6, r6, r8 @pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r4, [sp, #32] @Loads out_strd
+ ldr r9, [sp, #52] @Loads iq_start_idx
+
+ lsl r6, r6, r7 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6
+ add r6, r6, #8 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + rnd_fact
+ asr r6, r6, #4 @q0 = (pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + rnd_fact)<<(u4_qp_div_6-4)
+
+    subs      r9, r9, #1                 @ if r9 == 1 => intra case, so result of subtraction is zero and Z flag is set
+ ldreqsh r10, [r0] @ Loads signed halfword pi2_src[0], if r9==1
+ moveq r6, r10 @ Restore dc value in case of intra, i.e. r9 == 1
+
+ add r6, r6, #32 @i_macro = q0 + 32
+ asr r6, r6, #6 @i_macro >>6 = DC output of 2-stage transform
+ vdup.s16 q0, r6 @copy transform output to Q0
+
+ vld1.32 d30[0], [r1], r3 @I row Load pu1_pred buffer
+
+ vld1.32 d30[1], [r1], r3 @II row Load pu1_pred buffer
+
+ vld1.32 d31[0], [r1], r3 @III row Load pu1_pred buf
+
+ vld1.32 d31[1], [r1], r3 @IV row Load pu1_pred buffer
+ vaddw.u8 q10, q0, d30
+
+ vaddw.u8 q11, q0, d31
+
+ vqmovun.s16 d0, q10
+
+ vst1.32 d0[0], [r2], r4 @I row store the value
+ vqmovun.s16 d1, q11
+ vst1.32 d0[1], [r2], r4 @II row store the value
+ vst1.32 d1[0], [r2], r4 @III row store the value
+ vst1.32 d1[1], [r2] @IV row store the value
+
+ ldmfd sp!, {r4-r10, r15} @Reload the registers from SP
+
+
+
+
+@/*
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block
+@ * for dc input pattern only, i.e. only the (0,0) element of the input 8x8 block is
+@ * non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci8 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*64
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
+
+
+ .global ih264_iquant_itrans_recon_8x8_dc_a9
+ih264_iquant_itrans_recon_8x8_dc_a9:
+
+ stmfd sp!, {r4-r8, r14} @stack stores the values of the arguments
+ ldr r5, [sp, #28] @Loads *pu2_iscal_mat
+ ldr r6, [sp, #32] @Loads *pu2_weigh_mat
+ ldrsh r8, [r0] @load pi2_src[0], SH for signed halfword load
+ ldrh r6, [r6] @load pu2_weight_mat[0] , H for unsigned halfword load
+ ldrh r5, [r5] @load pu2_iscal_mat[0] , H for unsigned halfword load
+@=======================DEQUANT FROM HERE===================================
+ mul r6, r6, r5 @pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r7, [sp, #36] @Loads u4_qp_div_6
+ mul r6, r6, r8 @pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r4, [sp, #24] @Loads out_strd
+
+ vpush {d8-d15}
+ lsl r6, r6, r7 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6
+ add r6, r6, #32 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + rnd_fact
+ asr r6, r6, #6 @q0 = (pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + rnd_fact)<<(u4_qp_div_6-4)
+ add r6, r6, #32 @i_macro = q0 + 32
+ asr r6, r6, #6 @i_macro >>6 = DC output of 2-stage transform
+ vdup.s16 q8, r6 @copy transform output to Q0
+
+    vld1.32 d24, [r1], r3       @ load pu1_pred row 0
+
+    vld1.32 d25, [r1], r3       @ load pu1_pred row 1
+
+    vld1.32 d26, [r1], r3       @ load pu1_pred row 2
+    vaddw.u8  q0, q8, d24
+    vld1.32 d27, [r1], r3       @ load pu1_pred row 3
+    vaddw.u8  q1, q8, d25
+    vld1.32 d28, [r1], r3       @ load pu1_pred row 4
+    vaddw.u8  q2, q8, d26
+    vld1.32 d29, [r1], r3       @ load pu1_pred row 5
+    vaddw.u8  q3, q8, d27
+    vld1.32 d30, [r1], r3       @ load pu1_pred row 6
+    vaddw.u8  q4, q8, d28
+    vld1.32 d31, [r1], r3       @ load pu1_pred row 7
+
+@ Add the dc term to the prediction rows and saturate to 8 bits
+
+
+ vqmovun.s16 d0, q0
+ vaddw.u8 q5, q8, d29
+ vqmovun.s16 d1, q1
+ vaddw.u8 q6, q8, d30
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vaddw.u8 q7, q8, d31
+ vqmovun.s16 d4, q4
+ vqmovun.s16 d5, q5
+    vst1.32 d0, [r2], r4        @ store recon row 0
+    vqmovun.s16 d6, q6
+    vst1.32 d1, [r2], r4        @ store recon row 1
+    vqmovun.s16 d7, q7
+    vst1.32 d2, [r2], r4        @ store recon row 2
+    vst1.32 d3, [r2], r4        @ store recon row 3
+    vst1.32 d4, [r2], r4        @ store recon row 4
+    vst1.32 d5, [r2], r4        @ store recon row 5
+    vst1.32 d6, [r2], r4        @ store recon row 6
+    vst1.32 d7, [r2], r4        @ store recon row 7
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r8, r15}
+
+
+@ /*
+@ ********************************************************************************
+@ *
+@ * @brief This function reconstructs a 4x4 sub block from quantized resiude and
+@ * prediction buffer if only dc value is present for residue
+@ *
+@ * @par Description:
+@ * The quantized residue is first inverse quantized,
+@ * This inverse quantized content is added to the prediction buffer to recon-
+@ * struct the end output
+@ *
+@ * @param[in] pi2_src
+@ * quantized dc coeffiient
+@ *
+@ * @param[in] pu1_pred
+@ * prediction 4x4 block in interleaved format
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction buffer stride in interleaved format
+@ *
+@ * @param[in] out_strd
+@ * recon buffer Stride
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD16 *pi2_tmp,
+@ WORD16 *pi2_dc_src)
+@ Register Usage
+@ r0 : pi2_src
+@ r1 : pu1_pred
+@ r2 : pu1_out
+@ r3 : pred_strd
+@ Neon registers d0-d7, d16-d30 are used
+@ No need for pushing arm and neon registers
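+@
+@ Rough C equivalent of this DC-only chroma path (illustrative sketch; the loop
+@ bounds and the interleaved CbCr layout are inferred from the comments above,
+@ this is not the library's C reference):
+@
+@     WORD32 i_macro = (pi2_dc_src[0] + 32) >> 6;   /* rounded DC of the transform */
+@     for(WORD32 y = 0; y < 4; y++)
+@         for(WORD32 x = 0; x < 4; x++)
+@         {
+@             /* interleaved UV: only alternate bytes belong to the current plane */
+@             WORD32 val = pu1_pred[y * pred_strd + 2 * x] + i_macro;
+@             pu1_out[y * out_strd + 2 * x] = CLIP_U8(val);
+@         }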
+ .global ih264_iquant_itrans_recon_chroma_4x4_dc_a9
+ih264_iquant_itrans_recon_chroma_4x4_dc_a9:
+
+ ldr r0, [sp, #20]
+ vld1.s16 d0, [r0] @load pi2_dc_src
+
+ ldr r0, [sp] @load out_strd
+
+    vld2.s8       {d2, d3}, [r1], r3 @load pred plane 1 => d2 & pred plane 2 => d3
+ vld2.s8 {d3, d4}, [r1], r3
+ vrshr.s16 d0, d0, #6 @i_macro = ((q0 + 32) >> 6);
+ vld2.s8 {d4, d5}, [r1], r3
+ vld2.s8 {d5, d6}, [r1], r3
+
+    vdup.s16      q0, d0[0]     @duplicate the dc value across q0
+ mov r1, r2 @backup pu1_out
+
+ vtrn.32 d2, d3 @mov the 4 coeffs of current block to d2
+ vtrn.32 d4, d5
+
+ vmov.u16 q15, #0x00ff
+
+    vld1.u8       d18, [r2], r0 @load 8 bytes of the existing out row (to keep the other plane's samples)
+ vaddw.u8 q1, q0, d2 @Add pred
+ vld1.u8 d19, [r2], r0
+ vaddw.u8 q2, q0, d4
+ vld1.u8 d20, [r2], r0
+ vld1.u8 d21, [r2], r0
+
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d4, q2
+
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+
+ vbit.u8 q9, q1, q15
+ vbit.u8 q10, q2, q15
+
+ vst1.u8 d18, [r1], r0 @store out
+ vst1.u8 d19, [r1], r0
+ vst1.u8 d20, [r1], r0
+ vst1.u8 d21, [r1], r0
+
+ bx lr
+
+
+
+
+
+
+
diff --git a/common/arm/ih264_itrans_recon_a9.s b/common/arm/ih264_itrans_recon_a9.s
new file mode 100755
index 0000000..1d74da5
--- /dev/null
+++ b/common/arm/ih264_itrans_recon_a9.s
@@ -0,0 +1,216 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_itrans_recon_neon_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for single stage inverse transform
+@ *
+@ *
+@ * @par List of Functions:
+@ * - ih264_itrans_recon_4x4_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs Inverse transform type Ci4 for 4*4 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi16_levelBlock
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] puc_predBuffer
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] puc_reconPic
+@ * Output 4x4 block
+@ *
+@ * @param[in] ui16_picWidth
+@ * Input stride
+@ *
+@ * @param[in] pred_strd
+@ * Prediction stride
+@ *
+@ * @param[in] dst_strd
+@ * Output Stride
+@ *
+@ * @param[in] zero_cols
+@ * Zero columns in pi2_src
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_itrans_recon_4x4(
+@ WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_recon,
+@ WORD32 src_strd,
+@ WORD32 pred_strd,
+@ WORD32 dst_strd,
+@ UWORD32 q_lev, //quantizer level
+@ WORD32 *pi4_tmp)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_recon
+@r3 => src_strd
+@r4 => pred_strd
+@r5 => dst_strd
+@r6 => q_lev
+@r7 => *pi4_tmp
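+
+@ For reference, a plain C sketch of the inverse transform and reconstruction the
+@ NEON code below implements (illustrative only; the temporaries are assumptions,
+@ not the library's C version):
+@
+@     /* 1-D inverse core on four coefficients r[0..3] */
+@     x0 = r[0] + r[2];          x1 = r[0] - r[2];
+@     x2 = (r[1] >> 1) - r[3];   x3 = r[1] + (r[3] >> 1);
+@     o[0] = x0 + x3;  o[1] = x1 + x2;  o[2] = x1 - x2;  o[3] = x0 - x3;
+@
+@     /* applied first to the rows and then to the columns; each final value is
+@        rounded as (val + 32) >> 6, added to the prediction and clipped:      */
+@     pu1_recon[pos] = CLIP_U8(((val + 32) >> 6) + pu1_pred[pos]);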
+
+.text
+.p2align 2
+
+
+ .global ih264_itrans_recon_4x4_a9
+
+ih264_itrans_recon_4x4_a9:
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ lsl r3, r3, #1
+
+ vld1.16 d0, [r0], r3 @0th row pi2_src_tmp[0]
+ ldr r4, [sp, #40] @Loads pred_strd
+
+ vld1.16 d1, [r0], r3 @I row pi2_src_tmp[0]
+ ldr r5, [sp, #44] @Loads *dst_strd
+
+ vld1.16 d2, [r0], r3 @II row pi2_src_tmp[0]
+
+ vld1.16 d3, [r0] @III row pi2_src_tmp[0]
+ ldr r7, [sp, #52] @Loads *pi4_tmp
+
+ vpush {d8-d15}
+
+    vtrn.16       d0, d1        @Transpose to gather all the 0th elements into a single D register
+ vtrn.16 d2, d3
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3 @D0 --> pi2_src_tmp[0], D1 --> pi2_src_tmp[1]
+ @D2 --> pi2_src_tmp[2], D3 --> pi2_src_tmp[3]
+
+ vaddl.s16 q3, d0, d2 @x0 = (pi2_src_tmp[0] + pi2_src_tmp[2])
+ vsubl.s16 q4, d0, d2 @x1 = (pi2_src_tmp[0] - pi2_src_tmp[2])
+ vshr.s16 d4, d1, #1 @pi2_src_tmp[1] >> 1
+ vshr.s16 d5, d3, #1 @pi2_src_tmp[3] >> 1
+
+ vsubl.s16 q5, d4, d3 @x2 = D_SHIFT(pi2_src_tmp[1],1,shft) - pi2_src_tmp[3]
+
+ vaddl.s16 q6, d1, d5 @x3 = pi2_src_tmp[1] + D_SHIFT(pi2_src_tmp[3],1,shft)
+
+ vadd.s32 q8, q4, q5 @x1 + x2
+ vsub.s32 q9, q4, q5 @x1 - x2
+
+ vadd.s32 q7, q3, q6 @x0 + x3
+ vsub.s32 q10, q3, q6 @x0 - x3
+
+ vtrn.32 q7, q8 @Transpose the register to have the adjacent values
+
+ vtrn.32 q9, q10
+ vadd.s32 d6, d14, d15 @x0(0,1) = (pi4_tblk[0,1] + pi4_tblk[8,9])
+
+ vsub.s32 d7, d14, d15 @x1(0,1) = (pi4_tblk[0,1] - pi4_tblk[8,9])
+
+ vshr.s32 d4, d16, #1 @pi4_tblk[4,5] >> 1
+ vshr.s32 d5, d17, #1 @pi4_tblk[12,13] >> 1
+
+ vsub.s32 d8, d4, d17 @x2(0,1) = D_SHIFT(pi4_tblk[4,5],1,shft) - pi4_tblk[12,13]
+ vadd.s32 d9, d16, d5 @x3(0,1) = pi4_tblk[4,5] + D_SHIFT(pi4_tblk[12,13],1,shft)
+
+ vadd.s32 d10, d18, d19 @x0(2,3) = (pi4_tblk[2,3] + pi4_tblk[10,11])
+ vsub.s32 d11, d18, d19 @x1(2,3) = (pi4_tblk[2,3] - pi4_tblk[10,11])
+ vshr.s32 d4, d20, #1 @pi4_tblk[6,7] >> 1
+ vshr.s32 d5, d21, #1 @pi4_tblk[14,15] >> 1
+
+ vld1.32 d30[0], [r1], r4 @I row Load pu1_pred buffer
+ vsub.s32 d12, d4, d21 @x2(2,3) = D_SHIFT(pi4_tblk[6,7],1,shft) - pi4_tblk[14,15]
+
+ vmovl.u8 q15, d30 @I row Convert 8 bit pred buffer to 16 bit
+ vadd.s32 d13, d20, d5 @x3(2,3) = pi4_tblk[6,7] + D_SHIFT(pi4_tblk[14,15],1,shft)
+
+ vadd.s32 d16, d6, d9 @I row i_macro(0,1) = x0(0,1) + x3(0,1)
+
+ vld1.32 d28[0], [r1], r4 @II row Load pu1_pred buffer
+ vadd.s32 d17, d10, d13 @I row i_macro(2,3) = x0(2,3) + x3(2,3)
+
+ vqrshrn.s32 d16, q8, #6 @I row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vmovl.u8 q14, d28 @II row Convert 8 bit pred buffer to 16 bit
+ vadd.u16 d16, d16, d30 @I row i_macro += *pu1_pred_tmp
+
+ vqmovun.s16 d16, q8 @I row CLIP_U8(i_macro)
+ vadd.s32 d18, d7, d8 @II row i_macro(0,1) = x1(0,1) + x2(0,1)
+
+ vld1.32 d26[0], [r1], r4 @III row Load pu1_pred buffer
+ vadd.s32 d19, d11, d12 @II row i_macro(2,3) = x1(2,3) + x2(2,3)
+
+ vqrshrn.s32 d18, q9, #6 @II row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vmovl.u8 q13, d26 @III row Convert 8 bit pred buffer to 16 bit
+ vadd.u16 d18, d18, d28 @II row i_macro += *pu1_pred_tmp
+
+ vst1.32 d16[0], [r2], r5 @I row store the value
+ vsub.s32 d20, d7, d8 @III row i_macro(0,1) = x1(0,1) - x2(0,1)
+
+ vqmovun.s16 d18, q9 @II row CLIP_U8(i_macro)
+ vsub.s32 d21, d11, d12 @III row i_macro(2,3) = x1(2,3) - x2(2,3)
+
+ vld1.32 d24[0], [r1], r4 @IV row Load pu1_pred buffer
+ vqrshrn.s32 d20, q10, #6 @III row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vmovl.u8 q12, d24 @IV row Convert 8 bit pred buffer to 16 bit
+ vadd.u16 d20, d20, d26 @III row i_macro += *pu1_pred_tmp
+
+ vqmovun.s16 d20, q10 @III row CLIP_U8(i_macro)
+ vsub.s32 d22, d6, d9 @IV row i_macro(0,1) = x0(0,1) - x3(0,1)
+
+ vst1.32 d18[0], [r2], r5 @II row store the value
+ vsub.s32 d23, d10, d13 @IV row i_macro(2,3) = x0(2,3) - x3(2,3)
+
+ vqrshrn.s32 d22, q11, #6 @IV row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vst1.32 d20[0], [r2], r5 @III row store the value
+ vadd.u16 d22, d22, d24 @IV row i_macro += *pu1_pred_tmp
+
+ vqmovun.s16 d22, q11 @IV row CLIP_U8(i_macro)
+ vst1.32 d22[0], [r2], r5 @IV row store the value
+
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+
+
diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s
new file mode 100755
index 0000000..2808897
--- /dev/null
+++ b/common/arm/ih264_mem_fns_neon.s
@@ -0,0 +1,268 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_mem_fns_neon.s
+@ *
+@ * @brief
+@ * Contains function definitions for memory manipulation
+@ *
+@ * @author
+@ * Naveen SR
+@ *
+@ * @par List of Functions:
+@ * - ih264_memcpy_mul_8_a9q()
+@ * - ih264_memcpy_a9q()
+@ * - ih264_memset_mul_8_a9q()
+@ * - ih264_memset_a9q()
+@ * - ih264_memset_16bit_mul_8_a9q()
+@ *     -  ih264_memset_16bit_a9q()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* memcpy of a 1d array
+@*
+@* @par Description:
+@*   Copies 8-bit data from source to destination, for 8, 16 or 32 bytes
+@*
+@* @param[in] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] num_bytes
+@* number of bytes to copy
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
+@ UWORD8 *pu1_src,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => *pu1_src
+@ r2 => num_bytes
+
+.text
+.p2align 2
+
+
+ .global ih264_memcpy_mul_8_a9q
+
+ih264_memcpy_mul_8_a9q:
+
+loop_neon_memcpy_mul_8:
+ @ Memcpy 8 bytes
+ vld1.8 d0, [r1]!
+ vst1.8 d0, [r0]!
+
+ subs r2, r2, #8
+ bne loop_neon_memcpy_mul_8
+ bx lr
+
+
+
+@*******************************************************************************
+@*/
+@void ih264_memcpy(UWORD8 *pu1_dst,
+@ UWORD8 *pu1_src,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => *pu1_src
+@ r2 => num_bytes
+
+
+
+ .global ih264_memcpy_a9q
+
+ih264_memcpy_a9q:
+ subs r2, #8
+ blt memcpy
+loop_neon_memcpy:
+ @ Memcpy 8 bytes
+ vld1.8 d0, [r1]!
+ vst1.8 d0, [r0]!
+
+ subs r2, #8
+ bge loop_neon_memcpy
+ cmp r2, #-8
+ bxeq lr
+
+memcpy:
+ add r2, #8
+
+loop_memcpy:
+ ldrb r3, [r1], #1
+ strb r3, [r0], #1
+ subs r2, #1
+ bne loop_memcpy
+ bx lr
+
+
+
+
+@void ih264_memset_mul_8(UWORD8 *pu1_dst,
+@ UWORD8 value,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => value
+@ r2 => num_bytes
+
+
+
+ .global ih264_memset_mul_8_a9q
+
+ih264_memset_mul_8_a9q:
+
+@ Assumptions: numbytes is either 8, 16 or 32
+ vdup.8 d0, r1
+loop_memset_mul_8:
+ @ Memset 8 bytes
+ vst1.8 d0, [r0]!
+
+ subs r2, r2, #8
+ bne loop_memset_mul_8
+
+ bx lr
+
+
+
+
+@void ih264_memset(UWORD8 *pu1_dst,
+@ UWORD8 value,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => value
+@ r2 => num_bytes
+
+
+
+ .global ih264_memset_a9q
+
+ih264_memset_a9q:
+ subs r2, #8
+ blt memset
+ vdup.8 d0, r1
+loop_neon_memset:
+    @ Memset 8 bytes
+ vst1.8 d0, [r0]!
+
+ subs r2, #8
+ bge loop_neon_memset
+ cmp r2, #-8
+ bxeq lr
+
+memset:
+ add r2, #8
+
+loop_memset:
+ strb r1, [r0], #1
+ subs r2, #1
+ bne loop_memset
+ bx lr
+
+
+
+
+@void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
+@ UWORD16 value,
+@ UWORD8 num_words)
+@**************Variables Vs Registers*************************
+@ r0 => *pu2_dst
+@ r1 => value
+@ r2 => num_words
+
+
+
+ .global ih264_memset_16bit_mul_8_a9q
+
+ih264_memset_16bit_mul_8_a9q:
+
+@ Assumptions: num_words is either 8, 16 or 32
+
+ @ Memset 8 words
+ vdup.16 d0, r1
+loop_memset_16bit_mul_8:
+ vst1.16 d0, [r0]!
+ vst1.16 d0, [r0]!
+
+ subs r2, r2, #8
+ bne loop_memset_16bit_mul_8
+
+ bx lr
+
+
+
+
+@void ih264_memset_16bit(UWORD16 *pu2_dst,
+@ UWORD16 value,
+@ UWORD8 num_words)
+@**************Variables Vs Registers*************************
+@ r0 => *pu2_dst
+@ r1 => value
+@ r2 => num_words
+
+
+
+ .global ih264_memset_16bit_a9q
+
+ih264_memset_16bit_a9q:
+ subs r2, #8
+ blt memset_16bit
+ vdup.16 d0, r1
+loop_neon_memset_16bit:
+ @ Memset 8 words
+ vst1.16 d0, [r0]!
+ vst1.16 d0, [r0]!
+
+ subs r2, #8
+ bge loop_neon_memset_16bit
+ cmp r2, #-8
+ bxeq lr
+
+memset_16bit:
+ add r2, #8
+
+loop_memset_16bit:
+ strh r1, [r0], #2
+ subs r2, #1
+ bne loop_memset_16bit
+ bx lr
+
+
+
+
diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s
new file mode 100755
index 0000000..9bab268
--- /dev/null
+++ b/common/arm/ih264_padding_neon.s
@@ -0,0 +1,646 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_padding_neon.s
+@ *
+@ * @brief
+@ * Contains function definitions padding
+@ *
+@ * @author
+@ * Ittiam
+@ *
+@ * @par List of Functions:
+@ * - ih264_pad_top_a9q()
+@ * - ih264_pad_left_luma_a9q()
+@ * - ih264_pad_left_chroma_a9q()
+@ * - ih264_pad_right_luma_a9q()
+@ * - ih264_pad_right_chroma_a9q()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief pad at the top of a 2d array
+@*
+@* @par Description:
+@* The top row of a 2d array is replicated pad_size times at the top
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns none
+@*
+@* @remarks none
+@*
+@*******************************************************************************
+@*/
+@void ih264_pad_top(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 wd,
+@ WORD32 pad_size)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => wd
+@ r3 => pad_size
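+
+@ Equivalent C (a small illustrative sketch of what the NEON loop below does):
+@
+@     for(row = 1; row <= pad_size; row++)
+@         memcpy(pu1_src - row * src_strd, pu1_src, wd);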
+
+.text
+.p2align 2
+
+ .global ih264_pad_top_a9q
+
+ih264_pad_top_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ sub r5, r0, r1
+ rsb r6, r1, #0
+
+loop_neon_memcpy_mul_16:
+ @ Load 16 bytes
+ vld1.8 {d0, d1}, [r0]!
+ mov r4, r5
+ mov r7, r3
+ add r5, r5, #16
+
+loop_neon_pad_top:
+ vst1.8 {d0, d1}, [r4], r6
+ subs r7, r7, #1
+ bne loop_neon_pad_top
+
+ subs r2, r2, #16
+ bne loop_neon_memcpy_mul_16
+
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (luma block) at the left of a 2d array
+@*
+@* @par Description:
+@* The left column of a 2d array is replicated pad_size times at the left
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_LEFT_LUMA == C
+@void ih264_pad_left_luma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
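+
+@ Equivalent C (illustrative sketch):
+@
+@     for(row = 0; row < ht; row++)
+@     {
+@         memset(pu1_src - pad_size, pu1_src[0], pad_size);
+@         pu1_src += src_strd;
+@     }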
+
+
+ .global ih264_pad_left_luma_a9q
+
+ih264_pad_left_luma_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+
+ sub r4, r0, r3
+ sub r6, r1, #16
+ subs r5, r3, #16
+ bne loop_32
+loop_16: @ /*hard coded for width=16 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ subs r2, r2, #8
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ bne loop_16
+ b end_func
+
+loop_32: @ /*hard coded for width=32 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vdup.u8 q0, r8
+ ldrb r9, [r0], r1
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #8
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ bne loop_32
+
+
+
+end_func:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (chroma block) at the left of a 2d array
+@*
+@* @par Description:
+@* The left column of a 2d array is replicated pad_size times at the left
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array (each colour component)
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_LEFT_CHROMA == C
+@void ih264_pad_left_chroma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@{
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
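+
+@ Equivalent C (illustrative sketch; the first UV pair of each row is replicated
+@ across the interleaved left margin):
+@
+@     for(row = 0; row < ht; row++)
+@     {
+@         for(col = -pad_size; col < 0; col += 2)
+@         {
+@             pu1_src[col]     = pu1_src[0];
+@             pu1_src[col + 1] = pu1_src[1];
+@         }
+@         pu1_src += src_strd;
+@     }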
+
+
+
+ .global ih264_pad_left_chroma_a9q
+
+ih264_pad_left_chroma_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ sub r4, r0, r3
+ sub r6, r1, #16
+
+
+loop_32_l_c: @ /*hard coded for width=32 ,height =4,8,12*/
+ ldrh r8, [r0], r1
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+
+ beq end_func_l_c @/* Branching when ht=4*/
+
+ ldrh r8, [r0], r1
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+ beq end_func_l_c @/* Branching when ht=8*/
+ bne loop_32_l_c
+
+ ldrh r8, [r0], r1
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+end_func_l_c:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (luma block) at the right of a 2d array
+@*
+@* @par Description:
+@* The right column of a 2d array is replicated pad_size times at the right
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_RIGHT_LUMA == C
+@void ih264_pad_right_luma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@{
+@ WORD32 row;
+@
+@ for(row = 0; row < ht; row++)
+@ {
+@ memset(pu1_src, *(pu1_src -1), pad_size);
+@
+@ pu1_src += src_strd;
+@ }
+@}
+@
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
+
+
+
+ .global ih264_pad_right_luma_a9q
+
+ih264_pad_right_luma_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ mov r4, r0
+ sub r6, r1, #16
+ sub r0, r0, #1
+ subs r5, r3, #16
+    bne           loop_32_r
+loop_16_r: @ /*hard coded for width=16 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ subs r2, r2, #8
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ bne loop_16_r
+ b end_func_r
+
+loop_32_r: @ /*hard coded for width=32 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #8
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ bne loop_32_r
+
+
+
+end_func_r:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (chroma block) at the right of a 2d array
+@*
+@* @par Description:
+@* The right column of a 2d array is replicated pad_size times at the right
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array (each colour component)
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_RIGHT_CHROMA == C
+@void ih264_pad_right_chroma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
+
+
+
+ .global ih264_pad_right_chroma_a9q
+
+ih264_pad_right_chroma_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ mov r4, r0
+ sub r6, r1, #16
+ sub r0, r0, #2
+loop_32_r_c: @ /*hard coded for width=32 ,height =8,4*/
+ ldrh r8, [r0], r1
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4
+ ldrh r11, [r0], r1
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+ beq end_func_r_c @/* Branching when ht=4*/
+
+ ldrh r8, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r9, [r0], r1
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+ beq end_func_r_c @/* Branching when ht=8*/
+ bne loop_32_r_c
+
+ ldrh r8, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r9, [r0], r1
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+end_func_r_c:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
diff --git a/common/arm/ih264_platform_macros.h b/common/arm/ih264_platform_macros.h
new file mode 100755
index 0000000..1f67403
--- /dev/null
+++ b/common/arm/ih264_platform_macros.h
@@ -0,0 +1,152 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_platform_macros.h
+*
+* @brief
+* Platform specific Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_PLATFORM_MACROS_H_
+#define _IHEVC_PLATFORM_MACROS_H_
+
+#ifndef ARMV8
+void ih264_arm_dsb(void);
+
+#define DATA_SYNC() ih264_arm_dsb()
+static __inline WORD32 CLIP_U8(WORD32 x)
+{
+ asm("usat %0, #8, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S8(WORD32 x)
+{
+ asm("ssat %0, #8, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U10(WORD32 x)
+{
+ asm("usat %0, #10, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S10(WORD32 x)
+{
+ asm("ssat %0, #10, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U12(WORD32 x)
+{
+ asm("usat %0, #12, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S12(WORD32 x)
+{
+ asm("ssat %0, #12, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U16(WORD32 x)
+{
+ asm("usat %0, #16, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+static __inline WORD32 CLIP_S16(WORD32 x)
+{
+ asm("ssat %0, #16, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+
+static __inline UWORD32 ITT_BIG_ENDIAN(UWORD32 x)
+{
+ asm("rev %0, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+#else
+#define DATA_SYNC() ;
+
+#define CLIP_U8(x) CLIP3(0, 255, (x))
+#define CLIP_S8(x) CLIP3(-128, 127, (x))
+
+#define CLIP_U10(x) CLIP3(0, 1023, (x))
+#define CLIP_S10(x) CLIP3(-512, 511, (x))
+
+#define CLIP_U12(x) CLIP3(0, 4095, (x))
+#define CLIP_S12(x) CLIP3(-2048, 2047, (x))
+
+#define CLIP_U16(x) CLIP3(0, 65535, (x))
+#define CLIP_S16(x) CLIP3(-32768, 32767, (x))
+
+#define ITT_BIG_ENDIAN(x) ((x & 0x000000ff) << 24) | \
+ ((x & 0x0000ff00) << 8) | \
+ ((x & 0x00ff0000) >> 8) | \
+ ((UWORD32)x >> 24);
+#endif
+
+#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0)
+#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0)
+
+#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift)))
+#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift))
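+
+/* Illustrative usage note: these helpers fold both shift directions into one
+   expression, e.g. SHR_NEG(x, 3) == (x >> 3) while SHR_NEG(x, -2) == (x << 2),
+   which is convenient when a shift amount such as (qp/6 - k) can change sign. */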
+
+#define INLINE inline
+
+static INLINE UWORD32 CLZ(UWORD32 u4_word)
+{
+ if(u4_word)
+ return (__builtin_clz(u4_word));
+ else
+ return 32;
+}
+static INLINE UWORD32 CTZ(UWORD32 u4_word)
+{
+ if(0 == u4_word)
+ return 31;
+ else
+ {
+ unsigned int index;
+ index = __builtin_ctz(u4_word);
+ return (UWORD32)index;
+ }
+}
+
+
+#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);}
+
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* _IHEVC_PLATFORM_MACROS_H_ */
diff --git a/common/arm/ih264_resi_trans_a9.s b/common/arm/ih264_resi_trans_a9.s
new file mode 100755
index 0000000..08821f5
--- /dev/null
+++ b/common/arm/ih264_resi_trans_a9.s
@@ -0,0 +1,604 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@*******************************************************************************
+@* @file
+@* ih264_resi_trans_a9.s
+@*
+@* @brief
+@* Contains function definitions for residual and forward trans
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@* ih264_resi_trans_4x4_a9
+@* ih264_resi_trans_8x8_a9
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+
+
+.text
+.p2align 2
+@*****************************************************************************
+@*
+@* Function Name : ih264_resi_trans_4x4_a9
+@* Description       : This function does cf4 of H264 followed by an approximate scaling
+@*
+@* Arguments :
+@ R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :src_stride
+@ STACk :pred_stride,dst_stride
+
+@* Values Returned : NONE
+@*
+@* Register Usage :
+@* Stack Usage :
+@* Cycles : Around
+@* Interruptibility  : Interruptible
+@*
+@* Known Limitations
+@* \Assumptions :
+@*
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 30 12 2009 100633 First version
+@*
+@*****************************************************************************
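+
+@ Rough C outline of what follows (an illustrative sketch; the residue is computed
+@ first, the 1-D core is applied to rows and then to columns, and each coefficient
+@ is finally widened and multiplied by an entry of g_scal_coff_h264_4x4):
+@
+@     /* residue */            d[i][j] = pu1_src[i][j] - pu1_pred[i][j];
+@
+@     /* 1-D forward core on r[0..3] */
+@     x0 = r[0] + r[3];        x1 = r[1] + r[2];
+@     x2 = r[1] - r[2];        x3 = r[0] - r[3];
+@     o[0] = x0 + x1;          o[1] = (x3 << 1) + x2;
+@     o[2] = x0 - x1;          o[3] = x3 - (x2 << 1);
+@
+@     /* scaling */            pi4_dst[i] = (WORD32)coeff[i] * g_scal_coff_h264_4x4[i];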
+
+
+ .global ih264_resi_trans_4x4_a9
+ .extern g_scal_coff_h264_4x4
+g_scal_coff_h264_4x4_addr:
+ .long g_scal_coff_h264_4x4 - 4x4lbl - 8
+
+ih264_resi_trans_4x4_a9:
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :src_stride
+ @STACk :pred_stride,dst_stride
+
+ push {r4-r12, lr} @push all the variables first
+
+ mov r6, sp
+    add           r6, r6, #40         @point r6 to the stack arguments (10 registers pushed = 40 bytes)
+ ldmfd r6, {r4-r5} @load the strides into registers
+ @R4 pred_stride
+ @R5 dst_stride
+
+
+    @the strides are given as post increments to vld1
+    @note: the dst holds 32 bit values, so the dst stride is scaled by 4 further below
+ @--------------------function loading done------------------------
+
+ @lets find residual
+ @data is like 1a -> d0[1:31] d0[32:64]
+ @ a b c d # # # #
+ vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer
+ vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer
+ @ data is like 1a -> q4[1:63] q4[64:148]
+ @ d8[1:63] d9[1:63]
+ @ a b c d # # # #
+
+ vld1.u8 d28, [r0], r3 @load row 2 of src to d28[0]
+ vld1.u8 d29, [r1], r4 @load row2 of pred to d29[0]
+
+ vld1.u8 d26, [r0], r3 @load row 3 of src to d26[0]
+ vsubl.u8 q0, d30, d31 @curr - pred for row one
+
+ vld1.u8 d27, [r1], r4 @load row 3of pred t0 d27[0]
+ vsubl.u8 q1, d28, d29 @find row 2 of src -pred to d0
+
+ vld1.u8 d24, [r0], r3 @load row 4 of src to d24[0]
+
+ vld1.u8 d25, [r1], r4 @load row 4 of src tp d25[0]
+ vsubl.u8 q2, d26, d27 @load src-pred row 3 to d[2]
+
+    lsl           r5, r5, #2          @ multiply dst stride by 4 since we are storing 32 bit values
+ ldr r6, g_scal_coff_h264_4x4_addr
+4x4lbl:
+ add r6, r6, pc @ load the address of global array
+
+ vsubl.u8 q3, d24, d25 @load row 4 of src - pred to q6
+
+ @after this
+ @D0 -> 1a
+ @D2 -> 2a
+ @D4 -> 3a
+ @D6 -> 4a
+
+ @transpose the matrix so that we can do the horizontal transform first
+ @#1 #2 #3 #4
+ @a b c d ---- D0
+ @e f g h -----D2
+ @i j k l -----D4
+ @m n o p -----D6
+ @transpose the inner 2x2 blocks
+ vtrn.16 d0, d2
+ vld1.s16 {q10}, [r6]! @ load the scaling values 0-7;
+ vtrn.16 d4, d6
+ @a e c g
+ @b f d h
+ @i m k o
+ @j n l p
+ vtrn.32 d0, d4
+ vtrn.32 d2, d6
+ @a e i m #1 -- D0 --- x4
+ @b f j n #2 -- D2 --- x5
+ @c g k o #3 -- D4 ----x6
+ @d h l p #4 -- D6 ----x7
+
+ @we have loaded the residuals into the registers , now we need to add and subtract them
+ @let us do the horiz transform first
+
+ vsub.s16 d5, d2, d4 @x2 = x5-x6
+ vsub.s16 d7, d0, d6 @x3 = x4-x7;
+
+ vadd.s16 d3, d2, d4 @x1 = x5+x6
+ vadd.s16 d1, d0, d6 @x0 = x4+x7
+
+
+ vshl.s16 d31, d7, #1 @
+ vshl.s16 d30, d5, #1 @
+
+ vadd.s16 d0, d1, d3 @x0 + x1;
+ vsub.s16 d4, d1, d3 @x0 - x1;
+
+ vadd.s16 d2, d31, d5 @U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d6, d7, d30 @x3 - U_SHIFT(x2,1,shft);
+
+ @taking transform again so as to make do vert transform
+ vtrn.16 d0, d2
+ vtrn.16 d4, d6
+
+ vtrn.32 d0, d4
+ vtrn.32 d2, d6
+
+ @let us do vertical transform
+ @same code as horiz
+
+ vadd.s16 d1, d0, d6 @x0 = x4+x7
+ vadd.s16 d3, d2, d4 @x1 = x5+x6
+ vsub.s16 d7, d0, d6 @x3 = x4-x7;
+ vsub.s16 d5, d2, d4 @x2 = x5-x6
+
+
+@Since the scaling multiplies 16 bit coefficients by 16 bit factors, the products
+@are produced directly as 32 bit values with vmull below
+
+ @VADDL.S16 Q12,D1,D3;x0 + x1
+ @VSUBL.S16 Q14,D1,D3;x0 - x1
+
+ @VSHL.S16 D8,D5,#1;
+ @VSHL.S16 D9,D7,#1;
+
+ @VADDL.S16 Q13,D9,D5 ; + x2
+ @VSUBL.S16 Q15,D7,D8 ;x3 - U_SHIFT(x2,1,shft)
+
+@scaling follows
+
+@now we need to do the scaling, so load the scaling matrix
+@multiply by the scaling coefficients; the results are kept in q4-q7
+
+ vadd.s16 d24, d3, d1 @x4 = x0 + x1
+ vsub.s16 d28, d1, d3 @x6 = x0 - x1
+
+ vshl.s16 d0, d7, #1 @ U_SHIFT(x3,1,shft)
+ vmull.s16 q4, d24, d20 @x4*s0
+
+ vshl.s16 d2, d5, #1 @ U_SHIFT(x2,1,shft)
+
+ vadd.s16 d26, d0, d5 @x5 = U_SHIFT(x3,1,shft) + x2
+ vmull.s16 q5, d26, d21 @x5*s1
+
+ vst1.s32 {q4}, [r2], r5 @save 4 pixels of row1 current buffer and increment pointer by stride
+
+ vld1.s16 {q10}, [r6] @load 8-16 scaling coeffcients
+
+ vsub.s16 d30, d7, d2 @x7 = x3 - U_SHIFT(x2,1,shft)
+
+ vmull.s16 q6, d28, d20 @x6*s2
+ vst1.s32 {q5}, [r2], r5
+
+ vmull.s16 q7, d30, d21 @x7*s3
+
+
+ vst1.s32 {q6}, [r2], r5
+ vst1.s32 {q7}, [r2]
+
+ pop {r4-r12, pc} @pop back all variables
+
+
+
+
+@*****************************************************************************
+@* Function Name : ih264_resi_trans_8x8_a9
+@* Description       : This function does cf8 followed by an approximate normalization of H264
+@*
+@* Arguments :
+@* R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :src_stride
+@                  STACK :pred_stride, dst_stride
+@*
+@*
+@* Values Returned : NONE
+@*
+@* Register Usage :
+@* Stack Usage :
+@* Cycles : Around
+@* Interruptibility  : Interruptible
+@*
+@* Known Limitations
+@* \Assumptions :
+@*
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 30 12 2009 100633 First version
+@*
+@*****************************************************************************
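+
+@ The 1-D 8-point stage used twice below, written out in C for reference
+@ (illustrative sketch matching the per-instruction comments in the code):
+@
+@     a0 = r0 + r7;  a1 = r1 + r6;  a2 = r2 + r5;  a3 = r3 + r4;
+@     b0 = r0 - r7;  b1 = r1 - r6;  b2 = r2 - r5;  b3 = r3 - r4;
+@     a4 = a0 + a3;  a5 = a1 + a2;  a6 = a0 - a3;  a7 = a1 - a2;
+@     o0 = a4 + a5;              o2 = a6 + (a7 >> 1);
+@     o4 = a4 - a5;              o6 = (a6 >> 1) - a7;
+@     b4 = b1 + b2 + ((b0 >> 1) + b0);
+@     b5 = b0 - b3 - ((b2 >> 1) + b2);
+@     b6 = b0 + b3 - ((b1 >> 1) + b1);
+@     b7 = b1 - b2 + ((b3 >> 1) + b3);
+@     o1 = b4 + (b7 >> 2);       o3 = b5 + (b6 >> 2);
+@     o5 = b6 - (b5 >> 2);       o7 = (b4 >> 2) - b7;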
+
+
+ .global ih264_resi_trans_8x8_a9
+ .extern g_scal_coff_h264_8x8
+g_scal_coff_h264_8x8_addr:
+ .long g_scal_coff_h264_8x8 - 8x8lbl - 8
+
+
+ih264_resi_trans_8x8_a9:
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :src_stride
+ @STACk :pred_stride,dst_stride
+
+ push {r4-r12, lr} @push all the variables first
+
+ mov r6, sp
+    add           r6, r6, #40         @point r6 to the stack arguments (10 registers pushed = 40 bytes)
+ ldmfd r6, {r4-r5} @load the strides into registers
+ @R4 pred_stride
+ @R5 dst_stride
+
+    @the stride is given as a post increment to vst1
+    @the dst holds 32 bit values, so the dst stride is scaled by 4 below
+ @--------------------function loading done------------------------
+
+ @lets find residual
+ @data is like 1a -> d0[1:31] d0[32:64]
+ @ a b c d # # # #
+ vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer
+ vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer
+
+ vld1.u8 d28, [r0], r3 @src rw2
+ vld1.u8 d29, [r1], r4 @pred rw2
+ vsubl.u8 q0, d30, d31 @src-pred rw1
+
+ vld1.u8 d26, [r0], r3
+ vld1.u8 d27, [r1], r4
+ vsubl.u8 q1, d28, d29
+
+ vld1.u8 d24, [r0], r3
+ vld1.u8 d25, [r1], r4
+ vsubl.u8 q2, d26, d27
+
+ vld1.u8 d22, [r0], r3
+ vld1.u8 d23, [r1], r4
+ vsubl.u8 q3, d24, d25
+
+ vld1.u8 d20, [r0], r3
+ vld1.u8 d21, [r1], r4
+ vsubl.u8 q4, d22, d23
+
+ vld1.u8 d18, [r0], r3
+ vld1.u8 d19, [r1], r4
+ vsubl.u8 q5, d20, d21
+
+ vld1.u8 d16, [r0], r3
+ vld1.u8 d17, [r1], r4
+ vsubl.u8 q6, d18, d19
+
+ lsl r5, r5, #2
+
+
+ vsubl.u8 q7, d16, d17
+
+ @after this
+ @Q0 -> 1a
+ @Q1 -> 2a
+ @Q2 -> 3a
+ @Q3 -> 4a
+ @Q4 -> 5a
+ @Q5 -> 6a
+ @Q6 -> 7a
+ @Q7 -> 8a
+
+ @transpose the matrix so that we can do the horizontal transform first
+
+ @transpose the inner 2x2 blocks
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ @transpose the inner 4x4 blocks
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+
+ @transpose the outer 8x8 blocks
+ vswp d1, d8
+ vswp d7, d14
+ vswp d3, d10
+ vswp d5, d12
+ @transpose done
+
+@@at this point we will have data in Q0-Q7
+@Q7 will be populated within 2 clock cycles
+@all others are available at this clock cycle
+
+ @we have loaded the residuals into the registers , now we need to add and subtract them
+ @let us do the horiz transform first
+
+ vadd.s16 q8, q0, q7 @ a0 = r0 + r7;
+ vadd.s16 q9, q1, q6 @ a1 = r1 + r6;
+ vadd.s16 q10, q2, q5 @ a2 = r2 + r5;
+ vadd.s16 q11, q3, q4 @ a3 = r3 + r4;
+
+ vsub.s16 q12, q0, q7 @ b0 = r0 - r7;
+ vsub.s16 q13, q1, q6 @ b1 = r1 - r6;
+ vsub.s16 q15, q3, q4 @ b3 = r3 - r4;
+ vsub.s16 q14, q2, q5 @ b2 = r2 - r5;
+
+ vadd.s16 q1, q8, q11 @ a4 = a0 + a3;
+ vadd.s16 q3, q9, q10 @ a5 = a1 + a2;
+ vsub.s16 q7, q9, q10 @ a7 = a1 - a2;
+ vsub.s16 q5, q8, q11 @ a6 = a0 - a3;
+
+ ldr r6, g_scal_coff_h264_8x8_addr
+8x8lbl:
+ add r6, r6, pc @ load the address of global array
+
+ vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5;
+ vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
+
+ vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5;
+
+ vadd.s16 q2, q5, q8 @
+
+
+ vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
+ vsub.s16 q6, q9, q7 @
+
+@do not change Q0,Q2.Q4,Q6 they contain results
+@Q1,Q3,Q5,Q7 TO STORE RESULTS
+@Q8 Q9 Q10 Q11 USE @WILL
+
+ vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft)
+ vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft)
+ vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft)
+ vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft)
+
+ vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0);
+ vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0);
+ vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2);
+ vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
+ vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft)
+ vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft);
+ vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft);
+ vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft);
+
+
+ vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
+ vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
+ vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
+ vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
+
+ @------------horiz transform done-------------------------
+ @results are in Q0-Q7
+ @all other neon registes can be used at will
+
+@doing vertical transform
+@code exact copy of horiz transform above
+
+ @transpose the inner 2x2 blocks
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ @transpose the inner 4x4 blocks
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+
+ @transpose the outer 8x8 blocks
+ vswp d1, d8
+ vswp d3, d10
+ vswp d5, d12
+ vswp d7, d14
+
+ @transpose done
+
+ vadd.s16 q8, q0, q7 @ a0 = r0 + r7;
+ vadd.s16 q9, q1, q6 @ a1 = r1 + r6;
+ vadd.s16 q10, q2, q5 @ a2 = r2 + r5;
+ vadd.s16 q11, q3, q4 @ a3 = r3 + r4;
+
+ vsub.s16 q12, q0, q7 @ b0 = r0 - r7;
+ vsub.s16 q13, q1, q6 @ b1 = r1 - r6;
+ vsub.s16 q14, q2, q5 @ b2 = r2 - r5;
+ vsub.s16 q15, q3, q4 @ b3 = r3 - r4;
+
+ vadd.s16 q1, q8, q11 @ a4 = a0 + a3;
+ vadd.s16 q3, q9, q10 @ a5 = a1 + a2;
+ vsub.s16 q5, q8, q11 @ a6 = a0 - a3;
+ vsub.s16 q7, q9, q10 @ a7 = a1 - a2;
+
+
+ vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5;
+
+ vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
+ @DSHIFT_TO_0 Q8,Q7,#1,#0
+ vadd.s16 q2, q5, q8 @
+
+ vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5;
+
+ vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
+ vsub.s16 q6, q9, q7 @
+
+@do not change Q0,Q2.Q4,Q6 they contain results
+@Q1,Q3,Q5,Q7 TO STORE RESULTS
+@Q8 Q9 Q10 Q11 USE @WILL
+
+ vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft)
+ vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft)
+ vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft)
+ vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft)
+
+
+ vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0);
+ vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0);
+ vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2);
+ vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1);
+ vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
+ vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft)
+ vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft);
+ vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft);
+ vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft);
+
+
+@since we are going to scale by small values, the results need not be widened to 32 bit here
+ vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
+ vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
+ vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
+ vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
+
+ @------------vert transform done-------------------------
+ @results are in Q0-Q7
+ @all other neon registes can be used at will
+
+ @scaling
+ @since the 8x8 scaling matrix repeats in 1x4,1x4 block ,
+ @we need only load 4 values for each row and in total 4 rows
+ vld1.s16 {q14-q15}, [r6] @
+
+ @since we need to get a 32 bit o/p for two 16 bit multiplications
+ @we need a VMULL instruction
+@-----------------------------first and second row
+
+    vmull.s16     q8, d0, d28         @scale the first row first 4 elems
+    vmull.s16     q9, d28, d1         @scale the first row last 4 elems
+
+ vmull.s16 q10, d2, d29 @ scale second row first 4 elem
+ vmull.s16 q11, d29, d3 @scale the second row last 4 elem
+ vmull.s16 q12, d4, d30 @scale third row first 4 elem
+
+ vst1.s32 {q8, q9}, [r2], r5 @ write the first row complete
+
+ vmull.s16 q13, d30, d5 @scale the third row last 4 elem
+ vmull.s16 q8, d6, d31 @scale the fourth row first 4 elem
+
+
+ vst1.s32 {q10, q11}, [r2], r5 @store the second row complete
+
+@------------------------------- 3rd and 4th row
+
+    vmull.s16     q9, d31, d7         @scale the fourth row last 4 elems
+
+ vst1.s32 {q12, q13}, [r2], r5 @store the third row complete
+
+    vmull.s16     q10, d8, d28        @scale the 5th row first 4 elems
+    vmull.s16     q11, d28, d9        @scale the 5th row last 4 elems
+
+    vmull.s16     q12, d10, d29       @scale the 6th row first 4 elems
+
+
+    vst1.s32      {q8, q9}, [r2], r5  @store the fourth row complete
+
+@--------------------------------5th and 6th row
+
+    vmull.s16     q13, d29, d11       @scale the 6th row last 4 elems
+
+    vmull.s16     q8, d12, d30        @scale the 7th row first 4 elems
+
+    vst1.s32      {q10, q11}, [r2], r5 @store the fifth row complete
+
+    vmull.s16     q9, d30, d13        @scale the 7th row last 4 elems
+    vmull.s16     q10, d14, d31       @scale the 8th row first 4 elems
+
+
+ vst1.s32 {q12, q13}, [r2], r5 @store 6th row
+
+@----------------------------------7th and 8th row
+ vmull.s16 q11, d31, d15 @scale 8th row second 4 elms
+
+ vst1.s32 {q8, q9}, [r2], r5 @store 7th row
+ vst1.s32 {q10, q11}, [r2], r5 @store 8th row
+
+@----------------------------------done writing
+
+ pop {r4-r12, pc} @pop back all variables
+
+
+
+
+
+
diff --git a/common/arm/ih264_resi_trans_quant_a9.s b/common/arm/ih264_resi_trans_quant_a9.s
new file mode 100755
index 0000000..caf362e
--- /dev/null
+++ b/common/arm/ih264_resi_trans_quant_a9.s
@@ -0,0 +1,694 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@*******************************************************************************
+@* @file
+@* ih264_resi_trans_quant_a9.s
+@*
+@* @brief
+@* Contains function definitions for residual and forward trans
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@* ih264_resi_trans_quant_4x4_a9
+@* ih264_resi_trans_quant_8x8_a9
+@* ih264_resi_trans_quant_chroma_4x4_a9
+@* ih264_hadamard_quant_4x4_a9
+@* ih264_hadamard_quant_2x2_uv_a9
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+
+
+.text
+.p2align 2
+@*****************************************************************************
+@*
+@* Function Name : ih264_resi_trans_quant_4x4_a9
+@* Description : This function does cf4 of H264
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :source stride
+@ STACK : pred stride,
+@ dst stride,
+@ pointer to scaling matrix,
+@ pointer to threshold matrix,
+@ qbits,
+@ rounding factor,
+@ pointer to store nnz
+@ pointer to store non quantized dc value
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 40 bytes
+@ Cycles : Around
+@ Interruptibility  : Interruptible
+@
+@ Known Limitations
+@ \Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 1 12 2013 100633 First version
+@ 20 1 2014 100633 Changes the API, Optimization
+@
+@*****************************************************************************
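+
+@ After the forward transform, each coefficient is quantized roughly as below
+@ (illustrative C sketch; the names are placeholders for the stack arguments
+@ listed above, not the library's C reference):
+@
+@     sign  = (coeff < 0);
+@     level = (ABS(coeff) * pu2_scale_matrix[i] + u4_round_factor) >> u4_qbits;
+@     pi2_out[i] = sign ? -level : level;
+@     /* nnz is 16 minus the count of zero levels; the transform-domain DC is
+@        also stored, before quantization, to the alternate dc address        */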
+
+ .global ih264_resi_trans_quant_4x4_a9
+ih264_resi_trans_quant_4x4_a9:
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+    @STACK :pred stride
+    @      :scale matrix,
+ @ :threshold matrix
+ @ :qbits
+ @ :round factor
+ @ :nnz
+
+ push {r4-r12, lr} @push all the variables first
+
+    add           r11, sp, #40        @point r11 to the stack arguments (10 registers pushed = 40 bytes)
+ ldmfd r11, {r4-r10} @load the strides into registers
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+ @R4 :Pred stride
+    @R5 :scale matrix,
+ @R6 :threshold matrix
+ @R7 :qbits
+ @R8 :round factor
+ @R9 :nnz
+
+ vpush {d8-d15}
+
+ mov r11, #0
+    sub           r7, r11, r7         @Negate the qbit value for using LSL
+
+    @------------Function loading done----------------;
+
+ vld1.u8 d30, [r0], r3 @load first 8 pix src row 1
+
+ vld1.u8 d31, [r1], r4 @load first 8 pix pred row 1
+
+ vld1.u8 d28, [r0], r3 @load first 8 pix src row 2
+
+ vld1.u8 d29, [r1], r4 @load first 8 pix pred row 2
+
+ vld1.u8 d26, [r0], r3 @load first 8 pix src row 3
+
+ vld1.u8 d27, [r1], r4 @load first 8 pix pred row 3
+ vsubl.u8 q0, d30, d31 @find residue row 1
+
+ vld1.u8 d24, [r0], r3 @load first 8 pix src row 4
+
+ vld1.u8 d25, [r1], r4 @load first 8 pix pred row 4
+ vsubl.u8 q1, d28, d29 @find residue row 2
+
+ vsubl.u8 q2, d26, d27 @find residue row 3
+ vsubl.u8 q3, d24, d25 @find residue row 4
+
+ vtrn.16 d0, d2 @T12
+ vtrn.16 d4, d6 @T23
+ vtrn.32 d0, d4 @T13
+ vtrn.32 d2, d6 @T14
+
+ vadd.s16 d8 , d0, d6 @x0 = x4+x7
+ vadd.s16 d9 , d2, d4 @x1 = x5+x6
+ vsub.s16 d10, d2, d4 @x2 = x5-x6
+ vsub.s16 d11, d0, d6 @x3 = x4-x7
+
+ vshl.s16 d12, d10, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d13, d11, #1 @U_SHIFT(x3,1,shft)
+
+ vadd.s16 d14, d8, d9 @x4 = x0 + x1;
+ vsub.s16 d16, d8, d9 @x6 = x0 - x1;
+ vadd.s16 d15, d13, d10 @x5 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d17, d11, d12 @x7 = x3 - U_SHIFT(x2,1,shft);
+
+ @taking transpose again so as to make do vert transform
+ vtrn.16 d14, d15 @T12
+ vtrn.16 d16, d17 @T23
+ vtrn.32 d14, d16 @T13
+ vtrn.32 d15, d17 @T24
+
+ @let us do vertical transform
+ @same code as horiz
+ vadd.s16 d18, d14, d17 @x0 = x4+x7
+ vadd.s16 d19, d15, d16 @x1 = x5+x6
+ vsub.s16 d20, d15, d16 @x2 = x5-x6
+ vsub.s16 d21, d14, d17 @x3 = x4-x7
+
+ vshl.s16 d22, d20, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d23, d21, #1 @U_SHIFT(x3,1,shft)
+
+ vdup.s32 q4, r8 @Load rounding value row 1
+
+ vadd.s16 d24, d18, d19 @x5 = x0 + x1;
+ vsub.s16 d26, d18, d19 @x7 = x0 - x1;
+ vadd.s16 d25, d23, d20 @x6 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d27, d21, d22 @x8 = x3 - U_SHIFT(x2,1,shft);
+ vdup.s32 q10, r7 @Load qbit values
+
+    vst1.s16      d24[0], [r10]       @Store the dc value to the alternate dc address
+
+@core tranform is done for 4x8 block 1
+ vld1.s16 {q14-q15}, [r5] @load the scaling values
+
+ vabs.s16 q0, q12 @Abs val of row 1 blk 1
+
+ vabs.s16 q1, q13 @Abs val of row 2 blk 1
+
+ vmov.s32 q5, q4 @copy round fact for row 2
+
+ vmov.s32 q6, q4 @copy round fact for row 2
+ vclt.s16 q2, q12, #0 @Get the sign of row 1 blk 1
+
+ vmov.s32 q7, q4 @copy round fact for row 2
+ vclt.s16 q3, q13, #0 @Get the sign of row 2 blk 1
+
+ vmlal.s16 q4, d0, d28 @Multiply and add row 1
+ vmlal.s16 q5, d1, d29 @Multiply and add row 2
+ vmlal.s16 q6, d2, d30 @Multiply and add row 3
+ vmlal.s16 q7, d3, d31 @Multiply and add row 4
+
+ vshl.s32 q11, q4, q10 @Shift row 1
+ vshl.s32 q12, q5, q10 @Shift row 2
+ vshl.s32 q13, q6, q10 @Shift row 3
+ vshl.s32 q14, q7, q10 @Shift row 4
+
+ vmovn.s32 d30, q11 @Narrow row 1
+ vmovn.s32 d31, q12 @Narrow row 2
+ vmovn.s32 d0 , q13 @Narrow row 3
+ vmovn.s32 d1 , q14 @Narrow row 4
+
+ vneg.s16 q1, q15 @Get negative
+ vneg.s16 q4, q0 @Get negative
+
+ vceq.s16 q5, q15, #0 @I compare with zero row 1 and 2 blk 1
+ vceq.s16 q6, q0 , #0 @I compare with zero row 1 and 2 blk 1
+
+ vbsl.s16 q2, q1, q15 @Restore sign of row 1 and 2
+ vbsl.s16 q3, q4, q0 @Restore sign of row 3 and 4
+
+
+ vmovn.u16 d14, q5 @I Narrow the comparison for row 1 and 2 blk 1
+ vmovn.u16 d15, q6 @I Narrow the comparison for row 1 and 2 blk 2
+
+    vshr.u8       q8, q7, #7          @I Reduce comparison to a single bit, rows 1-2 and 3-4 [ keep the value for later use ]
+
+ vpadd.u8 d18, d16, d17 @I pair add nnz 1
+ vpadd.u8 d20, d18, d19 @I Pair add nnz 2
+ vpadd.u8 d22, d20, d21 @I Pair add nnz 3
+ vpadd.u8 d24, d22, d23 @I Pair add nnz4
+ vst1.s16 {q2-q3}, [r2] @Store blk
+
+ vmov.u8 d25, #16 @I Get max nnz
+ vsub.u8 d26, d25, d24 @I invert current nnz
+
+ vst1.u8 d26[0], [r9] @I Write nnz
+
+ vpop {d8-d15}
+ pop {r4-r12, pc}
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_resi_trans_quant_chroma_4x4_a9
+@* Description : This function does residue calculation, forward transform
+@* and quantization for 4x4 chroma block.
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :source stride
+@ STACK : pred stride,
+@ dst stride,
+@ pointer to scaling matrix,
+@ pointer to threshold matrix,
+@ qbits,
+@ rounding factor,
+@ pointer to store nnz
+@ pointer to store unquantized dc values
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 40 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 11 2 2015 100664 First version
+@
+@*****************************************************************************
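+@ Hedged C sketch of the whole routine, for orientation only; helper and
+@ local names (fwd_transform_4x4, resi, trans) are illustrative and not part
+@ of this library:
+@
+@     WORD16 resi[16], trans[16];
+@     /* chroma is interleaved UV, so one plane is every second byte */
+@     for (i = 0; i < 4; i++)
+@         for (j = 0; j < 4; j++)
+@             resi[4 * i + j] = pu1_src[i * src_strd + 2 * j]
+@                             - pu1_pred[i * pred_strd + 2 * j];
+@     fwd_transform_4x4(resi, trans);  /* H.264 4x4 forward integer transform */
+@     /* trans[0] is also written out unquantized through the dc pointer */
+@     for (i = 0, nnz = 0; i < 16; i++)
+@     {
+@         sign   = (trans[i] < 0) ? -1 : 1;
+@         level  = (ABS(trans[i]) * scale_matrix[i] + round_factor) >> qbits;
+@         dst[i] = (WORD16)(sign * level);
+@         nnz   += (level != 0);
+@     }
+@     *nnz_ptr = nnz;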
+
+ .global ih264_resi_trans_quant_chroma_4x4_a9
+ih264_resi_trans_quant_chroma_4x4_a9:
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+ @STACK :pred stride
+ @ :scale matrix,
+ @ :threshold matrix
+ @ :qbits
+ @ :round factor
+ @ :nnz
+ @ :pu1_dc_alt_addr
+ push {r4-r12, lr} @push all the variables first
+
+ add r11, sp, #40 @point r11 to the stack arguments (above the 40 bytes of saved registers)
+ ldmfd r11, {r4-r10} @load the seven stack arguments into r4-r10
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+ @R4 :Pred stride
+ @R5 :scale matrix,
+ @R6 :threshold matrix
+ @R7 :qbits
+ @R8 :round factor
+ @R9 :nnz
+ vpush {d8-d15}
+ mov r11, #0
+ sub r7, r11, r7 @Negate the qbit value for using LSL
+
+ @------------Function Loading done----------------;
+
+ vld2.u8 {d10, d11}, [r0], r3 @load first 8 pix src row 1
+
+ vld2.u8 {d11, d12}, [r1], r4 @load first 8 pix pred row 1
+
+ vld2.u8 {d28, d29}, [r0], r3 @load first 8 pix src row 2
+
+ vld2.u8 {d29, d30}, [r1], r4 @load first 8 pix pred row 2
+
+ vld2.u8 {d25, d26}, [r0], r3 @load first 8 pix src row 3
+
+ vld2.u8 {d26, d27}, [r1], r4 @load first 8 pix pred row 3
+ vsubl.u8 q0, d10, d11 @find residue row 1
+
+ vld2.u8 {d22, d23}, [r0], r3 @load first 8 pix src row 4
+
+ vld2.u8 {d23, d24}, [r1], r4 @load first 8 pix pred row 4
+ vsubl.u8 q1, d28, d29 @find residue row 2
+
+ vsubl.u8 q2, d25, d26 @find residue row 3
+ vsubl.u8 q3, d22, d23 @find residue row 4
+
+ vtrn.16 d0, d2 @T12
+ vtrn.16 d4, d6 @T23
+ vtrn.32 d0, d4 @T13
+ vtrn.32 d2, d6 @T14
+
+ vadd.s16 d8 , d0, d6 @x0 = x4+x7
+ vadd.s16 d9 , d2, d4 @x1 = x5+x6
+ vsub.s16 d10, d2, d4 @x2 = x5-x6
+ vsub.s16 d11, d0, d6 @x3 = x4-x7
+
+ vshl.s16 d12, d10, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d13, d11, #1 @U_SHIFT(x3,1,shft)
+
+ vadd.s16 d14, d8, d9 @x4 = x0 + x1;
+ vsub.s16 d16, d8, d9 @x6 = x0 - x1;
+ vadd.s16 d15, d13, d10 @x5 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d17, d11, d12 @x7 = x3 - U_SHIFT(x2,1,shft);
+
+ @taking transpose again so as to make do vert transform
+ vtrn.16 d14, d15 @T12
+ vtrn.16 d16, d17 @T23
+ vtrn.32 d14, d16 @T13
+ vtrn.32 d15, d17 @T24
+
+ @let us do vertical transform
+ @same code as horiz
+ vadd.s16 d18, d14, d17 @x0 = x4+x7
+ vadd.s16 d19, d15, d16 @x1 = x5+x6
+ vsub.s16 d20, d15, d16 @x2 = x5-x6
+ vsub.s16 d21, d14, d17 @x3 = x4-x7
+
+ vshl.s16 d22, d20, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d23, d21, #1 @U_SHIFT(x3,1,shft)
+
+ vdup.s32 q4, r8 @Load rounding value row 1
+
+ vadd.s16 d24, d18, d19 @x5 = x0 + x1;
+ vsub.s16 d26, d18, d19 @x7 = x0 - x1;
+ vadd.s16 d25, d23, d20 @x6 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d27, d21, d22 @x8 = x3 - U_SHIFT(x2,1,shft);
+ vdup.s32 q10, r7 @Load qbit values
+
+ vst1.s16 d24[0], [r10] @Store unquantized dc value to dc alternate address
+
+@core transform is done for the 4x4 block
+ vld1.s16 {q14-q15}, [r5] @load the scaling values
+
+ vabs.s16 q0, q12 @Abs val of rows 1,2
+
+ vabs.s16 q1, q13 @Abs val of rows 3,4
+
+ vmov.s32 q5, q4 @copy round fact for row 2
+
+ vmov.s32 q6, q4 @copy round fact for row 3
+ vclt.s16 q2, q12, #0 @Get the sign of rows 1,2
+
+ vmov.s32 q7, q4 @copy round fact for row 4
+ vclt.s16 q3, q13, #0 @Get the sign of rows 3,4
+
+ vmlal.s16 q4, d0, d28 @Multiply and add row 1
+ vmlal.s16 q5, d1, d29 @Multiply and add row 2
+ vmlal.s16 q6, d2, d30 @Multiply and add row 3
+ vmlal.s16 q7, d3, d31 @Multiply and add row 4
+
+ vshl.s32 q11, q4, q10 @Shift row 1
+ vshl.s32 q12, q5, q10 @Shift row 2
+ vshl.s32 q13, q6, q10 @Shift row 3
+ vshl.s32 q14, q7, q10 @Shift row 4
+
+ vmovn.s32 d30, q11 @Narrow row 1
+ vmovn.s32 d31, q12 @Narrow row 2
+ vmovn.s32 d0 , q13 @Narrow row 3
+ vmovn.s32 d1 , q14 @Narrow row 4
+
+ vneg.s16 q1, q15 @Get negative
+ vneg.s16 q4, q0 @Get negative
+
+ vceq.s16 q5, q15, #0 @I compare with zero row 1 and 2 blk 1
+ vceq.s16 q6, q0 , #0 @I compare with zero row 3 and 4 blk 1
+
+ vbsl.s16 q2, q1, q15 @Restore sign of row 1 and 2
+ vbsl.s16 q3, q4, q0 @Restore sign of row 3 and 4
+
+ vmovn.u16 d14, q5 @I Narrow the comparison for row 1 and 2 blk 1
+ vmovn.u16 d15, q6 @I Narrow the comparison for row 3 and 4 blk 1
+
+ vshr.u8 q8, q7, #7 @I Reduce comparison result to a single bit for rows 1-4 [ keep the value for later use ]
+
+ vpadd.u8 d18, d16, d17 @I pair add nnz 1
+ vpadd.u8 d20, d18, d19 @I Pair add nnz 2
+ vpadd.u8 d22, d20, d21 @I Pair add nnz 3
+ vpadd.u8 d24, d22, d23 @I Pair add nnz4
+ vst1.s16 {q2-q3}, [r2] @Store blk
+
+ vmov.u8 d25, #16 @I Get max nnz
+ vsub.u8 d26, d25, d24 @I invert current nnz
+
+ vst1.u8 d26[0], [r9] @I Write nnz
+
+ vpop {d8-d15}
+ pop {r4-r12, pc}
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_hadamard_quant_4x4_a9
+@* Description : This function does forward hadamard transform and
+@* quantization for luma dc block
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to dst buffer
+@ R2 :pu2_scale_matrix
+@ R3 :pu2_threshold_matrix
+@ STACK : u4_qbits
+@ u4_round_factor
+@ pu1_nnz
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 0 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 20 2 2015 100633 First version
+@
+@*****************************************************************************
+@ih264_hadamard_quant_4x4_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
+@ const UWORD16 *pu2_scale_matrix,
+@ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
+@ UWORD32 u4_round_factor,UWORD8 *pu1_nnz
+@ )
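+@ Hedged C sketch of the operation (illustrative only; note that only
+@ pu2_scale_matrix[0] is used and the threshold matrix pointer is not
+@ referenced by this routine):
+@
+@     /* 4x4 Hadamard of the 16 luma DC terms: horizontal pass, then a
+@        vertical pass whose final butterfly is halved, e.g. (a + b) >> 1 */
+@     hadamard_4x4(pi2_src, tmp);
+@     for (i = 0, nnz = 0; i < 16; i++)
+@     {
+@         sign       = (tmp[i] < 0) ? -1 : 1;
+@         level      = (ABS(tmp[i]) * pu2_scale_matrix[0] + u4_round_factor)
+@                          >> u4_qbits;
+@         pi2_dst[i] = (WORD16)(sign * level);
+@         nnz       += (level != 0);
+@     }
+@     *pu1_nnz = nnz;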
+ .global ih264_hadamard_quant_4x4_a9
+ih264_hadamard_quant_4x4_a9:
+
+@Register usage
+@ r0 : src
+@ r1 : dst
+@ r2 : *pu2_scale_matrix
+@ r3 : *pu2_threshold_matrix
+
+ vld4.s16 {d0, d1, d2, d3}, [r0]! @Load 4x4 block
+ vpush {d8-d15}
+
+ vld1.u16 d30[0], [r2] @load pu2_scale_matrix[0]
+
+ vaddl.s16 q3, d0, d3 @x0 = x4 + x7;
+ vaddl.s16 q4, d1, d2 @x1 = x5 + x6;
+ vsubl.s16 q5, d1, d2 @x2 = x5 - x6;
+ vsubl.s16 q6, d0, d3 @x3 = x4 - x7;
+
+ vdup.u16 d30, d30[0] @pu2_scale_matrix[0]
+
+ vadd.s32 q7, q3, q4 @pi2_dst[0] = x0 + x1;
+ vadd.s32 q8, q6, q5 @pi2_dst[1] = x3 + x2;
+ add r3, sp, #68 @Get address of u4_round_factor
+ vsub.s32 q9, q3, q4 @pi2_dst[2] = x0 - x1;
+ vsub.s32 q10, q6, q5 @pi2_dst[3] = x3 - x2;
+
+ vtrn.s32 q7, q8 @transpose 4x4 block
+ vtrn.s32 q9, q10
+ vld1.s32 d0[0], [r3] @load u4_round_factor
+ vswp d15, d18
+ vswp d17, d20
+
+ add r3, sp, #64 @Get address of u4_qbits
+ vadd.s32 q11, q7, q10 @x0 = x4 + x7;
+ vadd.s32 q12, q8, q9 @x1 = x5 + x6;
+ vld1.s32 d31[0], [r3] @load u4_qbits
+ vsub.s32 q13, q8, q9 @x2 = x5 - x6;
+ vsub.s32 q14, q7, q10 @x3 = x4 - x7;
+
+ vdup.s32 q7, d0[0] @u4_round_factor
+
+ vadd.s32 q0, q11, q12 @(x0 + x1)
+ vadd.s32 q1, q14, q13 @(x3 + x2)
+ vsub.s32 q2, q11, q12 @(x0 - x1)
+ vsub.s32 q3, q14, q13 @(x3 - x2)
+
+ vdup.s32 q11, d31[0] @u4_qbits
+
+ vshrn.s32 d0, q0, #1 @i4_value = (x0 + x1) >> 1;
+ vshrn.s32 d1, q1, #1 @i4_value = (x3 + x2) >> 1;
+ vshrn.s32 d2, q2, #1 @i4_value = (x0 - x1) >> 1;
+ vshrn.s32 d3, q3, #1 @i4_value = (x3 - x2) >> 1;
+
+ vabs.s16 q5, q0
+ vabs.s16 q6, q1
+
+ vmov.s32 q8, q7 @Get the round fact
+ vmov.s32 q9, q7
+ vmov.s32 q10, q7
+
+ vclt.s16 q3, q0, #0 @get the sign row 1,2
+ vclt.s16 q4, q1, #0
+
+ vneg.s32 q11, q11 @-u4_qbits (negated for right shift)
+
+ vmlal.u16 q7, d10, d30
+ vmlal.u16 q8, d11, d30
+ vmlal.u16 q9, d12, d30
+ vmlal.u16 q10, d13, d30
+
+ vshl.u32 q7, q7, q11
+ vshl.u32 q8, q8, q11
+ vshl.u32 q9, q9, q11
+ vshl.u32 q10, q10, q11
+
+ vqmovn.u32 d22, q7
+ vqmovn.u32 d23, q8
+ vqmovn.u32 d24, q9
+ vqmovn.u32 d25, q10
+
+ vneg.s16 q13, q11
+ vneg.s16 q14, q12
+
+ vbsl.s16 q3, q13, q11
+ vbsl.s16 q4, q14, q12
+
+ vceq.s16 q5, q11, #0
+ vceq.s16 q6, q12, #0
+
+ vst1.s16 {q3}, [r1]!
+
+ vshrn.u16 d14, q5, #8
+ vshrn.u16 d15, q6, #8
+
+ ldr r3, [sp, #72] @Load *pu1_nnz
+
+ vshr.u8 q7, q7, #7
+
+ vst1.s16 {q4}, [r1]!
+
+ vadd.u8 d16, d14, d15
+ vmov.u8 d20, #16
+ vpadd.u8 d17, d16, d16
+ vpadd.u8 d18, d17, d17
+ vpadd.u8 d19, d18, d18
+ vsub.u8 d20, d20, d19
+ vst1.u8 d20[0], [r3]
+
+ vpop {d8-d15}
+ bx lr
+
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_hadamard_quant_2x2_uv_a9
+@* Description : This function does forward hadamard transform and
+@* quantization for the chroma dc blocks of both planes
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to dst buffer
+@ R2 :pu2_scale_matrix
+@ R3 :pu2_threshold_matrix
+@ STACK : u4_qbits
+@ u4_round_factor
+@ pu1_nnz
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 0 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 20 2 2015 100633 First version
+@
+@*****************************************************************************
+@ ih264_hadamard_quant_2x2_uv_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
+@ const UWORD16 *pu2_scale_matrix,
+@ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
+@ UWORD32 u4_round_factor,UWORD8 *pu1_nnz
+@ )
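+@ Hedged C sketch of the operation (illustrative only). pi2_src holds the
+@ interleaved U/V DC terms; each plane gets a 2x2 Hadamard followed by the
+@ same scale/round/shift quantization with pu2_scale_matrix[0], and one nnz
+@ byte per plane is written through pu1_nnz (output ordering illustrative):
+@
+@     for (plane = 0; plane < 2; plane++)
+@     {
+@         /* c0..c3: the plane's four DC terms */
+@         y[0] = c0 + c1 + c2 + c3;    y[1] = c0 - c1 + c2 - c3;
+@         y[2] = c0 + c1 - c2 - c3;    y[3] = c0 - c1 - c2 + c3;
+@         for (i = 0, nnz = 0; i < 4; i++)
+@         {
+@             sign  = (y[i] < 0) ? -1 : 1;
+@             level = (ABS(y[i]) * pu2_scale_matrix[0] + u4_round_factor)
+@                         >> u4_qbits;
+@             pi2_dst[4 * plane + i] = (WORD16)(sign * level);
+@             nnz  += (level != 0);
+@         }
+@         pu1_nnz[plane] = nnz;
+@     }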
+
+ .global ih264_hadamard_quant_2x2_uv_a9
+ih264_hadamard_quant_2x2_uv_a9:
+
+ vpush {d8-d15}
+ vld2.s16 {d0-d1}, [r0] @load src
+
+ add r3, sp, #68 @Get address of u4_round_factor
+
+ vaddl.s16 q3, d0, d1 @x0 = x4 + x5;, x2 = x6 + x7;
+ vld1.u16 d30[0], [r2] @load pu2_scale_matrix[0]
+ vsubl.s16 q4, d0, d1 @x1 = x4 - x5; x3 = x6 - x7;
+
+ add r0, sp, #64 @Get address of u4_qbits
+ vld1.s32 d28[0], [r3] @load u4_round_factor
+ vtrn.s32 q3, q4 @q3 -> x0 x1, q4 -> x2 x3
+
+ vadd.s32 q0, q3, q4 @ (x0 + x2) (x1 + x3) (y0 + y2); (y1 + y3);
+ vld1.s32 d24[0], [r0] @load u4_qbits
+ vsub.s32 q1, q3, q4 @ (x0 - x2) (x1 - x3) (y0 - y2); (y1 - y3);
+
+ vdup.u16 d30, d30[0] @pu2_scale_matrix
+
+ vabs.s32 q2, q0
+ vabs.s32 q3, q1
+
+ vdup.s32 q14, d28[0] @u4_round_factor
+
+ vmovl.u16 q15, d30 @pu2_scale_matrix
+
+ vclt.s32 q4, q0, #0 @get the sign row 1,2
+ vdup.s32 q12, d24[0] @u4_qbits
+ vclt.s32 q5, q1, #0
+
+ vqmovn.u32 d8, q4
+ vqmovn.s32 d9, q5
+
+ vmov.s32 q13, q14 @Get the round fact
+ vneg.s32 q12, q12 @-u4_qbits (negated for right shift)
+
+ vmla.u32 q13, q2, q15
+ vmla.u32 q14, q3, q15
+
+ vshl.u32 q13, q13, q12 @>>qbit
+ vshl.u32 q14, q14, q12 @>>qbit
+
+ vqmovn.u32 d10, q13
+ vqmovn.u32 d11, q14
+
+ vneg.s16 q6, q5
+
+ vbsl.s16 q4, q6, q5 @*sign
+
+ vtrn.s32 d8, d9
+
+ vceq.s16 q7, q4, #0 @Compute nnz
+
+ vshrn.u16 d14, q7, #8 @reduce nnz comparison to 1 bit
+
+ ldr r3, [sp, #72] @Load *pu1_nnz
+ vshr.u8 d14, d14, #7 @reduce nnz comparison to 1 bit
+ vmov.u8 d20, #4 @Since we sum the zero flags, subtract from 4 to get nnz
+ vpadd.u8 d17, d14, d14 @Sum up nnz
+
+ vst1.s16 {q4}, [r1]! @Store the block
+
+ vpadd.u8 d17, d17, d17 @Sum up nnz
+ vsub.u8 d20, d20, d17 @4- numzeros
+ vst1.u16 d20[0], [r3] @store nnz
+
+ vpop {d8-d15}
+ bx lr
+
+
+
+
+
diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s
new file mode 100755
index 0000000..ccae779
--- /dev/null
+++ b/common/arm/ih264_weighted_bi_pred_a9q.s
@@ -0,0 +1,642 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_weighted_bi_pred_a9q.s
+@*
+@* @brief
+@* Contains function definitions for weighted biprediction.
+@*
+@* @author
+@* Kaushik Senthoor R
+@*
+@* @par List of Functions:
+@*
+@* - ih264_weighted_bi_pred_luma_a9q()
+@* - ih264_weighted_bi_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@*******************************************************************************
+@* @function
+@* ih264_weighted_bi_pred_luma_a9q()
+@*
+@* @brief
+@* This routine performs the weighted biprediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates the weighted samples,
+@* rounds off, adds offset and stores it in the destination block.
+@*
+@* @param[in] pu1_src1
+@* UWORD8 Pointer to the buffer containing the input block 1.
+@*
+@* @param[in] pu1_src2
+@* UWORD8 Pointer to the buffer containing the input block 2.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the input buffer 1
+@*
+@* @param[in] src_strd2
+@* Stride of the input buffer 2
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt1
+@* weight for the weighted prediction
+@*
+@* @param[in] wt2
+@* weight for the weighted prediction
+@*
+@* @param[in] ofst1
+@* offset 1 used after rounding off
+@*
+@* @param[in] ofst2
+@* offset 2 used after rounding off
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt1,
+@ WORD32 wt2,
+@ WORD32 ofst1,
+@ WORD32 ofst2,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => log_wd (r6)
+@ [sp+12] => wt1 (r7)
+@ [sp+16] => wt2 (r8)
+@ [sp+20] => ofst1 (r9)
+@ [sp+24] => ofst2 (r10)
+@ [sp+28] => ht (r11)
+@ [sp+32] => wd (r12)
+@
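+@ Hedged C sketch of the per-sample computation below (illustrative only;
+@ CLIP_U8 denotes clipping to [0, 255]):
+@
+@     ofst = (ofst1 + ofst2 + 1) >> 1;
+@     for (every sample i of the ht x wd block)
+@     {
+@         pred       = (pu1_src1[i] * wt1 + pu1_src2[i] * wt2
+@                           + (1 << log_wd)) >> (log_wd + 1);
+@         pu1_dst[i] = CLIP_U8(pred + ofst);
+@     }
+@ The rounding right shift is implemented with vrshl by -(log_wd + 1).
+@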
+.text
+.p2align 2
+
+ .global ih264_weighted_bi_pred_luma_a9q
+
+ih264_weighted_bi_pred_luma_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r6, [sp, #48] @Load log_wd in r6
+ ldr r7, [sp, #52] @Load wt1 in r7
+ ldr r8, [sp, #56] @Load wt2 in r8
+ ldr r9, [sp, #60] @Load ofst1 in r9
+
+ add r6, r6, #1 @r6 = log_wd + 1
+ sxtb r7, r7 @sign-extend 8-bit wt1 to 32-bit
+ ldr r4, [sp, #40] @Load src_strd2 in r4
+ ldr r5, [sp, #44] @Load dst_strd in r5
+ sxtb r9, r9 @sign-extend 8-bit ofst1 to 32-bit
+ rsb r10, r6, #0 @r10 = -(log_wd + 1)
+ ldr r11, [sp, #68] @Load ht in r11
+ ldr r12, [sp, #72] @Load wd in r12
+ vdup.16 q0, r10 @Q0 = -(log_wd + 1) (16-bit)
+ add r9, r9, #1 @r9 = ofst1 + 1
+
+ ldr r10, [sp, #64] @Load ofst2 in r10
+ sxtb r8, r8 @sign-extend 8-bit wt2 to 32-bit
+ cmp r12, #16 @check if wd is 16
+ vpush {d8-d15}
+ sxtb r10, r10 @sign-extend 8-bit ofst2 to 32-bit
+ add r9, r9, r10 @r9 = ofst1 + ofst2 + 1
+ vmov d2, r7, r8 @D2 = {wt1(32-bit), wt2(32-bit)}
+ asr r9, r9, #1 @r9 = ofst = (ofst1 + ofst2 + 1) >> 1
+ vdup.8 d3, r9 @D3 = ofst (8-bit)
+ beq loop_16 @branch if wd is 16
+
+ cmp r12, #8 @check if wd is 8
+ beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+ vld1.32 d4[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d4[1], [r0], r3 @load row 2 in source 1
+ vld1.32 d6[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d6[1], [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q2, d4 @converting rows 1,2 in source 1 to 16-bit
+ vld1.32 d8[0], [r0], r3 @load row 3 in source 1
+ vld1.32 d8[1], [r0], r3 @load row 4 in source 1
+ vmovl.u8 q3, d6 @converting rows 1,2 in source 2 to 16-bit
+ vld1.32 d10[0], [r1], r4 @load row 3 in source 2
+ vld1.32 d10[1], [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d8 @converting rows 3,4 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting rows 3,4 in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for rows 1,2
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for rows 1,2
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for rows 3,4
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for rows 3,4
+
+ subs r11, r11, #4 @decrement ht by 4
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from rows 1,2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from rows 3,4
+
+ vaddw.s8 q2, q2, d3 @adding offset for rows 1,2
+ vaddw.s8 q4, q4, d3 @adding offset for rows 3,4
+
+ vqmovun.s16 d4, q2 @saturating rows 1,2 to unsigned 8-bit
+ vqmovun.s16 d8, q4 @saturating rows 3,4 to unsigned 8-bit
+
+ vst1.32 d4[0], [r2], r5 @store row 1 in destination
+ vst1.32 d4[1], [r2], r5 @store row 2 in destination
+ vst1.32 d8[0], [r2], r5 @store row 3 in destination
+ vst1.32 d8[1], [r2], r5 @store row 4 in destination
+
+ bgt loop_4 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: @each iteration processes four rows
+
+ vld1.8 d4, [r0], r3 @load row 1 in source 1
+ vld1.8 d6, [r1], r4 @load row 1 in source 2
+ vld1.8 d8, [r0], r3 @load row 2 in source 1
+ vld1.8 d10, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q2, d4 @converting row 1 in source 1 to 16-bit
+ vld1.8 d12, [r0], r3 @load row 3 in source 1
+ vld1.8 d14, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q3, d6 @converting row 1 in source 2 to 16-bit
+ vld1.8 d16, [r0], r3 @load row 4 in source 1
+ vld1.8 d18, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d8 @converting row 2 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting row 2 in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for row 1
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for row 1
+ vmovl.u8 q6, d12 @converting row 3 in source 1 to 16-bit
+ vmovl.u8 q7, d14 @converting row 3 in source 2 to 16-bit
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for row 2
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for row 2
+ vmovl.u8 q8, d16 @converting row 4 in source 1 to 16-bit
+ vmovl.u8 q9, d18 @converting row 4 in source 2 to 16-bit
+
+ vmul.s16 q6, q6, d2[0] @weight 1 mult. for row 3
+ vmla.s16 q6, q7, d2[2] @weight 2 mult. for row 3
+ vmul.s16 q8, q8, d2[0] @weight 1 mult. for row 4
+ vmla.s16 q8, q9, d2[2] @weight 2 mult. for row 4
+
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 3
+ vaddw.s8 q2, q2, d3 @adding offset for row 1
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 4
+ vaddw.s8 q4, q4, d3 @adding offset for row 2
+
+ vaddw.s8 q6, q6, d3 @adding offset for row 3
+ vqmovun.s16 d4, q2 @saturating row 1 to unsigned 8-bit
+ vaddw.s8 q8, q8, d3 @adding offset for row 4
+ vqmovun.s16 d8, q4 @saturating row 2 to unsigned 8-bit
+
+ vqmovun.s16 d12, q6 @saturating row 3 to unsigned 8-bit
+ vqmovun.s16 d16, q8 @saturating row 4 to unsigned 8-bit
+
+ vst1.8 d4, [r2], r5 @store row 1 in destination
+ vst1.8 d8, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 d12, [r2], r5 @store row 3 in destination
+ vst1.8 d16, [r2], r5 @store row 4 in destination
+
+ bgt loop_8 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: @each iteration processes two rows
+
+ vld1.8 {q2}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q3}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q4}, [r0], r3 @load row 2 in source 1
+ vld1.8 {q5}, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q10, d4 @converting row 1L in source 1 to 16-bit
+ vld1.8 {q6}, [r0], r3 @load row 3 in source 1
+ vld1.8 {q7}, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q11, d6 @converting row 1L in source 2 to 16-bit
+ vld1.8 {q8}, [r0], r3 @load row 4 in source 1
+ vld1.8 {q9}, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q2, d5 @converting row 1H in source 1 to 16-bit
+ vmovl.u8 q3, d7 @converting row 1H in source 2 to 16-bit
+
+ vmul.s16 q10, q10, d2[0] @weight 1 mult. for row 1L
+ vmla.s16 q10, q11, d2[2] @weight 2 mult. for row 1L
+ vmovl.u8 q12, d8 @converting row 2L in source 1 to 16-bit
+ vmovl.u8 q13, d10 @converting row 2L in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for row 1H
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for row 1H
+ vmovl.u8 q4, d9 @converting row 2H in source 1 to 16-bit
+ vmovl.u8 q5, d11 @converting row 2H in source 2 to 16-bit
+
+ vmul.s16 q12, q12, d2[0] @weight 1 mult. for row 2L
+ vmla.s16 q12, q13, d2[2] @weight 2 mult. for row 2L
+ vmovl.u8 q14, d12 @converting row 3L in source 1 to 16-bit
+ vmovl.u8 q15, d14 @converting row 3L in source 2 to 16-bit
+
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for row 2H
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for row 2H
+ vmovl.u8 q6, d13 @converting row 3H in source 1 to 16-bit
+ vmovl.u8 q7, d15 @converting row 3H in source 2 to 16-bit
+
+ vmul.s16 q14, q14, d2[0] @weight 1 mult. for row 3L
+ vmla.s16 q14, q15, d2[2] @weight 2 mult. for row 3L
+ vmovl.u8 q11, d16 @converting row 4L in source 1 to 16-bit
+ vmovl.u8 q3, d18 @converting row 4L in source 2 to 16-bit
+
+ vmul.s16 q6, q6, d2[0] @weight 1 mult. for row 3H
+ vmla.s16 q6, q7, d2[2] @weight 2 mult. for row 3H
+ vmovl.u8 q8, d17 @converting row 4H in source 1 to 16-bit
+ vmovl.u8 q9, d19 @converting row 4H in source 2 to 16-bit
+
+ vmul.s16 q11, q11, d2[0] @weight 1 mult. for row 4L
+ vmla.s16 q11, q3, d2[2] @weight 2 mult. for row 4L
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 1L
+
+ vmul.s16 q8, q8, d2[0] @weight 1 mult. for row 4H
+ vmla.s16 q8, q9, d2[2] @weight 2 mult. for row 4H
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1H
+
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 2L
+ vaddw.s8 q10, q10, d3 @adding offset for row 1L
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2H
+ vaddw.s8 q2, q2, d3 @adding offset for row 1H
+ vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 3L
+ vaddw.s8 q12, q12, d3 @adding offset for row 2L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 3H
+ vaddw.s8 q4, q4, d3 @adding offset for row 2H
+ vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 4L
+ vaddw.s8 q14, q14, d3 @adding offset for row 3L
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 4H
+ vaddw.s8 q6, q6, d3 @adding offset for row 3H
+
+ vqmovun.s16 d26, q10 @saturating row 1L to unsigned 8-bit
+ vaddw.s8 q11, q11, d3 @adding offset for row 4L
+ vqmovun.s16 d27, q2 @saturating row 1H to unsigned 8-bit
+ vaddw.s8 q8, q8, d3 @adding offset for row 4H
+
+ vqmovun.s16 d10, q12 @saturating row 2L to unsigned 8-bit
+ vqmovun.s16 d11, q4 @saturating row 2H to unsigned 8-bit
+ vqmovun.s16 d30, q14 @saturating row 3L to unsigned 8-bit
+ vqmovun.s16 d31, q6 @saturating row 3H to unsigned 8-bit
+ vst1.8 {q13}, [r2], r5 @store row 1 in destination
+ vqmovun.s16 d14, q11 @saturating row 4L to unsigned 8-bit
+ vqmovun.s16 d15, q8 @saturating row 4H to unsigned 8-bit
+
+ vst1.8 {q5}, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 {q15}, [r2], r5 @store row 3 in destination
+ vst1.8 {q7}, [r2], r5 @store row 4 in destination
+
+ bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from sp
+
+
+@*******************************************************************************
+@* @function
+@* ih264_weighted_bi_pred_chroma_a9q()
+@*
+@* @brief
+@* This routine performs the weighted biprediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates the weighted samples,
+@* rounds off, adds offset and stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src1
+@* UWORD8 Pointer to the buffer containing the input block 1.
+@*
+@* @param[in] pu1_src2
+@* UWORD8 Pointer to the buffer containing the input block 2.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the input buffer 1
+@*
+@* @param[in] src_strd2
+@* Stride of the input buffer 2
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt1
+@* weights for the weighted prediction in U and V
+@*
+@* @param[in] wt2
+@* weights for the weighted prediction in U and V
+@*
+@* @param[in] ofst1
+@* offset 1 used after rounding off for U and V
+@*
+@* @param[in] ofst2
+@* offset 2 used after rounding off for U and V
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt1,
+@ WORD32 wt2,
+@ WORD32 ofst1,
+@ WORD32 ofst2,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => log_wd (r6)
+@ [sp+12] => wt1 (r7)
+@ [sp+16] => wt2 (r8)
+@ [sp+20] => ofst1 (r9)
+@ [sp+24] => ofst2 (r10)
+@ [sp+28] => ht (r11)
+@ [sp+32] => wd (r12)
+@
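+@ Hedged C sketch (illustrative only). wt1 and wt2 pack the U and V weights
+@ in their low and high halfwords, ofst1 and ofst2 pack the U and V offsets
+@ in their low and high bytes; the interleaved UV samples then follow the
+@ same formula as luma, per component c (U or V):
+@
+@     ofst_c = (ofst1_c + ofst2_c + 1) >> 1;
+@     pred_c = (src1_c * wt1_c + src2_c * wt2_c + (1 << log_wd)) >> (log_wd + 1);
+@     dst_c  = CLIP_U8(pred_c + ofst_c);
+@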
+
+
+ .global ih264_weighted_bi_pred_chroma_a9q
+
+ih264_weighted_bi_pred_chroma_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r6, [sp, #48] @Load log_wd in r6
+ ldr r7, [sp, #52] @Load wt1 in r7
+ ldr r8, [sp, #56] @Load wt2 in r8
+ add r6, r6, #1 @r6 = log_wd + 1
+ ldr r9, [sp, #60] @Load ofst1 in r9
+ ldr r10, [sp, #64] @Load ofst2 in r10
+
+ rsb r12, r6, #0 @r12 = -(log_wd + 1)
+ ldr r4, [sp, #40] @Load src_strd2 in r4
+ ldr r5, [sp, #44] @Load dst_strd in r5
+ vdup.16 q0, r12 @Q0 = -(log_wd + 1) (16-bit)
+
+ ldr r11, [sp, #68] @Load ht in r11
+ vdup.32 q1, r7 @Q1 = (wt1_u, wt1_v) (32-bit)
+ ldr r12, [sp, #72] @Load wd in r12
+ vdup.32 q2, r8 @Q2 = (wt2_u, wt2_v) (32-bit)
+ asr r7, r9, #8 @r7 = ofst1_v
+ asr r8, r10, #8 @r8 = ofst2_v
+ vpush {d8-d15}
+ sxtb r9, r9 @sign-extend 8-bit ofst1_u to 32-bit
+ sxtb r10, r10 @sign-extend 8-bit ofst2_u to 32-bit
+ sxtb r7, r7 @sign-extend 8-bit ofst1_v to 32-bit
+ sxtb r8, r8 @sign-extend 8-bit ofst2_v to 32-bit
+
+ add r9, r9, #1 @r9 = ofst1_u + 1
+ add r7, r7, #1 @r7 = ofst1_v + 1
+ add r9, r9, r10 @r9 = ofst1_u + ofst2_u + 1
+ add r7, r7, r8 @r7 = ofst1_v + ofst2_v + 1
+ asr r9, r9, #1 @r9 = ofst_u = (ofst1_u + ofst2_u + 1) >> 1
+ asr r7, r7, #1 @r7 = ofst_v = (ofst1_v + ofst2_v + 1) >> 1
+ cmp r12, #8 @check if wd is 8
+ pkhbt r9, r9, r7, lsl #16 @r9 = {ofst_u(16-bit), ofst_v(16-bit)}
+ vdup.32 q3, r9 @Q3 = {ofst_u(16-bit), ofst_v(16-bit)}
+ beq loop_8_uv @branch if wd is 8
+
+ cmp r12, #4 @check if wd is 4
+ beq loop_4_uv @branch if wd is 4
+
+loop_2_uv: @each iteration processes two rows
+
+ vld1.32 d8[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d8[1], [r0], r3 @load row 2 in source 1
+ vld1.32 d10[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d10[1], [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q4, d8 @converting rows 1,2 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting rows 1,2 in source 2 to 16-bit
+
+ vmul.s16 q4, q4, q1 @weight 1 mult. for rows 1,2
+ vmla.s16 q4, q5, q2 @weight 2 mult. for rows 1,2
+
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from rows 1,2
+
+ vadd.s16 q4, q4, q3 @adding offset for rows 1,2
+
+ vqmovun.s16 d8, q4 @saturating rows 1,2 to unsigned 8-bit
+
+ vst1.32 d8[0], [r2], r5 @store row 1 in destination
+ vst1.32 d8[1], [r2], r5 @store row 2 in destination
+
+ subs r11, r11, #2 @decrement ht by 2
+ bgt loop_2_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_4_uv: @each iteration processes two rows
+
+ vld1.8 d8, [r0], r3 @load row 1 in source 1
+ vld1.8 d10, [r1], r4 @load row 1 in source 2
+ vmovl.u8 q4, d8 @converting row 1 in source 1 to 16-bit
+ vld1.8 d12, [r0], r3 @load row 2 in source 1
+ vmovl.u8 q5, d10 @converting row 1 in source 2 to 16-bit
+ vld1.8 d14, [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q6, d12 @converting row 2 in source 1 to 16-bit
+ vmul.s16 q4, q4, q1 @weight 1 mult. for row 1
+ vmla.s16 q4, q5, q2 @weight 2 mult. for row 1
+ vmovl.u8 q7, d14 @converting row 2 in source 2 to 16-bit
+
+ vmul.s16 q6, q6, q1 @weight 1 mult. for row 2
+ vmla.s16 q6, q7, q2 @weight 2 mult. for row 2
+
+ subs r11, r11, #2 @decrement ht by 2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 2
+ vadd.s16 q4, q4, q3 @adding offset for row 1
+ vadd.s16 q6, q6, q3 @adding offset for row 2
+
+ vqmovun.s16 d8, q4 @saturating row 1 to unsigned 8-bit
+ vqmovun.s16 d12, q6 @saturating row 2 to unsigned 8-bit
+
+ vst1.8 d8, [r2], r5 @store row 1 in destination
+ vst1.8 d12, [r2], r5 @store row 2 in destination
+
+ bgt loop_4_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_8_uv: @each iteration processes two rows
+
+ vld1.8 {q4}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q5}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q6}, [r0], r3 @load row 2 in source 1
+ vld1.8 {q7}, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q12, d8 @converting row 1L in source 1 to 16-bit
+ vld1.8 {q8}, [r0], r3 @load row 3 in source 1
+ vld1.8 {q9}, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q13, d10 @converting row 1L in source 2 to 16-bit
+ vld1.8 {q10}, [r0], r3 @load row 4 in source 1
+ vld1.8 {q11}, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d9 @converting row 1H in source 1 to 16-bit
+ vmovl.u8 q5, d11 @converting row 1H in source 2 to 16-bit
+
+ vmul.s16 q12, q12, q1 @weight 1 mult. for row 1L
+ vmla.s16 q12, q13, q2 @weight 2 mult. for row 1L
+ vmovl.u8 q14, d12 @converting row 2L in source 1 to 16-bit
+ vmovl.u8 q15, d14 @converting row 2L in source 2 to 16-bit
+
+ vmul.s16 q4, q4, q1 @weight 1 mult. for row 1H
+ vmla.s16 q4, q5, q2 @weight 2 mult. for row 1H
+ vmovl.u8 q6, d13 @converting row 2H in source 1 to 16-bit
+ vmovl.u8 q7, d15 @converting row 2H in source 2 to 16-bit
+
+ vmul.s16 q14, q14, q1 @weight 1 mult. for row 2L
+ vmla.s16 q14, q15, q2 @weight 2 mult. for row 2L
+ vmovl.u8 q13, d16 @converting row 3L in source 1 to 16-bit
+ vmovl.u8 q5, d18 @converting row 3L in source 2 to 16-bit
+
+ vmul.s16 q6, q6, q1 @weight 1 mult. for row 2H
+ vmla.s16 q6, q7, q2 @weight 2 mult. for row 2H
+ vmovl.u8 q8, d17 @converting row 3H in source 1 to 16-bit
+ vmovl.u8 q9, d19 @converting row 3H in source 2 to 16-bit
+
+ vmul.s16 q13, q13, q1 @weight 1 mult. for row 3L
+ vmla.s16 q13, q5, q2 @weight 2 mult. for row 3L
+ vmovl.u8 q15, d20 @converting row 4L in source 1 to 16-bit
+ vmovl.u8 q7, d22 @converting row 4L in source 2 to 16-bit
+
+ vmul.s16 q8, q8, q1 @weight 1 mult. for row 3H
+ vmla.s16 q8, q9, q2 @weight 2 mult. for row 3H
+ vmovl.u8 q10, d21 @converting row 4H in source 1 to 16-bit
+ vmovl.u8 q11, d23 @converting row 4H in source 2 to 16-bit
+
+ vmul.s16 q15, q15, q1 @weight 1 mult. for row 4L
+ vmla.s16 q15, q7, q2 @weight 2 mult. for row 4L
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 1L
+
+ vmul.s16 q10, q10, q1 @weight 1 mult. for row 4H
+ vmla.s16 q10, q11, q2 @weight 2 mult. for row 4H
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 1H
+
+ vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 2L
+ vadd.s16 q12, q12, q3 @adding offset for row 1L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 2H
+ vadd.s16 q4, q4, q3 @adding offset for row 1H
+ vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 3L
+ vadd.s16 q14, q14, q3 @adding offset for row 2L
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 3H
+ vadd.s16 q6, q6, q3 @adding offset for row 2H
+ vrshl.s16 q15, q15, q0 @rounds off the weighted samples from row 4L
+ vadd.s16 q13, q13, q3 @adding offset for row 3L
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 4H
+ vadd.s16 q8, q8, q3 @adding offset for row 3H
+
+ vqmovun.s16 d10, q12 @saturating row 1L to unsigned 8-bit
+ vadd.s16 q15, q15, q3 @adding offset for row 4L
+ vqmovun.s16 d11, q4 @saturating row 1H to unsigned 8-bit
+ vadd.s16 q10, q10, q3 @adding offset for row 4H
+
+ vqmovun.s16 d18, q14 @saturating row 2L to unsigned 8-bit
+ vqmovun.s16 d19, q6 @saturating row 2H to unsigned 8-bit
+ vqmovun.s16 d14, q13 @saturating row 3L to unsigned 8-bit
+ vqmovun.s16 d15, q8 @saturating row 3H to unsigned 8-bit
+ vst1.8 {q5}, [r2], r5 @store row 1 in destination
+ vqmovun.s16 d22, q15 @saturating row 4L to unsigned 8-bit
+ vqmovun.s16 d23, q10 @saturating row 4H to unsigned 8-bit
+
+ vst1.8 {q9}, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 {q7}, [r2], r5 @store row 3 in destination
+ vst1.8 {q11}, [r2], r5 @store row 4 in destination
+
+ bgt loop_8_uv @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from sp
+
+
diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s
new file mode 100755
index 0000000..1ce94d0
--- /dev/null
+++ b/common/arm/ih264_weighted_pred_a9q.s
@@ -0,0 +1,479 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_weighted_pred_a9q.s
+@*
+@* @brief
+@* Contains function definitions for weighted prediction.
+@*
+@* @author
+@* Kaushik Senthoor R
+@*
+@* @par List of Functions:
+@*
+@* - ih264_weighted_pred_luma_a9q()
+@* - ih264_weighted_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@*******************************************************************************
+@* @function
+@* ih264_weighted_pred_luma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
+@*
+@* @par Description:
+@* This function gets a ht x wd block, calculates the weighted sample, rounds
+@* off, adds offset and stores it in the destination block.
+@*
+@* @param[in] pu1_src:
+@* UWORD8 Pointer to the buffer containing the input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd
+@* Stride of the input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt
+@* weight for the weighted prediction
+@*
+@* @param[in] ofst
+@* offset used after rounding off
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_pred_luma_a9q(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt,
+@ WORD32 ofst,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src
+@ r1 => pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ [sp] => log_wd (r4)
+@ [sp+4] => wt (r5)
+@ [sp+8] => ofst (r6)
+@ [sp+12] => ht (r7)
+@ [sp+16] => wd (r8)
+@
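+@ Hedged C sketch of the per-sample computation below (illustrative only;
+@ CLIP_U8 denotes clipping to [0, 255]):
+@
+@     if (log_wd >= 1)
+@         pred = (pu1_src[i] * wt + (1 << (log_wd - 1))) >> log_wd;
+@     else
+@         pred = pu1_src[i] * wt;
+@     pu1_dst[i] = CLIP_U8(pred + ofst);
+@ Both cases are covered by a single vrshl by -log_wd (a rounding right
+@ shift, which is a plain copy when log_wd is 0).
+@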
+.text
+.p2align 2
+
+ .global ih264_weighted_pred_luma_a9q
+
+ih264_weighted_pred_luma_a9q:
+
+ stmfd sp!, {r4-r9, r14} @stack stores the values of the arguments
+ ldr r5, [sp, #32] @Load wt
+ ldr r4, [sp, #28] @Load log_wd in r4
+ ldr r6, [sp, #36] @Load ofst
+ ldr r7, [sp, #40] @Load ht
+ ldr r8, [sp, #44] @Load wd
+ vpush {d8-d15}
+
+ vdup.16 d2, r5 @D2 = wt (16-bit)
+ rsb r9, r4, #0 @r9 = -log_wd
+ vdup.8 d3, r6 @D3 = ofst (8-bit)
+ cmp r8, #16 @check if wd is 16
+ vdup.16 q0, r9 @Q0 = -log_wd (16-bit)
+ beq loop_16 @branch if wd is 16
+
+ cmp r8, #8 @check if wd is 8
+ beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+ vld1.32 d4[0], [r0], r2 @load row 1 in source
+ vld1.32 d4[1], [r0], r2 @load row 2 in source
+ vld1.32 d6[0], [r0], r2 @load row 3 in source
+ vld1.32 d6[1], [r0], r2 @load row 4 in source
+
+ vmovl.u8 q2, d4 @converting rows 1,2 to 16-bit
+ vmovl.u8 q3, d6 @converting rows 3,4 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight mult. for rows 1,2
+ vmul.s16 q3, q3, d2[0] @weight mult. for rows 3,4
+
+ subs r7, r7, #4 @decrement ht by 4
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from rows 1,2
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from rows 3,4
+
+ vaddw.s8 q2, q2, d3 @adding offset for rows 1,2
+ vaddw.s8 q3, q3, d3 @adding offset for rows 3,4
+
+ vqmovun.s16 d4, q2 @saturating rows 1,2 to unsigned 8-bit
+ vqmovun.s16 d6, q3 @saturating rows 3,4 to unsigned 8-bit
+
+ vst1.32 d4[0], [r1], r3 @store row 1 in destination
+ vst1.32 d4[1], [r1], r3 @store row 2 in destination
+ vst1.32 d6[0], [r1], r3 @store row 3 in destination
+ vst1.32 d6[1], [r1], r3 @store row 4 in destination
+
+ bgt loop_4 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: @each iteration processes four rows
+
+ vld1.8 d4, [r0], r2 @load row 1 in source
+ vld1.8 d6, [r0], r2 @load row 2 in source
+ vld1.8 d8, [r0], r2 @load row 3 in source
+ vmovl.u8 q2, d4 @converting row 1 to 16-bit
+ vld1.8 d10, [r0], r2 @load row 4 in source
+ vmovl.u8 q3, d6 @converting row 2 to 16-bit
+
+ vmovl.u8 q4, d8 @converting row 3 to 16-bit
+ vmul.s16 q2, q2, d2[0] @weight mult. for row 1
+ vmovl.u8 q5, d10 @converting row 4 to 16-bit
+ vmul.s16 q3, q3, d2[0] @weight mult. for row 2
+ vmul.s16 q4, q4, d2[0] @weight mult. for row 3
+ vmul.s16 q5, q5, d2[0] @weight mult. for row 4
+
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from row 2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 3
+ vaddw.s8 q2, q2, d3 @adding offset for row 1
+ vrshl.s16 q5, q5, q0 @rounds off the weighted samples from row 4
+ vaddw.s8 q3, q3, d3 @adding offset for row 2
+
+ vaddw.s8 q4, q4, d3 @adding offset for row 3
+ vqmovun.s16 d4, q2 @saturating row 1 to unsigned 8-bit
+ vaddw.s8 q5, q5, d3 @adding offset for row 4
+ vqmovun.s16 d6, q3 @saturating row 2 to unsigned 8-bit
+ vqmovun.s16 d8, q4 @saturating row 3 to unsigned 8-bit
+ vqmovun.s16 d10, q5 @saturating row 4 to unsigned 8-bit
+
+ vst1.8 d4, [r1], r3 @store row 1 in destination
+ vst1.8 d6, [r1], r3 @store row 2 in destination
+ subs r7, r7, #4 @decrement ht by 4
+ vst1.8 d8, [r1], r3 @store row 3 in destination
+ vst1.8 d10, [r1], r3 @store row 4 in destination
+
+ bgt loop_8 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: @each iteration processes two rows
+
+ vld1.8 {q2}, [r0], r2 @load row 1 in source
+ vld1.8 {q3}, [r0], r2 @load row 2 in source
+ vmovl.u8 q6, d4 @converting row 1L to 16-bit
+ vld1.8 {q4}, [r0], r2 @load row 3 in source
+ vmovl.u8 q7, d5 @converting row 1H to 16-bit
+ vld1.8 {q5}, [r0], r2 @load row 4 in source
+
+ vmovl.u8 q8, d6 @converting row 2L to 16-bit
+ vmul.s16 q6, q6, d2[0] @weight mult. for row 1L
+ vmovl.u8 q9, d7 @converting row 2H to 16-bit
+ vmul.s16 q7, q7, d2[0] @weight mult. for row 1H
+ vmovl.u8 q10, d8 @converting row 3L to 16-bit
+ vmul.s16 q8, q8, d2[0] @weight mult. for row 2L
+ vmovl.u8 q11, d9 @converting row 3H to 16-bit
+ vmul.s16 q9, q9, d2[0] @weight mult. for row 2H
+ vmovl.u8 q12, d10 @converting row 4L to 16-bit
+ vmul.s16 q10, q10, d2[0] @weight mult. for row 3L
+ vmovl.u8 q13, d11 @converting row 4H to 16-bit
+ vmul.s16 q11, q11, d2[0] @weight mult. for row 3H
+
+ vmul.s16 q12, q12, d2[0] @weight mult. for row 4L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 1L
+ vmul.s16 q13, q13, d2[0] @weight mult. for row 4H
+
+ vrshl.s16 q7, q7, q0 @rounds off the weighted samples from row 1H
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 2L
+ vaddw.s8 q6, q6, d3 @adding offset for row 1L
+ vrshl.s16 q9, q9, q0 @rounds off the weighted samples from row 2H
+ vaddw.s8 q7, q7, d3 @adding offset for row 1H
+ vqmovun.s16 d4, q6 @saturating row 1L to unsigned 8-bit
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 3L
+ vaddw.s8 q8, q8, d3 @adding offset for row 2L
+ vqmovun.s16 d5, q7 @saturating row 1H to unsigned 8-bit
+ vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 3H
+ vaddw.s8 q9, q9, d3 @adding offset for row 2H
+ vqmovun.s16 d6, q8 @saturating row 2L to unsigned 8-bit
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 4L
+ vaddw.s8 q10, q10, d3 @adding offset for row 3L
+ vqmovun.s16 d7, q9 @saturating row 2H to unsigned 8-bit
+ vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 4H
+ vaddw.s8 q11, q11, d3 @adding offset for row 3H
+
+ vqmovun.s16 d8, q10 @saturating row 3L to unsigned 8-bit
+ vaddw.s8 q12, q12, d3 @adding offset for row 4L
+ vqmovun.s16 d9, q11 @saturating row 3H to unsigned 8-bit
+ vaddw.s8 q13, q13, d3 @adding offset for row 4H
+
+ vqmovun.s16 d10, q12 @saturating row 4L to unsigned 8-bit
+ vst1.8 {q2}, [r1], r3 @store row 1 in destination
+ vqmovun.s16 d11, q13 @saturating row 4H to unsigned 8-bit
+ vst1.8 {q3}, [r1], r3 @store row 2 in destination
+ subs r7, r7, #4 @decrement ht by 4
+ vst1.8 {q4}, [r1], r3 @store row 3 in destination
+ vst1.8 {q5}, [r1], r3 @store row 4 in destination
+
+ bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r9, r15} @Reload the registers from sp
+
+
+@*******************************************************************************
+@* @function
+@* ih264_weighted_pred_chroma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@* This function gets a ht x wd block, calculates the weighted sample, rounds
+@* off, adds offset and stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src:
+@* UWORD8 Pointer to the buffer containing the input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd
+@* Stride of the input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt
+@* weights for the weighted prediction for U and V
+@*
+@* @param[in] ofst
+@* offsets used after rounding off for U and V
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_pred_chroma_a9q(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt,
+@ WORD32 ofst,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src
+@ r1 => pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ [sp] => log_wd (r4)
+@ [sp+4] => wt (r5)
+@ [sp+8] => ofst (r6)
+@ [sp+12] => ht (r7)
+@ [sp+16] => wd (r8)
+@
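+@ Hedged C sketch (illustrative only). wt packs the U and V weights in its
+@ low and high halfwords, ofst packs the U and V offsets in its low and high
+@ bytes; each interleaved UV sample follows the luma formula per component c:
+@
+@     pred_c = (pu1_src[i] * wt_c + (1 << (log_wd - 1))) >> log_wd;  /* log_wd >= 1 */
+@     pu1_dst[i] = CLIP_U8(pred_c + ofst_c);
+@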
+
+
+ .global ih264_weighted_pred_chroma_a9q
+
+ih264_weighted_pred_chroma_a9q:
+
+ stmfd sp!, {r4-r9, r14} @stack stores the values of the arguments
+
+ ldr r4, [sp, #28] @Load log_wd in r4
+ ldr r5, [sp, #32] @Load wt = {wt_u (16-bit), wt_v (16-bit)}
+ ldr r6, [sp, #36] @Load ofst = {ofst_u (8-bit), ofst_v (8-bit)}
+ ldr r8, [sp, #44] @Load wd
+
+ rsb r9, r4, #0 @r9 = -log_wd
+ vdup.32 q1, r5 @Q1 = {wt_u (16-bit), wt_v (16-bit)}
+ ldr r7, [sp, #40] @Load ht
+ vpush {d8-d15}
+ vdup.16 d4, r6 @D4 = {ofst_u (8-bit), ofst_v (8-bit)}
+ cmp r8, #8 @check if wd is 8
+ vdup.16 q0, r9 @Q0 = -log_wd (16-bit)
+ beq loop_8_uv @branch if wd is 8
+
+ cmp r8, #4 @check if wd is 4
+ beq loop_4_uv @branch if wd is 4
+
+loop_2_uv: @each iteration processes two rows
+
+ vld1.32 d6[0], [r0], r2 @load row 1 in source
+ vld1.32 d6[1], [r0], r2 @load row 2 in source
+
+ vmovl.u8 q3, d6 @converting rows 1,2 to 16-bit
+
+ vmul.s16 q3, q3, q1 @weight mult. for rows 1,2
+
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from rows 1,2
+
+ vaddw.s8 q3, q3, d4 @adding offset for rows 1,2
+
+ vqmovun.s16 d6, q3 @saturating rows 1,2 to unsigned 8-bit
+
+ subs r7, r7, #2 @decrement ht by 2
+ vst1.32 d6[0], [r1], r3 @store row 1 in destination
+ vst1.32 d6[1], [r1], r3 @store row 2 in destination
+
+ bgt loop_2_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_4_uv: @each iteration processes two rows
+
+ vld1.8 d6, [r0], r2 @load row 1 in source
+ vld1.8 d8, [r0], r2 @load row 2 in source
+
+ vmovl.u8 q3, d6 @converting row 1 to 16-bit
+ vmovl.u8 q4, d8 @converting row 2 to 16-bit
+
+ vmul.s16 q3, q3, q1 @weight mult. for row 1
+ vmul.s16 q4, q4, q1 @weight mult. for row 2
+
+ subs r7, r7, #2 @decrement ht by 2
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2
+
+ vaddw.s8 q3, q3, d4 @adding offset for row 1
+ vaddw.s8 q4, q4, d4 @adding offset for row 2
+
+ vqmovun.s16 d6, q3 @saturating row 1 to unsigned 8-bit
+ vqmovun.s16 d8, q4 @saturating row 2 to unsigned 8-bit
+
+ vst1.8 d6, [r1], r3 @store row 1 in destination
+ vst1.8 d8, [r1], r3 @store row 2 in destination
+
+ bgt loop_4_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_8_uv: @each iteration processes two rows
+
+ vld1.8 {q3}, [r0], r2 @load row 1 in source
+ vld1.8 {q4}, [r0], r2 @load row 2 in source
+ vmovl.u8 q7, d6 @converting row 1L to 16-bit
+ vld1.8 {q5}, [r0], r2 @load row 3 in source
+ vmovl.u8 q8, d7 @converting row 1H to 16-bit
+ vld1.8 {q6}, [r0], r2 @load row 4 in source
+
+ vmul.s16 q7, q7, q1 @weight mult. for row 1L
+ vmovl.u8 q9, d8 @converting row 2L to 16-bit
+ vmul.s16 q8, q8, q1 @weight mult. for row 1H
+ vmovl.u8 q10, d9 @converting row 2H to 16-bit
+ vmul.s16 q9, q9, q1 @weight mult. for row 2L
+ vmovl.u8 q11, d10 @converting row 3L to 16-bit
+ vmul.s16 q10, q10, q1 @weight mult. for row 2H
+ vmovl.u8 q12, d11 @converting row 3H to 16-bit
+ vmul.s16 q11, q11, q1 @weight mult. for row 3L
+ vmovl.u8 q13, d12 @converting row 4L to 16-bit
+ vmul.s16 q12, q12, q1 @weight mult. for row 3H
+ vmovl.u8 q14, d13 @converting row 4H to 16-bit
+
+ vmul.s16 q13, q13, q1 @weight mult. for row 4L
+ vrshl.s16 q7, q7, q0 @rounds off the weighted samples from row 1L
+ vmul.s16 q14, q14, q1 @weight mult. for row 4H
+
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 1H
+ vrshl.s16 q9, q9, q0 @rounds off the weighted samples from row 2L
+ vaddw.s8 q7, q7, d4 @adding offset for row 1L
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 2H
+ vaddw.s8 q8, q8, d4 @adding offset for row 1H
+ vqmovun.s16 d6, q7 @saturating row 1L to unsigned 8-bit
+ vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 3L
+ vaddw.s8 q9, q9, d4 @adding offset for row 2L
+ vqmovun.s16 d7, q8 @saturating row 1H to unsigned 8-bit
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 3H
+ vaddw.s8 q10, q10, d4 @adding offset for row 2H
+ vqmovun.s16 d8, q9 @saturating row 2L to unsigned 8-bit
+ vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 4L
+ vaddw.s8 q11, q11, d4 @adding offset for row 3L
+ vqmovun.s16 d9, q10 @saturating row 2H to unsigned 8-bit
+ vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 4H
+ vaddw.s8 q12, q12, d4 @adding offset for row 3H
+
+ vqmovun.s16 d10, q11 @saturating row 3L to unsigned 8-bit
+ vaddw.s8 q13, q13, d4 @adding offset for row 4L
+ vqmovun.s16 d11, q12 @saturating row 3H to unsigned 8-bit
+ vaddw.s8 q14, q14, d4 @adding offset for row 4H
+
+ vqmovun.s16 d12, q13 @saturating row 4L to unsigned 8-bit
+ vst1.8 {q3}, [r1], r3 @store row 1 in destination
+ vqmovun.s16 d13, q14 @saturating row 4H to unsigned 8-bit
+ vst1.8 {q4}, [r1], r3 @store row 2 in destination
+ subs r7, r7, #4 @decrement ht by 4
+ vst1.8 {q5}, [r1], r3 @store row 3 in destination
+ vst1.8 {q6}, [r1], r3 @store row 4 in destination
+
+ bgt loop_8_uv @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r9, r15} @Reload the registers from sp
+
+