54 files changed, 1051 insertions, 1625 deletions
diff --git a/common/arm/ih264_arm_memory_barrier.s b/common/arm/ih264_arm_memory_barrier.s
index 523218f..3816409 100644
--- a/common/arm/ih264_arm_memory_barrier.s
+++ b/common/arm/ih264_arm_memory_barrier.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @*******************************************************************************
 @* @file
 @*  ih264_arm_memory_barrier.s
@@ -39,7 +39,6 @@
 .text
 .p2align 2
 
-
 @*****************************************************************************
 @*
 @* Function Name    : ih264_arm_dsb
diff --git a/common/arm/ih264_deblk_chroma_a9.s b/common/arm/ih264_deblk_chroma_a9.s
index 66102a7..8c9960a 100644
--- a/common/arm/ih264_deblk_chroma_a9.s
+++ b/common/arm/ih264_deblk_chroma_a9.s
@@ -54,7 +54,7 @@
 .text
 .p2align 2
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -84,7 +84,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_horz_bs4_bp_a9
 
@@ -130,7 +130,7 @@ ih264_deblk_chroma_horz_bs4_bp_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -160,7 +160,7 @@ ih264_deblk_chroma_horz_bs4_bp_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_vert_bs4_bp_a9
 
@@ -224,7 +224,7 @@ ih264_deblk_chroma_vert_bs4_bp_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -260,7 +260,7 @@ ih264_deblk_chroma_vert_bs4_bp_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_horz_bslt4_bp_a9
 
@@ -326,7 +326,7 @@ ih264_deblk_chroma_horz_bslt4_bp_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -362,7 +362,7 @@ ih264_deblk_chroma_horz_bslt4_bp_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_vert_bslt4_bp_a9
 
@@ -465,7 +465,7 @@ ih264_deblk_chroma_vert_bslt4_bp_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -495,7 +495,7 @@ ih264_deblk_chroma_vert_bslt4_bp_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_vert_bs4_mbaff_bp_a9
 
@@ -543,7 +543,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_bp_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -579,7 +579,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_bp_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9
 
@@ -656,7 +656,7 @@ ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -692,7 +692,7 @@ ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_horz_bs4_a9
 
@@ -743,7 +743,7 @@ ih264_deblk_chroma_horz_bs4_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -779,7 +779,7 @@ ih264_deblk_chroma_horz_bs4_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_vert_bs4_a9
 
@@ -848,7 +848,7 @@ ih264_deblk_chroma_vert_bs4_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -893,7 +893,7 @@ ih264_deblk_chroma_vert_bs4_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_horz_bslt4_a9
 
@@ -968,7 +968,7 @@ ih264_deblk_chroma_horz_bslt4_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -1013,7 +1013,7 @@ ih264_deblk_chroma_horz_bslt4_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_vert_bslt4_a9
 
@@ -1119,7 +1119,7 @@ ih264_deblk_chroma_vert_bslt4_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -1155,7 +1155,7 @@ ih264_deblk_chroma_vert_bslt4_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_vert_bs4_mbaff_a9
 
@@ -1206,7 +1206,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -1251,7 +1251,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_chroma_vert_bslt4_mbaff_a9
 
diff --git a/common/arm/ih264_deblk_luma_a9.s b/common/arm/ih264_deblk_luma_a9.s
index 3e6a4d9..9217ed2 100644
--- a/common/arm/ih264_deblk_luma_a9.s
+++ b/common/arm/ih264_deblk_luma_a9.s
@@ -47,7 +47,7 @@
 .text
 .p2align 2
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -83,7 +83,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_luma_horz_bslt4_a9
 
@@ -187,7 +187,7 @@ ih264_deblk_luma_horz_bslt4_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -217,7 +217,7 @@ ih264_deblk_luma_horz_bslt4_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_luma_horz_bs4_a9
 
@@ -353,7 +353,7 @@ ih264_deblk_luma_horz_bs4_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -389,7 +389,7 @@ ih264_deblk_luma_horz_bs4_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_luma_vert_bslt4_a9
 
@@ -574,7 +574,7 @@ ih264_deblk_luma_vert_bslt4_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -604,7 +604,7 @@ ih264_deblk_luma_vert_bslt4_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_luma_vert_bs4_a9
 
@@ -800,7 +800,7 @@ ih264_deblk_luma_vert_bs4_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -830,7 +830,7 @@ ih264_deblk_luma_vert_bs4_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_luma_vert_bs4_mbaff_a9
 
@@ -942,7 +942,7 @@ ih264_deblk_luma_vert_bs4_mbaff_a9:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -978,7 +978,7 @@ ih264_deblk_luma_vert_bs4_mbaff_a9:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
     .global ih264_deblk_luma_vert_bslt4_mbaff_a9
 
diff --git a/common/arm/ih264_default_weighted_pred_a9q.s b/common/arm/ih264_default_weighted_pred_a9q.s
index 94cda46..a4688f2 100644
--- a/common/arm/ih264_default_weighted_pred_a9q.s
+++ b/common/arm/ih264_default_weighted_pred_a9q.s
@@ -17,14 +17,13 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_default_weighted_pred_a9q.s
 @*
 @* @brief
 @*  Contains function definitions for default weighted prediction.
-@* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
 @*
 @* @author
 @*  Kaushik Senthoor R
@@ -38,7 +37,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @*******************************************************************************
 @* @function
 @*  ih264_default_weighted_pred_luma_a9q()
@@ -82,7 +81,7 @@
 @*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1,
 @                                          UWORD8 *pu1_src2,
 @                                          UWORD8 *pu1_dst,
@@ -256,7 +255,7 @@ end_loops:
 @*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1,
 @                                            UWORD8 *pu1_src2,
 @                                            UWORD8 *pu1_dst,
diff --git a/common/arm/ih264_ihadamard_scaling_a9.s b/common/arm/ih264_ihadamard_scaling_a9.s
index 687099a..c7feddd 100644
--- a/common/arm/ih264_ihadamard_scaling_a9.s
+++ b/common/arm/ih264_ihadamard_scaling_a9.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @ *******************************************************************************
 @ * @file
 @ *  ih264_ihadamard_scaling_a9.s
@@ -37,7 +37,7 @@
 @ *  None
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
 @ * of a 16x16 intra prediction macroblock, and then performs scaling.
 @ * prediction buffer
@@ -69,10 +69,10 @@
 @ * @remarks none
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
 @       WORD16* pi2_out,
 @       const UWORD16 *pu2_iscal_mat,
@@ -161,7 +161,7 @@ ih264_ihadamard_scaling_4x4_a9:
 
 
 @ *******************************************************************************
-@ */
+@ *
 @ * @brief This function performs a 2x2 inverse hadamard transform for chroma block
 @ *
 @ * @par Description:
@@ -189,10 +189,10 @@ ih264_ihadamard_scaling_4x4_a9:
 @ * @remarks none
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
 @                                  WORD16* pi2_out,
 @                                  const UWORD16 *pu2_iscal_mat,
diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s
index afd2860..6681a7c 100644
--- a/common/arm/ih264_inter_pred_chroma_a9q.s
+++ b/common/arm/ih264_inter_pred_chroma_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_inter_pred_chroma_a9q.s
@@ -36,16 +36,16 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
 @
 
-@/**
-@/**
-@/**
+@**
+@**
+@**
 @
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -85,7 +85,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
 @void ih264_inter_pred_chroma(UWORD8 *pu1_src,
 @                             UWORD8 *pu1_dst,
@@ -112,8 +112,6 @@
 
 ih264_inter_pred_chroma_a9q:
 
-
-
     stmfd         sp!, {r4-r12, r14}    @store register values to stack
     vstmdb        sp!, {d8-d15}         @push neon registers to stack
     ldr           r4, [sp, #104]
diff --git a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
index ea6bba0..62b4b94 100644
--- a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
+++ b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_inter_pred_luma_horz_a9q.s
@@ -36,13 +36,13 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
 @
 
-@/**
-@/**
+@**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -76,7 +76,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
 @void ih264_inter_pred_luma_horz (
 @                            UWORD8 *pu1_src,
@@ -102,6 +102,9 @@
 
 ih264_inter_pred_luma_horz_a9q:
 
+
+
+
     stmfd         sp!, {r4-r12, r14}    @store register values to stack
     vstmdb        sp!, {d8-d15}         @push neon registers to stack
     ldr           r5, [sp, #104]        @Loads ht
@@ -116,7 +119,7 @@ ih264_inter_pred_luma_horz_a9q:
     beq           loop_4
 
 loop_16:                                @when  wd=16
-    @// Processing row0 and row1
+    @ Processing row0 and row1
     vld1.8        {d2, d3, d4}, [r0], r2 @// Load row0                        ;for checking loop
     vext.8        d31, d2, d3, #5       @//extract a[5]                         (column1,row0)
     vld1.8        {d5, d6, d7}, [r0], r2 @// Load row1
@@ -173,7 +176,7 @@ loop_16:                                @when  wd=16
     b             loop_16               @ loop if height == 8 or 16
 
 loop_8:
-@// Processing row0 and row1
+@ Processing row0 and row1
     vld1.8        {d5, d6}, [r0], r2    @// Load row1
     vext.8        d28, d5, d6, #5       @//extract a[5]                         (column1,row1)
     vld1.8        {d2, d3}, [r0], r2    @// Load row0
@@ -204,7 +207,7 @@ loop_8:
 
     beq           end_func              @ Branch if height==4
 
-    b             loop_8 @looping if height =8 or 16
+    b             loop_8                @looping if height =8 or 16
 
 loop_4:
     vld1.8        {d5, d6}, [r0], r2    @// Load row1
diff --git a/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
index 5b29e02..65c40a6 100644
--- a/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
+++ b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_inter_pred_luma_vert_a9q.s
@@ -36,14 +36,14 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
 @
 
-@/**
-@/**
-@/**
+@**
+@**
+@**
 @ *******************************************************************************
 @ *
 @ * @brief
@@ -195,10 +195,10 @@ loop_16:                                @when  wd=16
     subne         r0, r0, r2
     beq           end_func              @ Branch if height==4
 
-    b             loop_16 @ looping if height = 8 or 16
+    b             loop_16               @ looping if height = 8 or 16
 
 loop_8:
-@// Processing row0 and row1
+@ Processing row0 and row1
 
     vld1.u32      d0, [r0], r2          @ Vector load from src[0_0]
     vld1.u32      d1, [r0], r2          @ Vector load from src[1_0]
@@ -248,7 +248,7 @@ loop_8:
 
 
 loop_4:
-@// Processing row0 and row1
+@ Processing row0 and row1
 
     vld1.u32      d0[0], [r0], r2       @ Vector load from src[0_0]
     vld1.u32      d1[0], [r0], r2       @ Vector load from src[1_0]
diff --git a/common/arm/ih264_inter_pred_luma_bilinear_a9q.s b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
index 6a3c83d..8f049f8 100644
--- a/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_inter_pred_luma_bilinear_a9q.s
@@ -36,14 +36,14 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
 @
 
-@/**
-@/**
-@/**
+@**
+@**
+@**
 @ *******************************************************************************
 @ *  function:ih264_inter_pred_luma_bilinear
 @ *
@@ -89,7 +89,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
 @void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
 @                                   UWORD8 *pu1_src2,
@@ -192,7 +192,7 @@ loop_16:                                @when  wd=16
     subs          r12, r6, #8
     vst1.8        {q15}, [r2], r5       @//Store dest row7
 
-    beq           end_func @ end function if ht=8
+    beq           end_func              @ end function if ht=8
 
     vld1.8        {q0}, [r0], r3        @// Load row8 ;src1
     vaddl.u8      q10, d0, d4
@@ -275,7 +275,7 @@ loop_8: @wd=8;
     vqrshrun.s16  d31, q13, #1
     subs          r12, r6, #4
     vst1.8        {d31}, [r2], r5       @//Store dest row3
-    beq           end_func @ end function if ht=4
+    beq           end_func              @ end function if ht=4
 
     vld1.8        {d12}, [r1], r4       @// Load row4 ;src2
     vld1.8        {d8}, [r0], r3        @// Load row4 ;src1
@@ -298,7 +298,7 @@ loop_8: @wd=8;
     vqrshrun.s16  d31, q11, #1
     subs          r12, r6, #8
     vst1.8        {d31}, [r2], r5       @//Store dest row7
-    beq           end_func @ end function if ht=8
+    beq           end_func              @ end function if ht=8
 
     vld1.8        {d0}, [r0], r3        @// Load row8 ;src1
     vld1.8        {d4}, [r1], r4        @// Load row8  ;src2
@@ -367,7 +367,7 @@ loop_4:
     vqrshrun.s16  d31, q13, #1
     subs          r12, r6, #4
     vst1.32       d31[0], [r2], r5      @//Store dest row3
-    beq           end_func @ end function if ht=4
+    beq           end_func              @ end function if ht=4
 
     vld1.32       d12[0], [r1], r4      @// Load row4 ;src2
     vld1.32       d8[0], [r0], r3       @// Load row4 ;src1
diff --git a/common/arm/ih264_inter_pred_luma_copy_a9q.s b/common/arm/ih264_inter_pred_luma_copy_a9q.s
index 8ba2fbf..c0b0568 100644
--- a/common/arm/ih264_inter_pred_luma_copy_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_copy_a9q.s
@@ -17,8 +17,8 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
-@/**
+@**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -53,7 +53,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_inter_pred_luma_copy (
 @                            UWORD8 *pu1_src,
 @                            UWORD8 *pu1_dst,
@@ -182,7 +182,7 @@ end_inner_loop_wd_16:
     ldmfd         sp!, {r4-r12, r15}    @Reload the registers from SP
 
 
-@ /*
+@ *
 @ ********************************************************************************
 @ *
 @ * @brief This function copies a 4x4 block to destination
@@ -208,7 +208,7 @@ end_inner_loop_wd_16:
 @ * Currently wd and height is not used, ie a 4x4 block is always copied
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @ void ih264_interleave_copy(WORD16 *pi2_src,
 @                            UWORD8 *pu1_out,
 @                            WORD32 pred_strd,
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
index 43321a8..54183f0 100644
--- a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
@@ -36,14 +36,14 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
 @
 
-@/**
-@/**
-@/**
+@**
+@**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -88,7 +88,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/;
+@*
 
 @void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
 @                                UWORD8 *pu1_dst,
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
index 65a6de7..c8edf38 100644
--- a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
@@ -36,14 +36,14 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
 @
 
-@/**
-@/**
-@/**
+@**
+@**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -91,7 +91,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/;
+@*
 
 @void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
 @                                UWORD8 *pu1_dst,
@@ -835,7 +835,7 @@ loop_8:
     vmov          q7, q14
     vst1.32       d30, [r1], r3         @ store row 3
 
-    bgt           loop_8 @if height =8 or 16  loop
+    bgt           loop_8                @if height =8 or 16  loop
     b             end_func
 
 loop_4_start:
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
index c39ae01..ab1d1d1 100644
--- a/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_inter_pred_luma_horz_qpel_a9q.s
@@ -30,19 +30,19 @@
 @*
 @* @par List of Functions:
 @*
-@*  - ih264_inter_pred_luma_horz_qpe_a9ql()
+@*  - ih264_inter_pred_luma_horz_qpel_a9q()
 @*
 @* @remarks
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
 @
 
-@/**
-@/**
+@**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -79,7 +79,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
 @void ih264_inter_pred_luma_horz (
 @                            UWORD8 *pu1_src,
@@ -126,7 +126,7 @@ ih264_inter_pred_luma_horz_qpel_a9q:
     beq           loop_4
 
 loop_16:                                @when  wd=16
-    @// Processing row0 and row1
+    @ Processing row0 and row1
     vld1.8        {d2, d3, d4}, [r0], r2 @// Load row0
     vext.8        d31, d2, d3, #5       @//extract a[5]                         (column1,row0)
     vld1.8        {d5, d6, d7}, [r0], r2 @// Load row1
@@ -187,7 +187,7 @@ loop_16:                                @when  wd=16
     b             loop_16
 
 loop_8:
-@// Processing row0 and row1
+@ Processing row0 and row1
 
     vld1.8        {d5, d6}, [r0], r2    @// Load row1
     vext.8        d28, d5, d6, #5       @//extract a[5]                         (column1,row1)
@@ -221,7 +221,7 @@ loop_8:
     subs          r5, r5, #2            @ 2 rows done, decrement by 2
 
     beq           end_func              @ Branch if height==4
-    b             loop_8 @looping if height == 8 or 16
+    b             loop_8                @looping if height == 8 or 16
 
 loop_4:
     vld1.8        {d5, d6}, [r0], r2    @// Load row1
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
index 565cc80..3c63ca3 100644
--- a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
@@ -36,14 +36,14 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
 @
 
-@/**
-@/**
-@/**
+@**
+@**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -91,7 +91,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/;
+@*
 
 @void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
 @                                UWORD8 *pu1_dst,
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
index 3c8b60a..cfe03a0 100644
--- a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
@@ -36,14 +36,11 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
 @
 
-@/**
-@/**
-@/**
 @*******************************************************************************
 @*
 @* @brief
@@ -90,7 +87,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/;
+@*
 
 @void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
 @                                UWORD8 *pu1_dst,
diff --git a/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
index d45055e..e2c68ef 100644
--- a/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_inter_pred_luma_vert_qpel_a9q.s
@@ -36,13 +36,11 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
 @
 
-@/**
-@/**
 @*******************************************************************************
 @*
 @* @brief
@@ -79,7 +77,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
 @void ih264_inter_pred_luma_vert (
 @                            UWORD8 *pu1_src,
@@ -211,12 +209,12 @@ loop_16:                                @when  wd=16
     subne         r0, r0, r2
     beq           end_func              @ Branch if height==4
 
-    b             loop_16 @ looping if height = 8 or 16
+    b             loop_16               @ looping if height = 8 or 16
 
 
 loop_8:
 
-    @// Processing row0 and row1
+    @ Processing row0 and row1
     vld1.u32      d0, [r0], r2          @ Vector load from src[0_0]
     vld1.u32      d1, [r0], r2          @ Vector load from src[1_0]
     vld1.u32      d2, [r0], r2          @ Vector load from src[2_0]
@@ -270,7 +268,7 @@ loop_8:
     b             loop_8                @looping if height == 8 or 16
 
 loop_4:
-@// Processing row0 and row1
+@ Processing row0 and row1
 
     vld1.u32      d0[0], [r0], r2       @ Vector load from src[0_0]
     vld1.u32      d1[0], [r0], r2       @ Vector load from src[1_0]
diff --git a/common/arm/ih264_intra_pred_chroma_a9q.s b/common/arm/ih264_intra_pred_chroma_a9q.s
index d03fc55..ccd5c0d 100644
--- a/common/arm/ih264_intra_pred_chroma_a9q.s
+++ b/common/arm/ih264_intra_pred_chroma_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_intra_pred_chroma_a9q.s
@@ -39,15 +39,11 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
+@* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
 @
 
-@/**
-@/**
-@/**
-@
 .text
 .p2align 2
 
@@ -60,7 +56,7 @@ scratch_chroma_intrapred_addr1:
 
 scratch_intrapred_chroma_plane_addr1:
     .long ih264_gai1_intrapred_chroma_plane_coeffs2 - scrlblc2 - 8
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_chroma_8x8_mode_dc
@@ -91,7 +87,7 @@ scratch_intrapred_chroma_plane_addr1:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
 @                                        UWORD8 *pu1_dst,
 @                                        WORD32 src_strd,
@@ -105,8 +101,6 @@ scratch_intrapred_chroma_plane_addr1:
 @   r3 =>  dst_strd
 @   r4 =>  ui_neighboravailability
 
-
-
     .global ih264_intra_pred_chroma_8x8_mode_dc_a9q
 
 ih264_intra_pred_chroma_8x8_mode_dc_a9q:
@@ -191,10 +185,10 @@ str_pred:
 
 
 
-@/******************************************************************************
+@******************************************************************************
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_chroma_8x8_mode_horz
@@ -226,7 +220,7 @@ str_pred:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
 @                                         UWORD8 *pu1_dst,
 @                                         WORD32 src_strd,
@@ -270,7 +264,7 @@ loop_8x8_horz:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_chroma_8x8_mode_vert
@@ -339,10 +333,10 @@ ih264_intra_pred_chroma_8x8_mode_vert_a9q:
 
 
 
-@/******************************************************************************
+@******************************************************************************
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_chroma_8x8_mode_plane
@@ -373,7 +367,7 @@ ih264_intra_pred_chroma_8x8_mode_vert_a9q:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
 @                                        UWORD8 *pu1_dst,
 @                                        WORD32 src_strd,
@@ -393,7 +387,6 @@ ih264_intra_pred_chroma_8x8_mode_plane_a9q:
     stmfd         sp!, {r4-r10, r12, lr}
     vpush         {d8-d15}
 
-
     vld1.32       d0, [r0]
     add           r10, r0, #10
     vld1.32       d1, [r10]
@@ -542,7 +535,6 @@ scrlblc2:
 
 end_func_plane:
 
-
     vpop          {d8-d15}
     ldmfd         sp!, {r4-r10, r12, pc}
 
diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
index e38e203..0dd82f3 100644
--- a/common/arm/ih264_intra_pred_luma_16x16_a9q.s
+++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_intra_pred_luma_16x16_a9q.s
@@ -39,14 +39,14 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 
-@/* All the functions here are replicated from ih264_intra_pred_filters.c
+@* All the functions here are replicated from ih264_intra_pred_filters.c
 @
 
-@/**
-@/**
-@/**
+@**
+@**
+@**
 @
 
 .text
@@ -57,10 +57,10 @@
 .hidden ih264_gai1_intrapred_luma_plane_coeffs
 scratch_intrapred_addr1:
     .long ih264_gai1_intrapred_luma_plane_coeffs - scrlbl1 - 8
-@/**
+@**
 @*******************************************************************************
 @*
-@*ih264_intra_pred_luma_16x16_mode_vert_a9q
+@*ih264_intra_pred_luma_16x16_mode_vert
 @*
 @* @brief
 @*   Perform Intra prediction for  luma_16x16 mode:vertical
@@ -135,13 +135,13 @@ ih264_intra_pred_luma_16x16_mode_vert_a9q:
 
 
 
-@/******************************************************************************
+@******************************************************************************
 
 
-@/**
+@**
 @*******************************************************************************
 @*
-@*ih264_intra_pred_luma_16x16_mode_horz_a9q
+@*ih264_intra_pred_luma_16x16_mode_horz
 @*
 @* @brief
 @*  Perform Intra prediction for  luma_16x16 mode:horizontal
@@ -170,7 +170,7 @@ ih264_intra_pred_luma_16x16_mode_vert_a9q:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
 @                                         UWORD8 *pu1_dst,
 @                                         WORD32 src_strd,
@@ -213,13 +213,13 @@ loop_16x16_horz:
 
 
 
-@/******************************************************************************
+@******************************************************************************
 
 
-@/**
+@**
 @*******************************************************************************
 @*
-@*ih264_intra_pred_luma_16x16_mode_dc_a9q
+@*ih264_intra_pred_luma_16x16_mode_dc
 @*
 @* @brief
 @*  Perform Intra prediction for  luma_16x16 mode:DC
@@ -247,7 +247,7 @@ loop_16x16_horz:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
 @                                       UWORD8 *pu1_dst,
 @                                       WORD32 src_strd,
@@ -300,7 +300,7 @@ top_available:                          @ONLY TOP AVAILABLE
     vdup.u8       q0, d0[0]
     b             str_pred
 
-left_available: @ONLY LEFT AVAILABLE
+left_available:                         @ONLY LEFT AVAILABLE
     vld1.u8       {q0}, [r0]
     vpaddl.u8     q0, q0
     vadd.u16      d0, d0, d1
@@ -337,13 +337,13 @@ str_pred:
 
 
 
-@/******************************************************************************
+@******************************************************************************
 
 
-@/**
+@**
 @*******************************************************************************
 @*
-@*ih264_intra_pred_luma_16x16_mode_plane_a9q
+@*ih264_intra_pred_luma_16x16_mode_plane
 @*
 @* @brief
 @*  Perform Intra prediction for  luma_16x16 mode:PLANE
@@ -371,7 +371,7 @@ str_pred:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
 @                                        UWORD8 *pu1_dst,
 @                                        WORD32 src_strd,
diff --git a/common/arm/ih264_intra_pred_luma_4x4_a9q.s b/common/arm/ih264_intra_pred_luma_4x4_a9q.s
index cb386ea..5cc7e23 100644
--- a/common/arm/ih264_intra_pred_luma_4x4_a9q.s
+++ b/common/arm/ih264_intra_pred_luma_4x4_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_intra_pred_luma_4x4_a9q.s
@@ -44,21 +44,16 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
-
-@/* All the functions here are replicated from ih264_intra_pred_filters.c
-@
+@*
 
-@/**
-@/**
-@/**
+@* All the functions here are replicated from ih264_intra_pred_filters.c
 @
 
 .text
 .p2align 2
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_4x4_mode_vert
@@ -128,10 +123,10 @@ ih264_intra_pred_luma_4x4_mode_vert_a9q:
 
 
 
-@/******************************************************************************
+@******************************************************************************
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_4x4_mode_horz
@@ -163,7 +158,7 @@ ih264_intra_pred_luma_4x4_mode_vert_a9q:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src,
 @                                         UWORD8 *pu1_dst,
 @                                         WORD32 src_strd,
@@ -210,10 +205,10 @@ ih264_intra_pred_luma_4x4_mode_horz_a9q:
 
 
 
-@/******************************************************************************
+@******************************************************************************
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_4x4_mode_dc
@@ -244,7 +239,7 @@ ih264_intra_pred_luma_4x4_mode_horz_a9q:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src,
 @                                       UWORD8 *pu1_dst,
 @                                       WORD32 src_strd,
@@ -352,7 +347,7 @@ end_func:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_4x4_mode_diag_dl
@@ -383,7 +378,7 @@ end_func:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src,
 @                                            UWORD8 *pu1_dst,
 @                                            WORD32 src_strd,
@@ -434,7 +429,7 @@ end_func_diag_dl:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_4x4_mode_diag_dr
@@ -465,7 +460,7 @@ end_func_diag_dl:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src,
 @                                            UWORD8 *pu1_dst,
 @                                            WORD32 src_strd,
@@ -514,7 +509,7 @@ end_func_diag_dr:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_4x4_mode_vert_r
@@ -545,7 +540,7 @@ end_func_diag_dr:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src,
 @                                            UWORD8 *pu1_dst,
 @                                            WORD32 src_strd,
@@ -596,7 +591,7 @@ end_func_vert_r:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_4x4_mode_horz_d
@@ -627,7 +622,7 @@ end_func_vert_r:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src,
 @                                            UWORD8 *pu1_dst,
 @                                            WORD32 src_strd,
@@ -659,7 +654,7 @@ ih264_intra_pred_luma_4x4_mode_horz_d_a9q:
     vqrshrun.s16  d5, q12, #2
     sub           r5, r3, #2
     vmov.8        d6, d5
-    vtrn.8        d4, d5 @
+    vtrn.8        d4, d5                @
     vst1.u16      {d5[1]}, [r1]!
     vst1.16       {d6[2]}, [r1], r5
     vst1.u16      {d4[1]}, [r1]!
@@ -678,7 +673,7 @@ end_func_horz_d:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_4x4_mode_vert_l
@@ -709,7 +704,7 @@ end_func_horz_d:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src,
 @                                            UWORD8 *pu1_dst,
 @                                            WORD32 src_strd,
@@ -759,7 +754,7 @@ end_func_vert_l:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_4x4_mode_horz_u
@@ -790,7 +785,7 @@ end_func_vert_l:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src,
 @                                           UWORD8 *pu1_dst,
 @                                           WORD32 src_strd,
@@ -825,9 +820,9 @@ ih264_intra_pred_luma_4x4_mode_horz_u_a9q:
     vext.8        d6, d5, d4, #1
     vst1.8        {d4[2]}, [r1]!
     vst1.8        {d6[0]}, [r1]!
-    vtrn.8        d6, d5 @
+    vtrn.8        d6, d5                @
     sub           r5, r3, #2
-    vtrn.8        d4, d6 @
+    vtrn.8        d4, d6                @
     vdup.8        d7, r9
     vst1.16       {d6[0]}, [r1], r5
     vst1.16       {d6[0]}, [r1]!
diff --git a/common/arm/ih264_intra_pred_luma_8x8_a9q.s b/common/arm/ih264_intra_pred_luma_8x8_a9q.s
index 6da1c95..352d29d 100644
--- a/common/arm/ih264_intra_pred_luma_8x8_a9q.s
+++ b/common/arm/ih264_intra_pred_luma_8x8_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_intra_pred_luma_8x8_a9q.s
@@ -45,17 +45,11 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
-
-@/* All the functions here are replicated from ih264_intra_pred_filters.c
-@
+@*
 
-@/**
-@/**
-@/**
+@* All the functions here are replicated from ih264_intra_pred_filters.c
 @
 
-
 .text
 .p2align 2
 
@@ -64,7 +58,7 @@
 scratch_intrapred_addr_8x8:
     .long ih264_gai1_intrapred_luma_8x8_horz_u -  scrlb8x8l2 - 8
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_8x8_mode_ref_filtering
@@ -95,7 +89,7 @@ scratch_intrapred_addr_8x8:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_8x8_mode_ref_filtering(UWORD8 *pu1_src,
 @                                                 UWORD8 *pu1_dst)
 
@@ -111,7 +105,6 @@ ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q:
     stmfd         sp!, {r4-r12, r14}    @store register values to stack
     vpush         {d8-d15}
 
-
     vld1.u8       {q0}, [r0]!           @
     vld1.u8       {q1}, [r0]
     add           r0, r0, #8            @
@@ -141,6 +134,7 @@ ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q:
 
 
 end_func_ref_filt:
+
     vpop          {d8-d15}
     ldmfd         sp!, {r4-r12, pc}     @Restoring registers from stack
 
@@ -149,7 +143,7 @@ end_func_ref_filt:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_8x8_mode_vert
@@ -219,10 +213,10 @@ ih264_intra_pred_luma_8x8_mode_vert_a9q:
 
 
 
-@/******************************************************************************
+@******************************************************************************
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_8x8_mode_horz
@@ -254,7 +248,7 @@ ih264_intra_pred_luma_8x8_mode_vert_a9q:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src,
 @                                         UWORD8 *pu1_dst,
 @                                         WORD32 src_strd,
@@ -299,10 +293,10 @@ loop_8x8_horz:
 
 
 
-@/******************************************************************************
+@******************************************************************************
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_8x8_mode_dc
@@ -333,7 +327,7 @@ loop_8x8_horz:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src,
 @                                       UWORD8 *pu1_dst,
 @                                       WORD32 src_strd,
@@ -413,7 +407,7 @@ str_pred:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_8x8_mode_diag_dl
@@ -444,7 +438,7 @@ str_pred:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src,
 @                                            UWORD8 *pu1_dst,
 @                                            WORD32 src_strd,
@@ -506,7 +500,7 @@ end_func_diag_dl:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_8x8_mode_diag_dr
@@ -537,7 +531,7 @@ end_func_diag_dl:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src,
 @                                            UWORD8 *pu1_dst,
 @                                            WORD32 src_strd,
@@ -597,7 +591,7 @@ end_func_diag_dr:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_8x8_mode_vert_r
@@ -628,7 +622,7 @@ end_func_diag_dr:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src,
 @                                            UWORD8 *pu1_dst,
 @                                            WORD32 src_strd,
@@ -717,7 +711,7 @@ end_func_vert_r:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_8x8_mode_horz_d
@@ -748,7 +742,7 @@ end_func_vert_r:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src,
 @                                            UWORD8 *pu1_dst,
 @                                            WORD32 src_strd,
@@ -791,7 +785,7 @@ ih264_intra_pred_luma_8x8_mode_horz_d_a9q:
     vmov.8        q4, q2
     vmov.8        q5, q3
     sub           r6, r3, #6
-    vtrn.8        q4, q5 @
+    vtrn.8        q4, q5                @
     vmov.8        q6, q4
     vmov.8        q7, q5
     sub           r5, r3, #4
@@ -835,7 +829,7 @@ end_func_horz_d:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_8x8_mode_vert_l
@@ -866,7 +860,7 @@ end_func_horz_d:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src,
 @                                            UWORD8 *pu1_dst,
 @                                            WORD32 src_strd,
@@ -887,6 +881,7 @@ ih264_intra_pred_luma_8x8_mode_vert_l_a9q:
 
     stmfd         sp!, {r4-r12, r14}    @Restoring registers from stack
     vpush         {d8-d15}
+
     add           r0, r0, #9
     vld1.u8       {q0}, [r0]
     add           r0, r0, #1
@@ -935,7 +930,7 @@ end_func_vert_l:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @*ih264_intra_pred_luma_8x8_mode_horz_u
@@ -966,7 +961,7 @@ end_func_vert_l:
 @* @remarks
 @*  None
 @*
-@*******************************************************************************/
+@*******************************************************************************
 @void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src,
 @                                           UWORD8 *pu1_dst,
 @                                           WORD32 src_strd,
diff --git a/common/arm/ih264_iquant_itrans_recon_a9.s b/common/arm/ih264_iquant_itrans_recon_a9.s
index f71ca69..4e49f6a 100644
--- a/common/arm/ih264_iquant_itrans_recon_a9.s
+++ b/common/arm/ih264_iquant_itrans_recon_a9.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @ *******************************************************************************
 @ * @file
 @ *  ih264_iquant_itrans_recon_a9.s
@@ -38,8 +38,8 @@
 @ *  None
 @ *
 @ *******************************************************************************
-@*/
-@/**
+@*
+@**
 @ *******************************************************************************
 @ *
 @ * @brief
@@ -82,7 +82,7 @@
 @ *  None
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
 @                                   UWORD8 *pu1_pred,
 @                                   UWORD8 *pu1_out,
@@ -225,7 +225,7 @@ ih264_iquant_itrans_recon_4x4_a9:
     ldmfd         sp!, {r4-r12, r15}    @Reload the registers from SP
 
 
-    @/**
+@**
 @ *******************************************************************************
 @ *
 @ * @brief
@@ -268,7 +268,7 @@ ih264_iquant_itrans_recon_4x4_a9:
 @ *  None
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
 @                                   UWORD8 *pu1_pred,
 @                                   UWORD8 *pu1_out,
@@ -416,7 +416,7 @@ ih264_iquant_itrans_recon_chroma_4x4_a9:
     ldmfd         sp!, {r4-r12, r15}    @Reload the registers from SP
 
 
-@/*
+@*
 @ *******************************************************************************
 @ *
 @ * @brief
@@ -459,7 +459,7 @@ ih264_iquant_itrans_recon_chroma_4x4_a9:
 @ *  None
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
 @                                   UWORD8 *pu1_pred,
 @                                   UWORD8 *pu1_out,
diff --git a/common/arm/ih264_iquant_itrans_recon_dc_a9.s b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
index 8d71bdb..97c4724 100644
--- a/common/arm/ih264_iquant_itrans_recon_dc_a9.s
+++ b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @ *******************************************************************************
 @ * @file
 @ *  ih264_iquant_itrans_recon_dc_a9.s
@@ -37,8 +37,8 @@
 @ *  None
 @ *
 @ *******************************************************************************
-@*/
-@/**
+@*
+@**
 @ *******************************************************************************
 @ *
 @ * @brief
@@ -83,7 +83,7 @@
 @ *  None
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
 @                                   UWORD8 *pu1_pred,
 @                                   UWORD8 *pu1_out,
@@ -167,7 +167,7 @@ ih264_iquant_itrans_recon_4x4_dc_a9:
 
 
 
-@/*
+@*
 @ *******************************************************************************
 @ *
 @ * @brief
@@ -212,7 +212,7 @@ ih264_iquant_itrans_recon_4x4_dc_a9:
 @ *  None
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src,
 @                                   UWORD8 *pu1_pred,
 @                                   UWORD8 *pu1_out,
@@ -300,7 +300,7 @@ ih264_iquant_itrans_recon_8x8_dc_a9:
     ldmfd         sp!, {r4-r8, r15}
 
 
-@ /*
+@ *
 @ ********************************************************************************
 @ *
 @ * @brief This function reconstructs a 4x4 sub block from quantized resiude and
@@ -328,7 +328,7 @@ ih264_iquant_itrans_recon_8x8_dc_a9:
 @ * @remarks none
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @ void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
 @                                             UWORD8 *pu1_pred,
 @                                             UWORD8 *pu1_out,
@@ -368,6 +368,7 @@ ih264_iquant_itrans_recon_chroma_4x4_dc_a9:
 
     vmov.u16      q15, #0x00ff
 
+
     vld1.u8       d18, [r2], r0         @load out [8 bit size) -8 coeffs
     vaddw.u8      q1, q0, d2            @Add pred
     vld1.u8       d19, [r2], r0
diff --git a/common/arm/ih264_itrans_recon_a9.s b/common/arm/ih264_itrans_recon_a9.s
index 1d74da5..769d5d7 100644
--- a/common/arm/ih264_itrans_recon_a9.s
+++ b/common/arm/ih264_itrans_recon_a9.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @ *******************************************************************************
 @ * @file
 @ *  ih264_itrans_recon_neon_a9.s
@@ -33,8 +33,8 @@
 @ *  None
 @ *
 @ *******************************************************************************
-@*/
-@/**
+@*
+@**
 @ *******************************************************************************
 @ *
 @ * @brief
@@ -72,7 +72,7 @@
 @ *
 @ *
 @ *******************************************************************************
-@ */
+@ *
 @void ih264_itrans_recon_4x4(
 @       WORD16 *pi2_src,
 @       UWORD8 *pu1_pred,
diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s
index 2808897..39ad9b3 100644
--- a/common/arm/ih264_mem_fns_neon.s
+++ b/common/arm/ih264_mem_fns_neon.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @ *******************************************************************************
 @ * @file
 @ *  ih264_mem_fns_neon.s
@@ -40,9 +40,9 @@
 @ *  None
 @ *
 @ *******************************************************************************
-@*/
+@*
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -65,7 +65,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
 @                    UWORD8 *pu1_src,
 @                   UWORD8 num_bytes)
@@ -94,7 +94,7 @@ loop_neon_memcpy_mul_8:
 
 
 @*******************************************************************************
-@*/
+@*
 @void ih264_memcpy(UWORD8 *pu1_dst,
 @                  UWORD8 *pu1_src,
 @                  UWORD8 num_bytes)
@@ -143,6 +143,8 @@ loop_memcpy:
 
 
 
+
+
     .global ih264_memset_mul_8_a9q
 
 ih264_memset_mul_8_a9q:
@@ -208,6 +210,8 @@ loop_memset:
 
 
 
+
+
     .global ih264_memset_16bit_mul_8_a9q
 
 ih264_memset_16bit_mul_8_a9q:
diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s
index 9bab268..e7a1f91 100644
--- a/common/arm/ih264_padding_neon.s
+++ b/common/arm/ih264_padding_neon.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @ *******************************************************************************
 @ * @file
 @ *  ih264_padding_neon.s
@@ -39,10 +39,10 @@
 @ *  None
 @ *
 @ *******************************************************************************
-@*/
+@*
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief pad at the top of a 2d array
@@ -67,7 +67,7 @@
 @* @remarks none
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_pad_top(UWORD8 *pu1_src,
 @                   WORD32 src_strd,
 @                   WORD32 wd,
@@ -110,7 +110,7 @@ loop_neon_pad_top:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -147,7 +147,7 @@ loop_neon_pad_top:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @#if PAD_LEFT_LUMA == C
 @void ih264_pad_left_luma(UWORD8 *pu1_src,
 @                        WORD32 src_strd,
@@ -160,6 +160,7 @@ loop_neon_pad_top:
 @   r3 => pad_size
 
 
+
     .global ih264_pad_left_luma_a9q
 
 ih264_pad_left_luma_a9q:
@@ -245,7 +246,7 @@ end_func:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -282,7 +283,7 @@ end_func:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @#if PAD_LEFT_CHROMA == C
 @void ih264_pad_left_chroma(UWORD8 *pu1_src,
 @                            WORD32 src_strd,
@@ -373,7 +374,7 @@ end_func_l_c:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -410,7 +411,7 @@ end_func_l_c:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @#if PAD_RIGHT_LUMA == C
 @void ih264_pad_right_luma(UWORD8 *pu1_src,
 @                        WORD32 src_strd,
@@ -519,7 +520,7 @@ end_func_r:
 
 
 
-@/**
+@**
 @*******************************************************************************
 @*
 @* @brief
@@ -556,7 +557,7 @@ end_func_r:
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @#if PAD_RIGHT_CHROMA == C
 @void ih264_pad_right_chroma(UWORD8 *pu1_src,
 @                        WORD32 src_strd,
diff --git a/common/arm/ih264_resi_trans_a9.s b/common/arm/ih264_resi_trans_a9.s
deleted file mode 100644
index 08821f5..0000000
--- a/common/arm/ih264_resi_trans_a9.s
+++ /dev/null
@@ -1,604 +0,0 @@
-@/******************************************************************************
-@ *
-@ * Copyright (C) 2015 The Android Open Source Project
-@ *
-@ * Licensed under the Apache License, Version 2.0 (the "License");
-@ * you may not use this file except in compliance with the License.
-@ * You may obtain a copy of the License at:
-@ *
-@ * http://www.apache.org/licenses/LICENSE-2.0
-@ *
-@ * Unless required by applicable law or agreed to in writing, software
-@ * distributed under the License is distributed on an "AS IS" BASIS,
-@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ * See the License for the specific language governing permissions and
-@ * limitations under the License.
-@ *
-@ *****************************************************************************
-@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
-@*/
-@/**
-@*******************************************************************************
-@* @file
-@*  ih264_resi_trans_a9.s
-@*
-@* @brief
-@*  Contains function definitions for residual and forward trans
-@*
-@* @author
-@*  Ittiam
-@*
-@* @par List of Functions:
-@*  ih264_resi_trans_4x4_a9
-@*  ih264_resi_trans_8x8_a9
-@* @remarks
-@*  None
-@*
-@*******************************************************************************
-
-
-.text
-.p2align 2
-@*****************************************************************************
-@*
-@* Function Name     : ih264_resi_trans_4x4_a9
-@* Description       : This function does cf4 of H264 followed by and approximate scaling
-@*
-@* Arguments         :
-@                       R0 :pointer to src buffer
-@                       R1 :pointer to pred buffer
-@                       R2 :pointer to dst buffer
-@                       R3 :src_stride
-@                       STACk :pred_stride,dst_stride
-
-@* Values Returned   : NONE
-@*
-@* Register Usage    :
-@* Stack Usage       :
-@* Cycles            : Around
-@* Interruptiaility  : Interruptable
-@*
-@* Known Limitations
-@*   \Assumptions    :
-@*
-@* Revision History  :
-@*         DD MM YYYY    Author(s)   Changes
-@*         30 12 2009    100633      First version
-@*
-@*****************************************************************************
-
-
-    .global ih264_resi_trans_4x4_a9
-    .extern g_scal_coff_h264_4x4
-g_scal_coff_h264_4x4_addr:
-    .long g_scal_coff_h264_4x4 - 4x4lbl - 8
-
-ih264_resi_trans_4x4_a9:
-
-    @R0 :pointer to src buffer
-    @R1 :pointer to pred buffer
-    @R2 :pointer to dst buffer
-    @R3 :src_stride
-    @STACk :pred_stride,dst_stride
-
-    push          {r4-r12, lr}          @push all the variables first
-
-    mov           r6, sp
-    add           r6, r6, #40           @decrement stack pointer,to accomodate two variables
-    ldmfd         r6, {r4-r5}           @load the strides into registers
-                                        @R4 pred_stride
-                                        @R5 dst_stride
-
-
-    @we have to give the stride as post inrement in VLDR1
-    @but since thr stride is from end of row 1 to start of row 2,
-    @we need to add the size of the curent row to strides ie we need to add 4 to it (4 bytes)
-    @ADD R3,#4
-    @ADD R4,#4
-    @ADD R5,#4
-    @in case of dst the stride represnts 16 bit ie 2*8bits
-    @hence we need to add #4 to it and thenm multiply by 2
-    @--------------------function loading done------------------------
-
-    @lets find residual
-    @data is like 1a -> d0[1:31]  d0[32:64]
-    @                    a b c d   # # # #
-    vld1.u8       d30, [r0], r3         @load 4 pixels of row1 current buffer
-    vld1.u8       d31, [r1], r4         @load 4 pixels of row1 pred buffer
-    @ data is like 1a -> q4[1:63]  q4[64:148]
-    @                    d8[1:63]  d9[1:63]
-    @                    a b c d   # # # #
-
-    vld1.u8       d28, [r0], r3         @load row 2 of src to d28[0]
-    vld1.u8       d29, [r1], r4         @load row2 of pred to d29[0]
-
-    vld1.u8       d26, [r0], r3         @load row 3 of src to d26[0]
-    vsubl.u8      q0, d30, d31          @curr - pred for row one
-
-    vld1.u8       d27, [r1], r4         @load row 3of pred t0 d27[0]
-    vsubl.u8      q1, d28, d29          @find row 2 of src -pred to d0
-
-    vld1.u8       d24, [r0], r3         @load row 4 of src to d24[0]
-
-    vld1.u8       d25, [r1], r4         @load row 4 of src tp d25[0]
-    vsubl.u8      q2, d26, d27          @load src-pred row 3 to d[2]
-
-    lsl           r5, r5, #2            @ multiply dst stride by since we are storing 32 bit values
-    ldr           r6, g_scal_coff_h264_4x4_addr
-4x4lbl:
-    add           r6, r6, pc            @  load the address of global array
-
-    vsubl.u8      q3, d24, d25          @load row 4 of src - pred to q6
-
-    @after this
-    @D0  -> 1a
-    @D2 -> 2a
-    @D4 -> 3a
-    @D6 -> 4a
-
-    @transpose the matrix so that we can do the horizontal transform first
-    @#1 #2  #3  #4
-    @a  b   c   d       ---- D0
-    @e  f   g   h       -----D2
-    @i  j   k   l       -----D4
-    @m  n   o   p       -----D6
-    @transpose the inner 2x2 blocks
-    vtrn.16       d0, d2
-    vld1.s16      {q10}, [r6]!          @   load the scaling values 0-7;
-    vtrn.16       d4, d6
-    @a  e   c   g
-    @b  f   d   h
-    @i  m   k   o
-    @j  n   l   p
-    vtrn.32       d0, d4
-    vtrn.32       d2, d6
-    @a  e   i   m  #1  -- D0 --- x4
-    @b  f   j   n  #2  -- D2 --- x5
-    @c  g   k   o  #3  -- D4 ----x6
-    @d  h   l   p  #4  -- D6 ----x7
-
-    @we have loaded the residuals into the registers , now we need to add and subtract them
-    @let us do the horiz transform first
-
-    vsub.s16      d5, d2, d4            @x2 = x5-x6
-    vsub.s16      d7, d0, d6            @x3 = x4-x7;
-
-    vadd.s16      d3, d2, d4            @x1 = x5+x6
-    vadd.s16      d1, d0, d6            @x0 = x4+x7
-
-
-    vshl.s16      d31, d7, #1           @
-    vshl.s16      d30, d5, #1           @
-
-    vadd.s16      d0, d1, d3            @x0 + x1;
-    vsub.s16      d4, d1, d3            @x0 - x1;
-
-    vadd.s16      d2, d31, d5           @U_SHIFT(x3,1,shft) + x2;
-    vsub.s16      d6, d7, d30           @x3 - U_SHIFT(x2,1,shft);
-
-    @taking transform again so as to make do vert transform
-    vtrn.16       d0, d2
-    vtrn.16       d4, d6
-
-    vtrn.32       d0, d4
-    vtrn.32       d2, d6
-
-    @let us do vertical transform
-    @same code as horiz
-
-    vadd.s16      d1, d0, d6            @x0 = x4+x7
-    vadd.s16      d3, d2, d4            @x1 = x5+x6
-    vsub.s16      d7, d0, d6            @x3 = x4-x7;
-    vsub.s16      d5, d2, d4            @x2 = x5-x6
-
-
-@Since we are going to do scal / quant or whatever, we are going to divide by
-@a 32 bit number. So we have to expand the values
-
-    @VADDL.S16 Q12,D1,D3;x0 + x1
-    @VSUBL.S16 Q14,D1,D3;x0 - x1
-
-    @VSHL.S16  D8,D5,#1;
-    @VSHL.S16  D9,D7,#1;
-
-    @VADDL.S16 Q13,D9,D5 ; + x2
-    @VSUBL.S16 Q15,D7,D8 ;x3 - U_SHIFT(x2,1,shft)
-
-@scaling follows
-
-@now we need to do the scaling,so load the scaling matrix
-@mutliplying by the scaling coeffient; store the results from q5-q8 ;
-
-    vadd.s16      d24, d3, d1           @x4 = x0 + x1
-    vsub.s16      d28, d1, d3           @x6 = x0 - x1
-
-    vshl.s16      d0, d7, #1            @ U_SHIFT(x3,1,shft)
-    vmull.s16     q4, d24, d20          @x4*s0
-
-    vshl.s16      d2, d5, #1            @ U_SHIFT(x2,1,shft)
-
-    vadd.s16      d26, d0, d5           @x5 = U_SHIFT(x3,1,shft) + x2
-    vmull.s16     q5, d26, d21          @x5*s1
-
-    vst1.s32      {q4}, [r2], r5        @save 4 pixels of row1 current buffer and increment pointer by stride
-
-    vld1.s16      {q10}, [r6]           @load 8-16 scaling coeffcients
-
-    vsub.s16      d30, d7, d2           @x7 = x3 - U_SHIFT(x2,1,shft)
-
-    vmull.s16     q6, d28, d20          @x6*s2
-    vst1.s32      {q5}, [r2], r5
-
-    vmull.s16     q7, d30, d21          @x7*s3
-
-
-    vst1.s32      {q6}, [r2], r5
-    vst1.s32      {q7}, [r2]
-
-    pop           {r4-r12, pc}          @pop back all variables
-
-
-
-
-@*****************************************************************************
-@* Function Name     : ih264_resi_trans_8x8_a9
-@* Description       : This function does cf8 followd by an approximate normalization of H264
-@*
-@* Arguments         :
-@*                      R0 :pointer to src buffer
-@                       R1 :pointer to pred buffer
-@                       R2 :pointer to dst buffer
-@                       R3 :src_stride
-@                       STACk :pred_stride,dst_st
-@*
-@*
-@* Values Returned   : NONE
-@*
-@* Register Usage    :
-@* Stack Usage       :
-@* Cycles            : Around
-@* Interruptiaility  : Interruptable
-@*
-@* Known Limitations
-@*   \Assumptions    :
-@*
-@* Revision History  :
-@*         DD MM YYYY    Author(s)   Changes
-@*         30 12 2009    100633      First version
-@*
-@*****************************************************************************
-
-
-    .global ih264_resi_trans_8x8_a9
-    .extern g_scal_coff_h264_8x8
-g_scal_coff_h264_8x8_addr:
-    .long g_scal_coff_h264_8x8 - 8x8lbl - 8
-
-
-ih264_resi_trans_8x8_a9:
-
-    @R0 :pointer to src buffer
-    @R1 :pointer to pred buffer
-    @R2 :pointer to dst buffer
-    @R3 :src_stride
-    @STACk :pred_stride,dst_stride
-
-    push          {r4-r12, lr}          @push all the variables first
-
-    mov           r6, sp
-    add           r6, r6, #40           @decrement stack pointer,to accomodate two variables
-    ldmfd         r6, {r4-r5}           @load the strides into registers
-                                        @R4 pred_stride
-                                        @R5 dst_stride
-
-    @we have to give the stride as post inrement in vst1
-    @in case of dst the stride represnts 16 bit ie 2*8bits
-    @hence we need to add #4 to it and thenm multiply by 2
-    @--------------------function loading done------------------------
-
-    @lets find residual
-    @data is like 1a -> d0[1:31]  d0[32:64]
-    @                    a b c d   # # # #
-    vld1.u8       d30, [r0], r3         @load 4 pixels of row1 current buffer
-    vld1.u8       d31, [r1], r4         @load 4 pixels of row1 pred buffer
-
-    vld1.u8       d28, [r0], r3         @src  rw2
-    vld1.u8       d29, [r1], r4         @pred rw2
-    vsubl.u8      q0, d30, d31          @src-pred rw1
-
-    vld1.u8       d26, [r0], r3
-    vld1.u8       d27, [r1], r4
-    vsubl.u8      q1, d28, d29
-
-    vld1.u8       d24, [r0], r3
-    vld1.u8       d25, [r1], r4
-    vsubl.u8      q2, d26, d27
-
-    vld1.u8       d22, [r0], r3
-    vld1.u8       d23, [r1], r4
-    vsubl.u8      q3, d24, d25
-
-    vld1.u8       d20, [r0], r3
-    vld1.u8       d21, [r1], r4
-    vsubl.u8      q4, d22, d23
-
-    vld1.u8       d18, [r0], r3
-    vld1.u8       d19, [r1], r4
-    vsubl.u8      q5, d20, d21
-
-    vld1.u8       d16, [r0], r3
-    vld1.u8       d17, [r1], r4
-    vsubl.u8      q6, d18, d19
-
-    lsl           r5, r5, #2
-
-
-    vsubl.u8      q7, d16, d17
-
-    @after this
-    @Q0 -> 1a
-    @Q1 -> 2a
-    @Q2 -> 3a
-    @Q3 -> 4a
-    @Q4 -> 5a
-    @Q5 -> 6a
-    @Q6 -> 7a
-    @Q7 -> 8a
-
-    @transpose the matrix so that we can do the horizontal transform first
-
-    @transpose the inner 2x2 blocks
-    vtrn.16       q0, q1
-    vtrn.16       q2, q3
-    vtrn.16       q4, q5
-    vtrn.16       q6, q7
-
-    @transpose the inner 4x4 blocks
-    vtrn.32       q0, q2
-    vtrn.32       q1, q3
-
-    vtrn.32       q4, q6
-    vtrn.32       q5, q7
-
-    @transpose the outer 8x8 blocks
-    vswp          d1, d8
-    vswp          d7, d14
-    vswp          d3, d10
-    vswp          d5, d12
-    @transpose done
-
-@@this point we will have data in Q0-Q7
-@Q7 will be populated within 2 clock cycle
-@all others are availabe @ this clock cycle
-
-    @we have loaded the residuals into the registers , now we need to add and subtract them
-    @let us do the horiz transform first
-
-    vadd.s16      q8, q0, q7            @      a0 = r0 + r7;
-    vadd.s16      q9, q1, q6            @      a1 = r1 + r6;
-    vadd.s16      q10, q2, q5           @     a2 = r2 + r5;
-    vadd.s16      q11, q3, q4           @     a3 = r3 + r4;
-
-    vsub.s16      q12, q0, q7           @     b0 = r0 - r7;
-    vsub.s16      q13, q1, q6           @     b1 = r1 - r6;
-    vsub.s16      q15, q3, q4           @     b3 = r3 - r4;
-    vsub.s16      q14, q2, q5           @     b2 = r2 - r5;
-
-    vadd.s16      q1, q8, q11           @     a4 = a0 + a3;
-    vadd.s16      q3, q9, q10           @     a5 = a1 + a2;
-    vsub.s16      q7, q9, q10           @     a7 = a1 - a2;
-    vsub.s16      q5, q8, q11           @     a6 = a0 - a3;
-
-    ldr           r6, g_scal_coff_h264_8x8_addr
-8x8lbl:
-    add           r6, r6, pc            @  load the address of global array
-
-    vadd.s16      q0, q1, q3            @      pi2_res[0] = a4 + a5;
-    vshr.s16      q8, q7, #1            @      pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
-
-    vsub.s16      q4, q1, q3            @      pi2_res[4] = a4 - a5;
-
-    vadd.s16      q2, q5, q8            @
-
-
-    vshr.s16      q9, q5, #1            @      pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
-    vsub.s16      q6, q9, q7            @
-
-@do not change Q0,Q2.Q4,Q6 they contain results
-@Q1,Q3,Q5,Q7 TO STORE RESULTS
-@Q8 Q9 Q10 Q11 USE @WILL
-
-    vshr.s16      q1, q12, #1           @     D_SHIFT(b0,1,shft)
-    vshr.s16      q3, q13, #1           @     D_SHIFT(b1,1,shft)
-    vshr.s16      q5, q14, #1           @     D_SHIFT(b2,1,shft)
-    vshr.s16      q7, q15, #1           @     D_SHIFT(b3,1,shft)
-
-    vadd.s16      q8, q1, q12           @     (D_SHIFT(b0,1,shft) + b0);
-    vadd.s16      q9, q3, q13           @     (D_SHIFT(b1,1,shft) + b1);
-    vadd.s16      q10, q5, q14          @    (D_SHIFT(b2,1,shft) + b2);
-    vadd.s16      q11, q7, q15          @    (D_SHIFT(b3,1,shft) + b3);
-
-    vadd.s16      q1, q14, q8           @     b2 + (D_SHIFT(b0,1,shft) + b0);
-    vsub.s16      q5, q15, q9           @     b3 - (D_SHIFT(b1,1,shft) + b1);
-    vadd.s16      q3, q15, q10          @    b3 + (D_SHIFT(b2,1,shft) + b2);
-    vsub.s16      q7, q11, q14          @    -b2 + (D_SHIFT(b3,1,shft) + b3);
-
-    vadd.s16      q8, q13, q1           @     b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
-    vsub.s16      q9, q12, q3           @     b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
-    vadd.s16      q10, q12, q5          @    b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
-    vadd.s16      q11, q13, q7          @    b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
-
-    vshr.s16      q15, q8, #2           @     D_SHIFT(b4,2,shft)
-    vshr.s16      q14, q9, #2           @     D_SHIFT(b5,2,shft);
-    vshr.s16      q13, q10, #2          @    D_SHIFT(b6,2,shft);
-    vshr.s16      q12, q11, #2          @    D_SHIFT(b7,2,shft);
-
-
-    vadd.s16      q3, q9, q13           @     pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
-    vsub.s16      q5, q10, q14          @    pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
-    vadd.s16      q1, q8, q12           @     pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
-    vsub.s16      q7, q15, q11          @    pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
-
-    @------------horiz transform done-------------------------
-    @results are in Q0-Q7
-    @all other neon registes can be used at will
-
-@doing vertical transform
-@code exact copy of horiz transform above
-
-    @transpose the inner 2x2 blocks
-    vtrn.16       q0, q1
-    vtrn.16       q2, q3
-    vtrn.16       q4, q5
-    vtrn.16       q6, q7
-
-    @transpose the inner 4x4 blocks
-    vtrn.32       q0, q2
-    vtrn.32       q1, q3
-
-    vtrn.32       q4, q6
-    vtrn.32       q5, q7
-
-    @transpose the outer 8x8 blocks
-    vswp          d1, d8
-    vswp          d3, d10
-    vswp          d5, d12
-    vswp          d7, d14
-
-    @transpose done
-
-    vadd.s16      q8, q0, q7            @      a0 = r0 + r7;
-    vadd.s16      q9, q1, q6            @      a1 = r1 + r6;
-    vadd.s16      q10, q2, q5           @     a2 = r2 + r5;
-    vadd.s16      q11, q3, q4           @     a3 = r3 + r4;
-
-    vsub.s16      q12, q0, q7           @     b0 = r0 - r7;
-    vsub.s16      q13, q1, q6           @     b1 = r1 - r6;
-    vsub.s16      q14, q2, q5           @     b2 = r2 - r5;
-    vsub.s16      q15, q3, q4           @     b3 = r3 - r4;
-
-    vadd.s16      q1, q8, q11           @     a4 = a0 + a3;
-    vadd.s16      q3, q9, q10           @     a5 = a1 + a2;
-    vsub.s16      q5, q8, q11           @     a6 = a0 - a3;
-    vsub.s16      q7, q9, q10           @     a7 = a1 - a2;
-
-
-    vadd.s16      q0, q1, q3            @      pi2_res[0] = a4 + a5;
-
-    vshr.s16      q8, q7, #1            @      pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
-    @DSHIFT_TO_0 Q8,Q7,#1,#0
-    vadd.s16      q2, q5, q8            @
-
-    vsub.s16      q4, q1, q3            @      pi2_res[4] = a4 - a5;
-
-    vshr.s16      q9, q5, #1            @      pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
-    vsub.s16      q6, q9, q7            @
-
-@do not change Q0,Q2.Q4,Q6 they contain results
-@Q1,Q3,Q5,Q7 TO STORE RESULTS
-@Q8 Q9 Q10 Q11 USE @WILL
-
-    vshr.s16      q1, q12, #1           @     D_SHIFT(b0,1,shft)
-    vshr.s16      q3, q13, #1           @     D_SHIFT(b1,1,shft)
-    vshr.s16      q5, q14, #1           @     D_SHIFT(b2,1,shft)
-    vshr.s16      q7, q15, #1           @     D_SHIFT(b3,1,shft)
-
-
-    vadd.s16      q8, q1, q12           @     (D_SHIFT(b0,1,shft) + b0);
-    vadd.s16      q9, q3, q13           @     (D_SHIFT(b1,1,shft) + b1);
-    vadd.s16      q10, q5, q14          @    (D_SHIFT(b2,1,shft) + b2);
-    vadd.s16      q11, q7, q15          @    (D_SHIFT(b3,1,shft) + b3);
-
-    vadd.s16      q1, q14, q8           @     b2 + (D_SHIFT(b0,1,shft) + b0);
-    vadd.s16      q3, q15, q10          @    b3 + (D_SHIFT(b2,1,shft) + b2);
-    vsub.s16      q5, q15, q9           @     b3 - (D_SHIFT(b1,1,shft) + b1);
-    vsub.s16      q7, q11, q14          @    -b2 + (D_SHIFT(b3,1,shft) + b3);
-
-    vadd.s16      q8, q13, q1           @     b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
-    vsub.s16      q9, q12, q3           @     b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
-    vadd.s16      q10, q12, q5          @    b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
-    vadd.s16      q11, q13, q7          @    b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
-
-    vshr.s16      q15, q8, #2           @     D_SHIFT(b4,2,shft)
-    vshr.s16      q14, q9, #2           @     D_SHIFT(b5,2,shft);
-    vshr.s16      q13, q10, #2          @    D_SHIFT(b6,2,shft);
-    vshr.s16      q12, q11, #2          @    D_SHIFT(b7,2,shft);
-
-
-@since we are going to scal by small values, we need not expand the guys to 32 bit bit values
-    vsub.s16      q5, q10, q14          @    pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
-    vsub.s16      q7, q15, q11          @    pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
-    vadd.s16      q3, q9, q13           @     pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
-    vadd.s16      q1, q8, q12           @     pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
-
-    @------------vert transform done-------------------------
-    @results are in Q0-Q7
-    @all other neon registes can be used at will
-
-    @scaling
-    @since the 8x8 scaling matrix repeats in 1x4,1x4 block ,
-    @we need only load 4 values for each row and in total 4 rows
-    vld1.s16      {q14-q15}, [r6]       @
-
-    @since we need to get a 32 bit o/p for two 16 bit multiplications
-    @we need a VMULL instruction
-@-----------------------------first and second row
-
-    vmull.s16     q8, d0, d28           @scale the first row first 4 elem
-    vmull.s16     q9, d28, d1           @scale the second row last 4 elemts
-
-    vmull.s16     q10, d2, d29          @ scale second row first 4 elem
-    vmull.s16     q11, d29, d3          @scale the second row last 4 elem
-    vmull.s16     q12, d4, d30          @scale third row first  4 elem
-
-    vst1.s32      {q8, q9}, [r2], r5    @ write the first row complete
-
-    vmull.s16     q13, d30, d5          @scale the third row last 4 elem
-    vmull.s16     q8, d6, d31           @scale the fourth row first 4 elem
-
-
-    vst1.s32      {q10, q11}, [r2], r5  @store the second row complete
-
-@------------------------------- 3rd and 4th row
-
-    vmull.s16     q9, d31, d7           @scale the fourth row second column
-
-    vst1.s32      {q12, q13}, [r2], r5  @store the third row complete
-
-    vmull.s16     q10, d8, d28          @scale the 5th row fisrst 4 elms
-    vmull.s16     q11, d28, d9          @scale the 5th row second 4 elems
-
-    vmull.s16     q12, d10, d29         @scale the 6th row first4 elements
-
-
-    vst1.s32      {q8, q9}, [r2], r5    @store fifth row
-
-@--------------------------------5th and 6th row
-
-    vmull.s16     q13, d29, d11         @scale 6th row sendond 4 elems
-
-    vmull.s16     q8, d12, d30          @scale 7th rw first 4 elms
-
-    vst1.s32      {q10, q11}, [r2], r5  @store 6th row second 4 elements
-
-    vmull.s16     q9, d30, d13          @scale 7th rw second 4 elms
-    vmull.s16     q10, d14, d31         @scale 8th rw forst 4 elms
-
-
-    vst1.s32      {q12, q13}, [r2], r5  @store 6th row
-
-@----------------------------------7th and 8th row
-    vmull.s16     q11, d31, d15         @scale 8th row second 4 elms
-
-    vst1.s32      {q8, q9}, [r2], r5    @store 7th row
-    vst1.s32      {q10, q11}, [r2], r5  @store 8th row
-
-@----------------------------------done writing
-
-    pop           {r4-r12, pc}          @pop back all variables
-
-
-
-
-
-
diff --git a/common/arm/ih264_resi_trans_quant_a9.s b/common/arm/ih264_resi_trans_quant_a9.s
index caf362e..bb836bd 100644
--- a/common/arm/ih264_resi_trans_quant_a9.s
+++ b/common/arm/ih264_resi_trans_quant_a9.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @*******************************************************************************
 @* @file
 @*  ih264_resi_trans_quant_a9.s
diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s
index ccae779..33859e6 100644
--- a/common/arm/ih264_weighted_bi_pred_a9q.s
+++ b/common/arm/ih264_weighted_bi_pred_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_weighted_bi_pred_a9q.s
@@ -37,7 +37,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @*******************************************************************************
 @* @function
 @*  ih264_weighted_bi_pred_luma_a9q()
@@ -96,7 +96,7 @@
 @*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1,
 @                                     UWORD8 *pu1_src2,
 @                                     UWORD8 *pu1_dst,
@@ -411,7 +411,7 @@ end_loops:
 @*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1,
 @                                       UWORD8 *pu1_src2,
 @                                       UWORD8 *pu1_dst,
diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s
index 1ce94d0..81d26d4 100644
--- a/common/arm/ih264_weighted_pred_a9q.s
+++ b/common/arm/ih264_weighted_pred_a9q.s
@@ -17,7 +17,7 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 @******************************************************************************
 @* @file
 @*  ih264_weighted_pred_a9q.s
@@ -37,7 +37,7 @@
 @*  None
 @*
 @*******************************************************************************
-@*/
+@*
 @*******************************************************************************
 @* @function
 @*  ih264_weighted_pred_luma_a9q()
@@ -84,7 +84,7 @@
 @*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_weighted_pred_luma_a9q(UWORD8 *pu1_src,
 @                                  UWORD8 *pu1_dst,
 @                                  WORD32 src_strd,
@@ -314,7 +314,7 @@ end_loops:
 @*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
 @*
 @*******************************************************************************
-@*/
+@*
 @void ih264_weighted_pred_chroma_a9q(UWORD8 *pu1_src,
 @                                    UWORD8 *pu1_dst,
 @                                    WORD32 src_strd,
diff --git a/common/armv8/ih264_default_weighted_pred_av8.s b/common/armv8/ih264_default_weighted_pred_av8.s
index aefb902..6823015 100644
--- a/common/armv8/ih264_default_weighted_pred_av8.s
+++ b/common/armv8/ih264_default_weighted_pred_av8.s
@@ -24,7 +24,6 @@
 //*
 //* @brief
 //*  Contains function definitions for default weighted prediction.
-//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
 //*
 //* @author
 //*  Kaushik Senthoor R
diff --git a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
index 38934c9..9564f99 100644
--- a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
+++ b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
@@ -247,8 +247,8 @@ loop_16:                                //when  wd=16
     st1       {v30.2s, v31.2s}, [x1], x3 //  store row 6
     sqrshrun  v30.8b, v28.8h, #5        // dst[0_0] = CLIP_U8((temp +16) >> 5)
 
-    swp       v0.8b v4.8b
-    swp       v1.8b v5.8b
+    swp       v0.8b, v4.8b
+    swp       v1.8b, v5.8b
 
 
 
@@ -257,8 +257,8 @@ loop_16:                                //when  wd=16
     mov       v7.8b, v11.8b
     subs      x12, x14, #1              // if height==16  - looping
 
-    swp       v4.8b v8.8b
-    swp       v5.8b v9.8b
+    swp       v4.8b, v8.8b
+    swp       v5.8b, v9.8b
 
 
     sqrshrun  v31.8b, v20.8h, #5        // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
index ea7645e..202c516 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
@@ -68,7 +68,7 @@
 
 ih264_inter_pred_luma_horz_hpel_vert_hpel_av8:
 
-             //store register values to stack
+    //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
 
@@ -811,7 +811,7 @@ loop_4:
     bgt       loop_4
 
 end_func:
-           //Restoring registers from stack
+    //Restoring registers from stack
     ldp       x19, x20, [sp], #16
     pop_v_regs
     ret
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
index 3737e3f..38f971b 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
@@ -1111,7 +1111,7 @@ loop_4:
     bgt       loop_4
 
 end_func:
-           //Restoring registers from stack
+    //Restoring registers from stack
     ldp       x19, x20, [sp], #16
     pop_v_regs
     ret
diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s
index 62edfdc..2c5efb3 100644
--- a/common/armv8/ih264_intra_pred_chroma_av8.s
+++ b/common/armv8/ih264_intra_pred_chroma_av8.s
@@ -262,7 +262,7 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8:
 
 
 
-     push_v_regs
+    push_v_regs
     ld1       {v0.8h}, [x0]
 
     dup       v10.8h, v0.h[7]
diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s
index f7d0846..96ef50a 100644
--- a/common/armv8/ih264_weighted_bi_pred_av8.s
+++ b/common/armv8/ih264_weighted_bi_pred_av8.s
@@ -24,7 +24,6 @@
 //*
 //* @brief
 //*  Contains function definitions for weighted biprediction.
-//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
 //*
 //* @author
 //*  Kaushik Senthoor R
diff --git a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s
index 6a03875..ec5bb7a 100644
--- a/common/armv8/ih264_weighted_pred_av8.s
+++ b/common/armv8/ih264_weighted_pred_av8.s
@@ -24,7 +24,6 @@
 //*
 //* @brief
 //*  Contains function definitions for weighted prediction.
-//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
 //*
 //* @author
 //*  Kaushik Senthoor R
diff --git a/common/ih264_dpb_mgr.c b/common/ih264_dpb_mgr.c
index 8e087d3..9380b7e 100644
--- a/common/ih264_dpb_mgr.c
+++ b/common/ih264_dpb_mgr.c
@@ -536,7 +536,7 @@ WORD32 ih264_dpb_mgr_alternate_ref_fields(dpb_mgr_t *ps_dpb_mgr,
                             BOTTOM_FIELD:TOP_FIELD;
     }
 
-    if((reference_type == SHORT_TERM_REF))
+    if(reference_type == SHORT_TERM_REF)
     {
         ps_dpb_mgr->ps_dpb_short_term_head = ps_dpb_head->ps_prev_dpb;
     }
diff --git a/common/ithread.c b/common/ithread.c
index 4ffb98a..25a8cd0 100644
--- a/common/ithread.c
+++ b/common/ithread.c
@@ -327,6 +327,11 @@ WORD32 ithread_set_affinity(WORD32 core_id)
         return 1;
 }
 
+void ithread_set_name(CHAR *pc_thread_name) /* no-op stub: the requested name is ignored */
+{
+    (void)pc_thread_name; /* reference the argument so unused-parameter warnings stay quiet */
+}
+
 #else
 
 UWORD32 ithread_get_handle_size(void)
diff --git a/common/x86/ih264_deblk_luma_ssse3.c b/common/x86/ih264_deblk_luma_ssse3.c
index 440d5f0..e29bebb 100644
--- a/common/x86/ih264_deblk_luma_ssse3.c
+++ b/common/x86/ih264_deblk_luma_ssse3.c
@@ -856,7 +856,7 @@ void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
 {
     UWORD8 u1_Bs, u1_Bs1;
 
-    UWORD32 j = 0;
+    WORD32 j = 0;
 
     __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
     __m128i int1, int2, int3, int4, high1, high2;
diff --git a/common/x86/ih264_ihadamard_scaling_sse42.c b/common/x86/ih264_ihadamard_scaling_sse42.c
index 895291b..d68d105 100644
--- a/common/x86/ih264_ihadamard_scaling_sse42.c
+++ b/common/x86/ih264_ihadamard_scaling_sse42.c
@@ -86,14 +86,19 @@
  *
  *******************************************************************************
  */
-void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, WORD16* pi2_out,
-        const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
-        UWORD32 u4_qp_div_6, WORD32* pi4_tmp) {
+void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src,
+                                       WORD16* pi2_out,
+                                       const UWORD16 *pu2_iscal_mat,
+                                       const UWORD16 *pu2_weigh_mat,
+                                       UWORD32 u4_qp_div_6,
+                                       WORD32* pi4_tmp)
+{
     __m128i src_r0_r1, src_r2_r3;
     __m128i src_r0, src_r1, src_r2, src_r3;
     __m128i temp0, temp1, temp2, temp3;
     __m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6)));
     __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
+    UNUSED (pi4_tmp);
 
     src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
     src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
@@ -171,12 +176,15 @@ void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, WORD16* pi2_out,
     src_r3 = _mm_mullo_epi32(src_r3, mult_val);
 
     //Scaling
-    if (u4_qp_div_6 >= 6) {
+    if(u4_qp_div_6 >= 6)
+    {
         src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6);
         src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6);
         src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6);
         src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6);
-    } else {
+    }
+    else
+    {
         temp0 = _mm_add_epi32(src_r0, add_rshift);
         temp1 = _mm_add_epi32(src_r1, add_rshift);
         temp2 = _mm_add_epi32(src_r2, add_rshift);
@@ -194,16 +202,17 @@ void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, WORD16* pi2_out,
 }
 
 void ih264_ihadamard_scaling_2x2_uv_sse42(WORD16* pi2_src,
-                                    WORD16* pi2_out,
-                                    const UWORD16 *pu2_iscal_mat,
-                                    const UWORD16 *pu2_weigh_mat,
-                                    UWORD32 u4_qp_div_6,
-                                    WORD32* pi4_tmp)
+                                          WORD16* pi2_out,
+                                          const UWORD16 *pu2_iscal_mat,
+                                          const UWORD16 *pu2_weigh_mat,
+                                          UWORD32 u4_qp_div_6,
+                                          WORD32* pi4_tmp)
 {
-    UNUSED(pi4_tmp);
     __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
     __m128i zero_8x16b = _mm_setzero_si128();
     __m128i scale_val = _mm_set1_epi32((WORD32)(pu2_iscal_mat[0] * pu2_weigh_mat[0]));
+    UNUSED(pi4_tmp);
+
     src = _mm_loadu_si128((__m128i *) pi2_src);         //a0 a1 a2 a3 b0 b1 b2 b3
     sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
     plane_0 = _mm_unpacklo_epi16(src, sign_reg);        //a0 a1 a2 a3 -- 32 bits
diff --git a/common/x86/ih264_ihadamard_scaling_ssse3.c b/common/x86/ih264_ihadamard_scaling_ssse3.c
index 232d9fa..1b940ea 100644
--- a/common/x86/ih264_ihadamard_scaling_ssse3.c
+++ b/common/x86/ih264_ihadamard_scaling_ssse3.c
@@ -85,9 +85,13 @@
  *
  *******************************************************************************
  */
-void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out,
-        const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
-        UWORD32 u4_qp_div_6, WORD32* pi4_tmp) {
+void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src,
+                                       WORD16* pi2_out,
+                                       const UWORD16 *pu2_iscal_mat,
+                                       const UWORD16 *pu2_weigh_mat,
+                                       UWORD32 u4_qp_div_6,
+                                       WORD32* pi4_tmp)
+{
     int val = 0xFFFF;
     __m128i src_r0_r1, src_r2_r3, sign_reg, zero_8x16b = _mm_setzero_si128();
     __m128i src_r0, src_r1, src_r2, src_r3;
@@ -96,6 +100,8 @@ void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out,
     __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
 
     __m128i mask = _mm_set1_epi32(val);
+    UNUSED (pi4_tmp);
+
     mult_val = _mm_and_si128(mult_val, mask);
 
     src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
@@ -177,12 +183,15 @@ void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out,
     src_r3 = _mm_madd_epi16(src_r3, mult_val);
 
     //Scaling
-    if (u4_qp_div_6 >= 6) {
+    if(u4_qp_div_6 >= 6)
+    {
         src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6);
         src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6);
         src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6);
         src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6);
-    } else {
+    }
+    else
+    {
         temp0 = _mm_add_epi32(src_r0, add_rshift);
         temp1 = _mm_add_epi32(src_r1, add_rshift);
         temp2 = _mm_add_epi32(src_r2, add_rshift);
diff --git a/common/x86/ih264_inter_pred_filters_ssse3.c b/common/x86/ih264_inter_pred_filters_ssse3.c
index 64e364e..6d318c9 100644
--- a/common/x86/ih264_inter_pred_filters_ssse3.c
+++ b/common/x86/ih264_inter_pred_filters_ssse3.c
@@ -98,11 +98,10 @@ void ih264_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
 {
     __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
 
+    WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
     UNUSED(pu1_tmp);
     UNUSED(dydx);
 
-    WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
-
     src_strd2 = src_strd << 1;
     dst_strd2 = dst_strd << 1;
     src_strd4 = src_strd << 2;
@@ -1825,7 +1824,6 @@ void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
     WORD32 y_offset;
     UWORD8 *pu1_pred1;
 
-    UNUSED(pu1_tmp);
 
     __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
     __m128i src_r5_16x8b, src_r6_16x8b;
@@ -1835,6 +1833,7 @@ void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
     __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
     __m128i const_val16_8x16b;
 
+    UNUSED(pu1_tmp);
     y_offset = dydx & 0xf;
 
     coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
diff --git a/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
index d43c8e2..565cc75 100644
--- a/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
+++ b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
@@ -113,6 +113,8 @@ void ih264_iquant_itrans_recon_4x4_dc_ssse3(WORD16 *pi2_src,
     UWORD32 *pu4_out = (UWORD32 *)pu1_out;
     WORD32 q0 = pi2_src[0];
     WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
+    UNUSED (pi2_tmp);
+
     INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
 
     if (iq_start_idx != 0 )
@@ -233,6 +235,10 @@ void ih264_iquant_itrans_recon_8x8_dc_ssse3 (WORD16 *pi2_src,
 {
     WORD32 q0 = pi2_src[0];
     WORD16 i_macro, rnd_fact = (qp_div < 6) ? 1 << (5 - qp_div) : 0;
+    UNUSED (pi2_tmp);
+    UNUSED (iq_start_idx);
+    UNUSED (pi2_dc_ld_addr);
+
     INV_QUANT(q0, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);
     i_macro = ((q0 + 32) >> 6);
 
@@ -392,6 +398,12 @@ void ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3(WORD16 *pi2_src,
     __m128i chroma_mask = _mm_set1_epi16 (0xFF);
     __m128i value_add = _mm_set1_epi16(i_macro);
 
+    UNUSED (pi2_src);
+    UNUSED (pu2_iscal_mat);
+    UNUSED (pu2_weigh_mat);
+    UNUSED (u4_qp_div_6);
+    UNUSED (pi2_tmp);
+
     //Load pred buffer
     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
diff --git a/common/x86/ih264_iquant_itrans_recon_sse42.c b/common/x86/ih264_iquant_itrans_recon_sse42.c
index 2a4ea3f..6399b65 100644
--- a/common/x86/ih264_iquant_itrans_recon_sse42.c
+++ b/common/x86/ih264_iquant_itrans_recon_sse42.c
@@ -120,6 +120,7 @@ void ih264_iquant_itrans_recon_4x4_sse42(WORD16 *pi2_src,
     __m128i resq_r0, resq_r1, resq_r2, resq_r3;
     __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
     __m128i value_32 = _mm_set1_epi32(32);
+    UNUSED (pi2_tmp);
 
     /*************************************************************/
     /* Dequantization of coefficients. Will be replaced by SIMD  */
@@ -369,6 +370,8 @@ void ih264_iquant_itrans_recon_chroma_4x4_sse42(WORD16 *pi2_src,
     __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
     __m128i value_32 = _mm_set1_epi32(32);
     __m128i chroma_mask = _mm_set1_epi16 (0xFF);
+    UNUSED (pi2_tmp);
+
     /*************************************************************/
     /* Dequantization of coefficients. Will be replaced by SIMD  */
     /* operations on platform                                    */
diff --git a/common/x86/ih264_iquant_itrans_recon_ssse3.c b/common/x86/ih264_iquant_itrans_recon_ssse3.c
index ca1397e..388cafe 100644
--- a/common/x86/ih264_iquant_itrans_recon_ssse3.c
+++ b/common/x86/ih264_iquant_itrans_recon_ssse3.c
@@ -120,6 +120,8 @@ void ih264_iquant_itrans_recon_4x4_ssse3(WORD16 *pi2_src,
     __m128i resq_r0, resq_r1, resq_r2, resq_r3;
     __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
     __m128i value_32 = _mm_set1_epi32(32);
+    UNUSED (pi2_tmp);
+    UNUSED (pi2_dc_ld_addr);
 
     /*************************************************************/
     /* Dequantization of coefficients. Will be replaced by SIMD  */
@@ -397,6 +399,9 @@ void ih264_iquant_itrans_recon_8x8_ssse3(WORD16 *pi2_src,
     __m128i resq_r0_1, resq_r0_2, resq_r1_1, resq_r1_2, resq_r2_1, resq_r2_2,
             resq_r3_1, resq_r3_2, resq_r4_1, resq_r4_2, resq_r5_1, resq_r5_2,
             resq_r6_1, resq_r6_2, resq_r7_1, resq_r7_2;
+    UNUSED (pi2_tmp);
+    UNUSED (iq_start_idx);
+    UNUSED (pi2_dc_ld_addr);
 
     /*************************************************************/
     /* Dequantization of coefficients. Will be replaced by SIMD  */
diff --git a/common/x86/ih264_resi_trans_quant_sse42.c b/common/x86/ih264_resi_trans_quant_sse42.c
index c267651..eca43ed 100644
--- a/common/x86/ih264_resi_trans_quant_sse42.c
+++ b/common/x86/ih264_resi_trans_quant_sse42.c
@@ -121,6 +121,9 @@ void ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred,
     __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
     __m128i sign_reg0, sign_reg2;
     __m128i scalemat_r0_r1, scalemat_r2_r3;
+
+    UNUSED (pu2_threshold_matrix);
+
     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
     src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
@@ -394,6 +397,8 @@ void ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src,UWORD8 *pu1_pred,WO
     __m128i scalemat_r0_r1, scalemat_r2_r3;
     __m128i chroma_mask = _mm_set1_epi16 (0xFF);
 
+    UNUSED (pu2_threshold_matrix);
+
     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
     src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
@@ -676,6 +681,8 @@ void ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
     __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
     __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
 
+    UNUSED (pu2_threshold_matrix);
+
     src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
     src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
     sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
@@ -902,6 +909,8 @@ void ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
     __m128i temp_1 = _mm_set1_epi16(1);
     __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
 
+    UNUSED (pu2_threshold_matrix);
+
     src = _mm_loadu_si128((__m128i *)pi2_src);          //a0 a1 a2 a3 b0 b1 b2 b3
     sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
     plane_0 = _mm_unpacklo_epi16(src, sign_reg);        //a0 a1 a2 a3 -- 32 bits
diff --git a/decoder/ih264d_thread_parse_decode.c b/decoder/ih264d_thread_parse_decode.c
index be3cb01..1c9eb68 100644
--- a/decoder/ih264d_thread_parse_decode.c
+++ b/decoder/ih264d_thread_parse_decode.c
@@ -582,13 +582,9 @@ WORD32 ih264d_decode_slice_thread(dec_struct_t *ps_dec /* Decoder parameters */
 
 void ih264d_decode_picture_thread(dec_struct_t *ps_dec )
 {
-    volatile WORD32 i4_err_status;
-
 
     ithread_set_name("ih264d_decode_picture_thread");
 
-
-
     // run the loop till all slices are decoded
 
     while(1)
@@ -644,13 +640,6 @@ void ih264d_decode_picture_thread(dec_struct_t *ps_dec )
                 DEBUG_THREADS_PRINTF("Waiting for next slice or end of frame\n");
 
                 NOP(32);
-                if(i4_err_status != 0)
-                {
-                    /*In the case of error set decode Mb number ,so that the
-                     parse thread does not wait because of mb difference being
-                     greated the 32*/
-                    ps_dec->cur_dec_mb_num = ps_dec->u2_cur_mb_addr - 1;
-                }
             }
 
             DEBUG_THREADS_PRINTF("Got next slice/end of frame signal \n ");
diff --git a/encoder/arm/ime_distortion_metrics_a9q.s b/encoder/arm/ime_distortion_metrics_a9q.s
index b58911e..27fbe3d 100644
--- a/encoder/arm/ime_distortion_metrics_a9q.s
+++ b/encoder/arm/ime_distortion_metrics_a9q.s
@@ -17,9 +17,9 @@
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
-@/**
+@**
 
-@/**
+@**
 @******************************************************************************
 @*
 @*
@@ -48,7 +48,7 @@
 @
 
 
-@/**
+@**
 @******************************************************************************
 @*
 @* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
@@ -79,59 +79,62 @@
 @* @remarks
 @*
 @******************************************************************************
-@*/
+@*
 .text
 .p2align 2
+
     .global ime_compute_sad_16x16_fast_a9q
+
 ime_compute_sad_16x16_fast_a9q:
 
-    stmfd     sp!, {r12, lr}
-    lsl       r2, r2, #1
-    lsl       r3, r3, #1
+    stmfd         sp!, {r12, lr}
+    vpush         {d8-d15}              @ preserve d8-d15 for the caller (d8-d11 are overwritten below)
+    lsl           r2, r2, #1            @ r2 *= 2: post-increment step used with [r0]
+    lsl           r3, r3, #1            @ r3 *= 2: post-increment step used with [r1]
 
     @for bringing buffer2 into cache..., dummy load instructions
-    @ LDR      r12,[r1]
+    @LDR         r12,[r1]
 
-    vld1.8    {d4, d5}, [r0], r2
-    vld1.8    {d6, d7}, [r1], r3
-    mov       r12, #6
-    vld1.8    {d8, d9}, [r0], r2
-    vabdl.u8  q0, d6, d4
-    vabdl.u8  q1, d7, d5
-    vld1.8    {d10, d11}, [r1], r3
+    vld1.8        {d4, d5}, [r0], r2
+    vld1.8        {d6, d7}, [r1], r3
+    mov           r12, #6               @ loop counter: decremented by 2 per pass (3 passes)
+    vld1.8        {d8, d9}, [r0], r2
+    vabdl.u8      q0, d6, d4            @ q0/q1 start the 16-bit absolute-difference accumulators
+    vabdl.u8      q1, d7, d5
+    vld1.8        {d10, d11}, [r1], r3
 
 loop_sad_16x16_fast:
 
-    vld1.8    {d4, d5}, [r0], r2
-    vabal.u8  q0, d10, d8
-    vabal.u8  q1, d11, d9
-    vld1.8    {d6, d7}, [r1], r3
-    subs      r12, #2
-    vld1.8    {d8, d9}, [r0], r2
-    vabal.u8  q0, d6, d4
-    vabal.u8  q1, d7, d5
-    vld1.8    {d10, d11}, [r1], r3
-
-    bne       loop_sad_16x16_fast
+    vld1.8        {d4, d5}, [r0], r2
+    vabal.u8      q0, d10, d8
+    vabal.u8      q1, d11, d9
+    vld1.8        {d6, d7}, [r1], r3
+    subs          r12, #2
+    vld1.8        {d8, d9}, [r0], r2
+    vabal.u8      q0, d6, d4
+    vabal.u8      q1, d7, d5
+    vld1.8        {d10, d11}, [r1], r3
 
-    vabal.u8  q0, d10, d8
-    vabal.u8  q1, d11, d9
+    bne           loop_sad_16x16_fast
 
-    vadd.i16  q0, q0, q1
-    vadd.i16  d0, d1, d0
+    vabal.u8      q0, d10, d8           @ fold in the last pair loaded by the loop
+    vabal.u8      q1, d11, d9
 
-    ldr       r12, [sp, #12]
-    vpaddl.u16 d0, d0
-    vpaddl.u32 d0, d0
-    vshl.u32  d0, d0, #1
-    vst1.32   {d0[0]}, [r12]
+    vadd.i16      q0, q0, q1
+    vadd.i16      d0, d1, d0
+    vpop          {d8-d15}              @ restore d8-d15; sp is back at its post-stmfd value
+    ldr           r12, [sp, #12]        @ r12 = second stack word above the saved r12/lr pair (result address)
+    vpaddl.u16    d0, d0
+    vpaddl.u32    d0, d0
+    vshl.u32      d0, d0, #1            @ double the accumulated sum
+    vst1.32       {d0[0]}, [r12]        @ write the 32-bit result through r12
 
-    ldmfd     sp!, {r12, pc}
+    ldmfd         sp!, {r12, pc}
 
 
 
 
-@/**
+@**
 @******************************************************************************
 @*
 @*  @brief computes distortion (SAD) between 2 16x8  blocks
@@ -163,56 +166,57 @@ loop_sad_16x16_fast:
 @* @remarks
 @*
 @******************************************************************************
-@*/
+@*
 @
     .global ime_compute_sad_16x8_a9q
+
 ime_compute_sad_16x8_a9q:
 
-    stmfd     sp!, {r12, lr}
+    stmfd         sp!, {r12, lr}
 
     @for bringing buffer2 into cache..., dummy load instructions
     @LDR      r12,[r1]
 
-    vld1.8    {d4, d5}, [r0], r2
-    vld1.8    {d6, d7}, [r1], r3
-    mov       r12, #6
-    vld1.8    {d8, d9}, [r0], r2
-    vabdl.u8  q0, d6, d4
-    vabdl.u8  q1, d7, d5
-    vld1.8    {d10, d11}, [r1], r3
+    vld1.8        {d4, d5}, [r0], r2
+    vld1.8        {d6, d7}, [r1], r3
+    mov           r12, #6               @ loop counter: decremented by 2 per pass (3 passes)
+    vpush         {d8-d15}              @ preserve d8-d15 for the caller before d8-d11 are overwritten
+    vld1.8        {d8, d9}, [r0], r2
+    vabdl.u8      q0, d6, d4            @ q0/q1 start the 16-bit absolute-difference accumulators
+    vabdl.u8      q1, d7, d5
+    vld1.8        {d10, d11}, [r1], r3
 
 loop_sad_16x8:
 
-    vld1.8    {d4, d5}, [r0], r2
-    vabal.u8  q0, d10, d8
-    vabal.u8  q1, d11, d9
-    vld1.8    {d6, d7}, [r1], r3
-    subs      r12, #2
-    vld1.8    {d8, d9}, [r0], r2
-    vabal.u8  q0, d6, d4
-    vabal.u8  q1, d7, d5
-    vld1.8    {d10, d11}, [r1], r3
-
-    bne       loop_sad_16x8
-
-    vabal.u8  q0, d10, d8
-    vabal.u8  q1, d11, d9
+    vld1.8        {d4, d5}, [r0], r2
+    vabal.u8      q0, d10, d8
+    vabal.u8      q1, d11, d9
+    vld1.8        {d6, d7}, [r1], r3
+    subs          r12, #2
+    vld1.8        {d8, d9}, [r0], r2
+    vabal.u8      q0, d6, d4
+    vabal.u8      q1, d7, d5
+    vld1.8        {d10, d11}, [r1], r3
 
-    vadd.i16  q0, q0, q1
-    vadd.i16  d0, d1, d0
+    bne           loop_sad_16x8
 
-    ldr       r12, [sp, #12]
-    vpaddl.u16 d0, d0
-    vpaddl.u32 d0, d0
-    vst1.32   {d0[0]}, [r12]
+    vabal.u8      q0, d10, d8           @ fold in the last pair loaded by the loop
+    vabal.u8      q1, d11, d9
 
-    ldmfd     sp!, {r12, pc}
+    vadd.i16      q0, q0, q1
+    vadd.i16      d0, d1, d0
+    vpop          {d8-d15}              @ restore d8-d15; sp is back at its post-stmfd value
+    ldr           r12, [sp, #12]        @ r12 = second stack word above the saved r12/lr pair (result address)
+    vpaddl.u16    d0, d0
+    vpaddl.u32    d0, d0
 
+    vst1.32       {d0[0]}, [r12]        @ write the 32-bit result through r12
 
+    ldmfd         sp!, {r12, pc}
 
 
 
-@/**
+@**
 @******************************************************************************
 @*
 @* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
@@ -243,100 +247,103 @@ loop_sad_16x8:
 @* @remarks
 @*
 @******************************************************************************
-@*/
+@*
+
     .global ime_compute_sad_16x16_ea8_a9q
 
 ime_compute_sad_16x16_ea8_a9q:
 
-    stmfd     sp!, {r5-r7, lr}
-    lsl       r2, r2, #1
-    lsl       r3, r3, #1
+    stmfd         sp!, {r5-r7, lr}
+    lsl           r2, r2, #1            @ r2 *= 2: post-increment step used with [r0]
+    lsl           r3, r3, #1            @ r3 *= 2: post-increment step used with [r1]
 
     @for bringing buffer2 into cache..., dummy load instructions
     @LDR         r12,[r1]
 
-    vld1.8    {d4, d5}, [r0], r2
-    vld1.8    {d6, d7}, [r1], r3
-    mov       r5, #6
-    vld1.8    {d8, d9}, [r0], r2
-    vabdl.u8  q0, d6, d4
-    vabdl.u8  q1, d7, d5
-    vld1.8    {d10, d11}, [r1], r3
-    ldrd      r6, r7, [sp, #16]
+    vld1.8        {d4, d5}, [r0], r2
+    vld1.8        {d6, d7}, [r1], r3
+    mov           r5, #6                @ loop counter: decremented by 2 per pass (3 passes)
+    ldrd          r6, r7, [sp, #16]     @ read both stack args (above the 16 saved bytes) before vpush moves sp
+    vpush         {d8-d15}              @ preserve d8-d15 for the caller (d8-d13 are overwritten below)
+    vld1.8        {d8, d9}, [r0], r2
+    vabdl.u8      q0, d6, d4            @ q0/q1 start the 16-bit absolute-difference accumulators
+    vabdl.u8      q1, d7, d5
+    vld1.8        {d10, d11}, [r1], r3
+
     @r6 = i4_max_sad, r7 = pi4_mb_distortion
 
 loop_sad_16x16_ea8_1:
 
-    vld1.8    {d4, d5}, [r0], r2
-    vabal.u8  q0, d10, d8
-    vabal.u8  q1, d11, d9
-    vld1.8    {d6, d7}, [r1], r3
-    subs      r5, #2
-    vld1.8    {d8, d9}, [r0], r2
-    vabal.u8  q0, d6, d4
-    vabal.u8  q1, d7, d5
-    vld1.8    {d10, d11}, [r1], r3
-
-    bne       loop_sad_16x16_ea8_1
-
-    vabal.u8  q0, d10, d8
-    sub       r0, r0, r2, lsl #3
-    vabal.u8  q1, d11, d9
-    sub       r1, r1, r3, lsl #3
-
-    vadd.i16  q6, q0, q1
-    add       r0, r0, r2, asr #1
-    vadd.i16  d12, d12, d13
-    add       r1, r1, r3, asr #1
-
-    vpaddl.u16 d12, d12
-    vld1.8    {d4, d5}, [r0], r2
-    vld1.8    {d6, d7}, [r1], r3
-    vpaddl.u32 d12, d12
-    vld1.8    {d8, d9}, [r0], r2
-    vabal.u8  q0, d6, d4
-    vabal.u8  q1, d7, d5
-
-    vst1.32   {d12[0]}, [r7]
-    ldr       r5, [r7]
-    cmp       r5, r6
-    bgt       end_func_16x16_ea8
-
-    vld1.8    {d10, d11}, [r1], r3
-    mov       r5, #6
+    vld1.8        {d4, d5}, [r0], r2
+    vabal.u8      q0, d10, d8
+    vabal.u8      q1, d11, d9
+    vld1.8        {d6, d7}, [r1], r3
+    subs          r5, #2
+    vld1.8        {d8, d9}, [r0], r2
+    vabal.u8      q0, d6, d4
+    vabal.u8      q1, d7, d5
+    vld1.8        {d10, d11}, [r1], r3
+
+    bne           loop_sad_16x16_ea8_1
+
+    vabal.u8      q0, d10, d8
+    sub           r0, r0, r2, lsl #3    @ r0 -= 8 * r2: undo the eight post-increments of the first pass
+    vabal.u8      q1, d11, d9
+    sub           r1, r1, r3, lsl #3    @ r1 -= 8 * r3: same rewind for the second buffer
+
+    vadd.i16      q6, q0, q1            @ q6 = scratch copy for the partial sum; q0/q1 keep accumulating
+    add           r0, r0, r2, asr #1    @ r0 += r2 / 2: start the second pass half a step further on
+    vadd.i16      d12, d12, d13
+    add           r1, r1, r3, asr #1    @ r1 += r3 / 2
+
+    vpaddl.u16    d12, d12
+    vld1.8        {d4, d5}, [r0], r2
+    vld1.8        {d6, d7}, [r1], r3
+    vpaddl.u32    d12, d12
+    vld1.8        {d8, d9}, [r0], r2
+    vabal.u8      q0, d6, d4
+    vabal.u8      q1, d7, d5
+
+    vst1.32       {d12[0]}, [r7]        @ publish the first-pass sum through r7
+    ldr           r5, [r7]              @ read it back into a core register for the compare
+    cmp           r5, r6
+    bgt           end_func_16x16_ea8    @ early exit: first-pass sum already exceeds r6
+
+    vld1.8        {d10, d11}, [r1], r3
+    mov           r5, #6                @ reload the loop counter for the second pass
 
 loop_sad_16x16_ea8_2:
 
-    vld1.8    {d4, d5}, [r0], r2
-    vabal.u8  q0, d10, d8
-    vabal.u8  q1, d11, d9
-    vld1.8    {d6, d7}, [r1], r3
-    subs      r5, #2
-    vld1.8    {d8, d9}, [r0], r2
-    vabal.u8  q0, d6, d4
-    vabal.u8  q1, d7, d5
-    vld1.8    {d10, d11}, [r1], r3
+    vld1.8        {d4, d5}, [r0], r2
+    vabal.u8      q0, d10, d8
+    vabal.u8      q1, d11, d9
+    vld1.8        {d6, d7}, [r1], r3
+    subs          r5, #2
+    vld1.8        {d8, d9}, [r0], r2
+    vabal.u8      q0, d6, d4
+    vabal.u8      q1, d7, d5
+    vld1.8        {d10, d11}, [r1], r3
 
-    bne       loop_sad_16x16_ea8_2
+    bne           loop_sad_16x16_ea8_2
 
-    vabal.u8  q0, d10, d8
-    vabal.u8  q1, d11, d9
+    vabal.u8      q0, d10, d8           @ fold in the last pair loaded by the loop
+    vabal.u8      q1, d11, d9
 
-    vadd.i16  q0, q0, q1
-    vadd.i16  d0, d1, d0
+    vadd.i16      q0, q0, q1
+    vadd.i16      d0, d1, d0
 
-    vpaddl.u16 d0, d0
-    vpaddl.u32 d0, d0
+    vpaddl.u16    d0, d0
+    vpaddl.u32    d0, d0
 
-    vst1.32   {d0[0]}, [r7]
+    vst1.32       {d0[0]}, [r7]         @ overwrite the partial sum with the total of both passes
 
 end_func_16x16_ea8:
-
-    ldmfd     sp!, {r5-r7, pc}
+    vpop          {d8-d15}              @ both exit paths restore d8-d15 before the core registers
+    ldmfd         sp!, {r5-r7, pc}
 
 
 
-@/*
+@*
 @//---------------------------------------------------------------------------
 @// Function Name      : Calculate_Mad2_prog()
 @//
@@ -346,7 +353,7 @@ end_func_16x16_ea8:
 @// Platform           : CortexA8/NEON            .
 @//
 @//-----------------------------------------------------------------------------
-@*/
+@*
 
     .global ime_calculate_sad2_prog_a9q
 
@@ -358,72 +365,72 @@ ime_calculate_sad2_prog_a9q:
     @ r3    = RefBufferWidth <UWORD32>
     @ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>
 
-    stmfd     sp!, {r4-r5, lr}
-
-    ldr       r4, [sp, #8]              @ load src stride to r4
-    mov       r5, #14
+    stmfd         sp!, {r4-r5, lr}
 
+    ldr           r4, [sp, #12]         @ load src stride to r4: 1st stack arg, above the 12 saved bytes (#8 is saved lr)
+    mov           r5, #14               @ loop counter: decremented by 2 per pass (7 passes)
+    vpush         {d8-d15}              @ preserve d8-d15 for the caller; sp drops by 64 bytes until vpop
     @Row 1
-    vld1.8    {d0, d1}, [r2], r4        @ load src Row 1
-    vld1.8    {d2, d3}, [r0], r3        @ load ref1 Row 1
-    vld1.8    {d4, d5}, [r1], r3        @ load ref2 Row 1
+    vld1.8        {d0, d1}, [r2], r4    @ load src Row 1
+    vld1.8        {d2, d3}, [r0], r3    @ load ref1 Row 1
+    vld1.8        {d4, d5}, [r1], r3    @ load ref2 Row 1
 
     @Row 2
-    vld1.8    {d6, d7}, [r2], r4        @ load src Row 2
-    vabdl.u8  q6, d2, d0
-    vabdl.u8  q7, d3, d1
-    vld1.8    {d8, d9}, [r0], r3        @ load ref1 Row 2
-    vabdl.u8  q8, d4, d0
-    vabdl.u8  q9, d5, d1
-    vld1.8    {d10, d11}, [r1], r3      @ load ref2 Row 2
+    vld1.8        {d6, d7}, [r2], r4    @ load src Row 2
+    vabdl.u8      q6, d2, d0
+    vabdl.u8      q7, d3, d1
+    vld1.8        {d8, d9}, [r0], r3    @ load ref1 Row 2
+    vabdl.u8      q8, d4, d0
+    vabdl.u8      q9, d5, d1
+    vld1.8        {d10, d11}, [r1], r3  @ load ref2 Row 2
 
 loop_sad2_prog:
 
-    subs      r5, #2
+    subs          r5, #2
     @Row 1
-    vld1.8    {d0, d1}, [r2], r4        @ load src Row 1
-    vabal.u8  q6, d8, d6
-    vabal.u8  q7, d9, d7
-    vld1.8    {d2, d3}, [r0], r3        @ load ref1 Row 1
-    vabal.u8  q8, d10, d6
-    vabal.u8  q9, d11, d7
-    vld1.8    {d4, d5}, [r1], r3        @ load ref2 Row 1
+    vld1.8        {d0, d1}, [r2], r4    @ load src Row 1
+    vabal.u8      q6, d8, d6
+    vabal.u8      q7, d9, d7
+    vld1.8        {d2, d3}, [r0], r3    @ load ref1 Row 1
+    vabal.u8      q8, d10, d6
+    vabal.u8      q9, d11, d7
+    vld1.8        {d4, d5}, [r1], r3    @ load ref2 Row 1
 
     @Row 2
-    vld1.8    {d6, d7}, [r2], r4        @ load src Row 2
-    vabal.u8  q6, d2, d0
-    vabal.u8  q7, d3, d1
-    vld1.8    {d8, d9}, [r0], r3        @ load ref1 Row 2
-    vabal.u8  q8, d4, d0
-    vabal.u8  q9, d5, d1
-    vld1.8    {d10, d11}, [r1], r3      @ load ref2 Row 2
+    vld1.8        {d6, d7}, [r2], r4    @ load src Row 2
+    vabal.u8      q6, d2, d0
+    vabal.u8      q7, d3, d1
+    vld1.8        {d8, d9}, [r0], r3    @ load ref1 Row 2
+    vabal.u8      q8, d4, d0
+    vabal.u8      q9, d5, d1
+    vld1.8        {d10, d11}, [r1], r3  @ load ref2 Row 2
 
-    bne       loop_sad2_prog
+    bne           loop_sad2_prog
 
-    vabal.u8  q6, d8, d6
-    vabal.u8  q7, d9, d7
-    vabal.u8  q8, d10, d6
-    vabal.u8  q9, d11, d7
+    vabal.u8      q6, d8, d6
+    vabal.u8      q7, d9, d7
+    vabal.u8      q8, d10, d6
+    vabal.u8      q9, d11, d7
 
     @ Compute SAD
 
-    vadd.u16  q6, q6, q7                @ Q6  : sad_ref1
-    vadd.u16  q8, q8, q9                @ Q8  : sad_ref2
+    vadd.u16      q6, q6, q7            @ Q6  : sad_ref1
+    vadd.u16      q8, q8, q9            @ Q8  : sad_ref2
 
-    vadd.u16  d12, d12, d13
-    ldr       r5, [sp, #16]             @ loading pi4_sad to r5
-    vadd.u16  d16, d16, d17
+    vadd.u16      d12, d12, d13
+    ldr           r5, [sp, #80]         @ loading pi4_sad to r5: 64 (vpush) + 12 (r4, r5, lr) + 4 (2nd stack arg)
+    vadd.u16      d16, d16, d17
 
-    vpadd.u16 d12, d12, d16
-    vpaddl.u16 d12, d12
+    vpadd.u16     d12, d12, d16
+    vpaddl.u16    d12, d12
 
-    vst1.64   {d12}, [r5]!
+    vst1.64       {d12}, [r5]!          @ store both SADs; must precede vpop, which overwrites d12
+    vpop          {d8-d15}              @ restore d8-d15 for the caller
+    ldmfd         sp!, {r4-r5, pc}
 
-    ldmfd     sp!, {r4-r5, pc}
 
 
-
-@/*
+@*
 @//---------------------------------------------------------------------------
 @// Function Name      : Calculate_Mad3_prog()
 @//
@@ -433,7 +440,7 @@ loop_sad2_prog:
 @// Platform           : CortexA8/NEON            .
 @//
 @//-----------------------------------------------------------------------------
-@*/
+@*
 
     .global ime_calculate_sad3_prog_a9q
 
@@ -446,90 +453,90 @@ ime_calculate_sad3_prog_a9q:
     @ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>
 
 
-    stmfd     sp!, {r4-r6, lr}
-
-    ldrd      r4, r5, [sp, #16]         @ load ref stride to r4, src stride to r5
-    mov       r6, #14
-
-    @ Row 1
-    vld1.8    {d0, d1}, [r3], r5        @ load src Row 1
-    vld1.8    {d2, d3}, [r0], r4        @ load ref1 Row 1
-    vld1.8    {d4, d5}, [r1], r4        @ load ref2 Row 1
-    vabdl.u8  q8, d2, d0
-    vabdl.u8  q9, d3, d1
-    vld1.8    {d6, d7}, [r2], r4        @ load ref3 Row 1
-    vabdl.u8  q10, d4, d0
-    vabdl.u8  q11, d5, d1
-
-    @ Row 2
-    vld1.8    {d8, d9}, [r3], r5        @ load src Row 1
-    vabdl.u8  q12, d6, d0
-    vabdl.u8  q13, d7, d1
-    vld1.8    {d10, d11}, [r0], r4      @ load ref1 Row 1
-    vld1.8    {d12, d13}, [r1], r4      @ load ref2 Row 1
-    vabal.u8  q8, d10, d8
-    vabal.u8  q9, d11, d9
-    vld1.8    {d14, d15}, [r2], r4      @ load ref3 Row 1
-    vabal.u8  q10, d12, d8
-    vabal.u8  q11, d13, d9
+    stmfd         sp!, {r4-r6, lr}
+
+    ldrd          r4, r5, [sp, #16]     @ load ref stride to r4, src stride to r5 (read before vpush moves sp)
+    mov           r6, #14               @ loop counter: decremented by 2 per pass (7 passes)
+    vpush         {d8-d15}              @ preserve d8-d15 for the caller; sp drops by 64 bytes until vpop
+    @Row 1
+    vld1.8        {d0, d1}, [r3], r5    @ load src Row 1
+    vld1.8        {d2, d3}, [r0], r4    @ load ref1 Row 1
+    vld1.8        {d4, d5}, [r1], r4    @ load ref2 Row 1
+    vabdl.u8      q8, d2, d0
+    vabdl.u8      q9, d3, d1
+    vld1.8        {d6, d7}, [r2], r4    @ load ref3 Row 1
+    vabdl.u8      q10, d4, d0
+    vabdl.u8      q11, d5, d1
+
+    @Row 2
+    vld1.8        {d8, d9}, [r3], r5    @ load src Row 2
+    vabdl.u8      q12, d6, d0
+    vabdl.u8      q13, d7, d1
+    vld1.8        {d10, d11}, [r0], r4  @ load ref1 Row 2
+    vld1.8        {d12, d13}, [r1], r4  @ load ref2 Row 2
+    vabal.u8      q8, d10, d8
+    vabal.u8      q9, d11, d9
+    vld1.8        {d14, d15}, [r2], r4  @ load ref3 Row 2
+    vabal.u8      q10, d12, d8
+    vabal.u8      q11, d13, d9
 
 loop_sad3_prog:
 
     @Row 1
-    vld1.8    {d0, d1}, [r3], r5        @ load src Row 1
-    vabal.u8  q12, d14, d8
-    vabal.u8  q13, d15, d9
-    vld1.8    {d2, d3}, [r0], r4        @ load ref1 Row 1
-    vld1.8    {d4, d5}, [r1], r4        @ load ref2 Row 1
-    vabal.u8  q8, d2, d0
-    vabal.u8  q9, d3, d1
-    vld1.8    {d6, d7}, [r2], r4        @ load ref3 Row 1
-    vabal.u8  q10, d4, d0
-    vabal.u8  q11, d5, d1
+    vld1.8        {d0, d1}, [r3], r5    @ load src Row 1
+    vabal.u8      q12, d14, d8
+    vabal.u8      q13, d15, d9
+    vld1.8        {d2, d3}, [r0], r4    @ load ref1 Row 1
+    vld1.8        {d4, d5}, [r1], r4    @ load ref2 Row 1
+    vabal.u8      q8, d2, d0
+    vabal.u8      q9, d3, d1
+    vld1.8        {d6, d7}, [r2], r4    @ load ref3 Row 1
+    vabal.u8      q10, d4, d0
+    vabal.u8      q11, d5, d1
 
     @Row 2
-    vld1.8    {d8, d9}, [r3], r5        @ load src Row 1
-    vabal.u8  q12, d6, d0
-    vabal.u8  q13, d7, d1
-    vld1.8    {d10, d11}, [r0], r4      @ load ref1 Row 1
-    subs      r6, #2
-    vld1.8    {d12, d13}, [r1], r4      @ load ref2 Row 1
-    vabal.u8  q8, d10, d8
-    vabal.u8  q9, d11, d9
-    vld1.8    {d14, d15}, [r2], r4      @ load ref3 Row 1
-    vabal.u8  q10, d12, d8
-    vabal.u8  q11, d13, d9
-
-    bne       loop_sad3_prog
-
-    vabal.u8  q12, d14, d8
-    vabal.u8  q13, d15, d9
+    vld1.8        {d8, d9}, [r3], r5    @ load src Row 2
+    vabal.u8      q12, d6, d0
+    vabal.u8      q13, d7, d1
+    vld1.8        {d10, d11}, [r0], r4  @ load ref1 Row 2
+    subs          r6, #2
+    vld1.8        {d12, d13}, [r1], r4  @ load ref2 Row 2
+    vabal.u8      q8, d10, d8
+    vabal.u8      q9, d11, d9
+    vld1.8        {d14, d15}, [r2], r4  @ load ref3 Row 2
+    vabal.u8      q10, d12, d8
+    vabal.u8      q11, d13, d9
+
+    bne           loop_sad3_prog
+
+    vabal.u8      q12, d14, d8
+    vabal.u8      q13, d15, d9
 
     @ Compute SAD
 
-    vadd.u16  q8, q8, q9                @ Q8  : sad_ref1
-    vadd.u16  q10, q10, q11             @ Q10 : sad_ref2
-    vadd.u16  q12, q12, q13             @ Q12 : sad_ref3
+    vadd.u16      q8, q8, q9            @ Q8  : sad_ref1
+    vadd.u16      q10, q10, q11         @ Q10 : sad_ref2
+    vadd.u16      q12, q12, q13         @ Q12 : sad_ref3
 
-    vadd.u16  d16, d16, d17
-    vadd.u16  d20, d20, d21
-    vadd.u16  d24, d24, d25
+    vadd.u16      d16, d16, d17
+    vadd.u16      d20, d20, d21
+    vadd.u16      d24, d24, d25
 
-    vpadd.u16 d16, d16, d20
-    vpadd.u16 d24, d24, d24
+    vpadd.u16     d16, d16, d20
+    vpadd.u16     d24, d24, d24
 
-    ldr       r6, [sp, #24]             @ loading pi4_sad to r6
-    vpaddl.u16 d16, d16
-    vpaddl.u16 d24, d24
+    ldr           r6, [sp, #88]         @ loading pi4_sad to r6: 64 (vpush) + 16 (r4-r6, lr) + 8 (3rd stack arg)
+    vpaddl.u16    d16, d16
+    vpaddl.u16    d24, d24
 
-    vst1.64   {d16}, [r6]!
-    vst1.32   {d24[0]}, [r6]
+    vst1.64       {d16}, [r6]!          @ store sad_ref1 and sad_ref2, advance r6 by 8
+    vst1.32       {d24[0]}, [r6]        @ store sad_ref3
+    vpop          {d8-d15}              @ restore d8-d15 for the caller
+    ldmfd         sp!, {r4-r6, pc}
 
-    ldmfd     sp!, {r4-r6, pc}
 
 
-
-@/**
+@**
 @******************************************************************************
 @*
 @* @brief computes distortion (SAD) for sub-pel motion estimation
@@ -551,7 +558,7 @@ loop_sad3_prog:
 @* @remarks
 @*
 @******************************************************************************
-@*/
+@*
 
 .text
 .p2align 2
@@ -560,115 +567,116 @@ loop_sad3_prog:
 
 ime_sub_pel_compute_sad_16x16_a9q:
 
-    stmfd     sp!, {r4-r11, lr}         @store register values to stack
+    stmfd         sp!, {r4-r11, lr}     @ save r4-r11, lr (9 words = 36 bytes)
 
-    ldr       r9, [sp, #36]
-    ldr       r10, [sp, #40]
+    ldr           r9, [sp, #36]         @ r9  = 1st stack arg, used as row step for src (r0)
+    ldr           r10, [sp, #40]        @ r10 = 2nd stack arg, used as row step for every ref pointer
+    vpush         {d8-d15}              @ d8-d15 (q4-q7) are overwritten below; stack args were read first
+    sub           r4, r1, #1            @ x left
+    sub           r5, r2, r10           @ y top
+
-    sub       r4, r1, #1                @ x left
-    sub       r5, r2, r10               @ y top
+    sub           r6, r3, #1            @ xy left
+    sub           r7, r3, r10           @ xy top
 
-    sub       r6, r3, #1                @ xy left
-    sub       r7, r3, r10               @ xy top
-
-    sub       r8, r7, #1                @ xy top-left
-    mov       r11, #15
+    sub           r8, r7, #1            @ xy top-left
+    mov           r11, #15              @ 15 loop passes; the first src row is loaded before the loop
 
     @for bringing buffer2 into cache..., dummy load instructions
     @ LDR         r12,[r1]
     @ LDR         r12,[sp,#12]
 
-    vld1.8    {d0, d1}, [r0], r9        @ src
-    vld1.8    {d2, d3}, [r5], r10       @ y top LOAD
-    vld1.8    {d4, d5}, [r7], r10       @ xy top LOAD
-    vld1.8    {d6, d7}, [r8], r10       @ xy top-left LOAD
-
-    vabdl.u8  q6, d2, d0                @ y top ABS1
-    vabdl.u8  q7, d4, d0                @ xy top ABS1
-    vld1.8    {d8, d9}, [r1], r10       @ x LOAD
-    vabdl.u8  q8, d6, d0                @ xy top-left ABS1
-    vabdl.u8  q9, d8, d0                @ x ABS1
-    vld1.8    {d10, d11}, [r4], r10     @ x left LOAD
-
-    vabal.u8  q6, d3, d1                @ y top ABS2
-    vabal.u8  q7, d5, d1                @ xy top ABS2
-    vld1.8    {d2, d3}, [r2], r10       @ y LOAD
-    vabal.u8  q8, d7, d1                @ xy top-left ABS2
-    vabal.u8  q9, d9, d1                @ x ABS2
-    vld1.8    {d4, d5}, [r3], r10       @ xy LOAD
-
-    vabdl.u8  q10, d10, d0              @ x left ABS1
-    vabdl.u8  q11, d2, d0               @ y ABS1
-    vld1.8    {d6, d7}, [r6], r10       @ xy left LOAD
-    vabdl.u8  q12, d4, d0               @ xy ABS1
-    vabdl.u8  q13, d6, d0               @ xy left ABS1
+    vld1.8        {d0, d1}, [r0], r9    @ src
+    vld1.8        {d2, d3}, [r5], r10   @ y top LOAD
+    vld1.8        {d4, d5}, [r7], r10   @ xy top LOAD
+    vld1.8        {d6, d7}, [r8], r10   @ xy top-left LOAD
+
+    vabdl.u8      q6, d2, d0            @ y top ABS1
+    vabdl.u8      q7, d4, d0            @ xy top ABS1
+    vld1.8        {d8, d9}, [r1], r10   @ x LOAD
+    vabdl.u8      q8, d6, d0            @ xy top-left ABS1
+    vabdl.u8      q9, d8, d0            @ x ABS1
+    vld1.8        {d10, d11}, [r4], r10 @ x left LOAD
+
+    vabal.u8      q6, d3, d1            @ y top ABS2
+    vabal.u8      q7, d5, d1            @ xy top ABS2
+    vld1.8        {d2, d3}, [r2], r10   @ y LOAD
+    vabal.u8      q8, d7, d1            @ xy top-left ABS2
+    vabal.u8      q9, d9, d1            @ x ABS2
+    vld1.8        {d4, d5}, [r3], r10   @ xy LOAD
+
+    vabdl.u8      q10, d10, d0          @ x left ABS1
+    vabdl.u8      q11, d2, d0           @ y ABS1
+    vld1.8        {d6, d7}, [r6], r10   @ xy left LOAD
+    vabdl.u8      q12, d4, d0           @ xy ABS1
+    vabdl.u8      q13, d6, d0           @ xy left ABS1
 
 loop_sub_pel_16x16:
 
-    vabal.u8  q10, d11, d1              @ x left ABS2
-    vabal.u8  q11, d3, d1               @ y ABS2
-    subs      r11, #1
-    vabal.u8  q12, d5, d1               @ xy ABS2
-    vabal.u8  q13, d7, d1               @ xy left ABS2
-
-    vld1.8    {d0, d1}, [r0], r9        @ src
-    vabal.u8  q6, d2, d0                @ y top ABS1
-    vabal.u8  q7, d4, d0                @ xy top ABS1
-    vld1.8    {d8, d9}, [r1], r10       @ x LOAD
-    vabal.u8  q8, d6, d0                @ xy top-left ABS1
-    vabal.u8  q9, d8, d0                @ x ABS1
-    vld1.8    {d10, d11}, [r4], r10     @ x left LOAD
-
-    vabal.u8  q6, d3, d1                @ y top ABS2
-    vabal.u8  q7, d5, d1                @ xy top ABS2
-    vld1.8    {d2, d3}, [r2], r10       @ y LOAD
-    vabal.u8  q8, d7, d1                @ xy top-left ABS2
-    vabal.u8  q9, d9, d1                @ x ABS2
-    vld1.8    {d4, d5}, [r3], r10       @ xy LOAD
-
-    vabal.u8  q10, d10, d0              @ x left ABS1
-    vabal.u8  q11, d2, d0               @ y ABS1
-    vld1.8    {d6, d7}, [r6], r10       @ xy left LOAD
-    vabal.u8  q12, d4, d0               @ xy ABS1
-    vabal.u8  q13, d6, d0               @ xy left ABS1
-
-    bne       loop_sub_pel_16x16
-
-    vabal.u8  q10, d11, d1              @ x left ABS2
-    vabal.u8  q11, d3, d1               @ y ABS2
-    vabal.u8  q12, d5, d1               @ xy ABS2
-    vabal.u8  q13, d7, d1               @ xy left ABS2
-
-    vadd.i16  d0, d18, d19              @ x
-    vadd.i16  d3, d12, d13              @ y top
-    vadd.i16  d6, d14, d15              @ xy top
-    vadd.i16  d5, d26, d27              @ xy left
-    vadd.i16  d1, d20, d21              @ x left
-    vadd.i16  d2, d22, d23              @ y
-    vadd.i16  d4, d24, d25              @ xy
-    vadd.i16  d7, d16, d17              @ xy top left
-
-    vpadd.i16 d0, d0, d1
-    vpadd.i16 d2, d2, d3
-    vpadd.i16 d4, d4, d5
-    vpadd.i16 d6, d6, d7
-
-    vpaddl.u16 d0, d0
-    vpaddl.u16 d2, d2
-    ldr       r11, [sp, #44]
-    vpaddl.u16 d4, d4
-    vpaddl.u16 d6, d6
-
-    vst1.32   {d0}, [r11]!
-    vst1.32   {d2}, [r11]!
-    vst1.32   {d4}, [r11]!
-    vst1.32   {d6}, [r11]!
-
-    ldmfd     sp!, {r4-r11, pc}         @Restoring registers from stack
-
-
-
-@/**
+    vabal.u8      q10, d11, d1          @ x left ABS2
+    vabal.u8      q11, d3, d1           @ y ABS2
+    subs          r11, #1
+    vabal.u8      q12, d5, d1           @ xy ABS2
+    vabal.u8      q13, d7, d1           @ xy left ABS2
+
+    vld1.8        {d0, d1}, [r0], r9    @ src
+    vabal.u8      q6, d2, d0            @ y top ABS1
+    vabal.u8      q7, d4, d0            @ xy top ABS1
+    vld1.8        {d8, d9}, [r1], r10   @ x LOAD
+    vabal.u8      q8, d6, d0            @ xy top-left ABS1
+    vabal.u8      q9, d8, d0            @ x ABS1
+    vld1.8        {d10, d11}, [r4], r10 @ x left LOAD
+
+    vabal.u8      q6, d3, d1            @ y top ABS2
+    vabal.u8      q7, d5, d1            @ xy top ABS2
+    vld1.8        {d2, d3}, [r2], r10   @ y LOAD
+    vabal.u8      q8, d7, d1            @ xy top-left ABS2
+    vabal.u8      q9, d9, d1            @ x ABS2
+    vld1.8        {d4, d5}, [r3], r10   @ xy LOAD
+
+    vabal.u8      q10, d10, d0          @ x left ABS1
+    vabal.u8      q11, d2, d0           @ y ABS1
+    vld1.8        {d6, d7}, [r6], r10   @ xy left LOAD
+    vabal.u8      q12, d4, d0           @ xy ABS1
+    vabal.u8      q13, d6, d0           @ xy left ABS1
+
+    bne           loop_sub_pel_16x16
+
+    vabal.u8      q10, d11, d1          @ x left ABS2
+    vabal.u8      q11, d3, d1           @ y ABS2
+    vabal.u8      q12, d5, d1           @ xy ABS2
+    vabal.u8      q13, d7, d1           @ xy left ABS2
+
+    vadd.i16      d0, d18, d19          @ x
+    vadd.i16      d3, d12, d13          @ y top
+    vadd.i16      d6, d14, d15          @ xy top
+    vadd.i16      d5, d26, d27          @ xy left
+    vadd.i16      d1, d20, d21          @ x left
+    vadd.i16      d2, d22, d23          @ y
+    vadd.i16      d4, d24, d25          @ xy
+    vadd.i16      d7, d16, d17          @ xy top left
+
+    vpadd.i16     d0, d0, d1
+    vpadd.i16     d2, d2, d3
+    vpadd.i16     d4, d4, d5
+    vpadd.i16     d6, d6, d7
+
+    vpaddl.u16    d0, d0
+    vpaddl.u16    d2, d2
+    vpop          {d8-d15}              @ q6/q7 sums were consumed above; sp is back at the saved core regs
+    ldr           r11, [sp, #44]        @ r11 = 3rd stack arg (36 + 8): output pointer for the eight sums
+    vpaddl.u16    d4, d4
+    vpaddl.u16    d6, d6
+
+    vst1.32       {d0}, [r11]!          @ store sums: x, x left
+    vst1.32       {d2}, [r11]!          @ store sums: y, y top
+    vst1.32       {d4}, [r11]!          @ store sums: xy, xy left
+    vst1.32       {d6}, [r11]!          @ store sums: xy top, xy top left
+
+    ldmfd         sp!, {r4-r11, pc}     @Restoring registers from stack
+
+
+
+@**
 @******************************************************************************
 @*
 @* @brief computes distortion (SAD) between 2 16x16 blocks
@@ -699,7 +707,7 @@ loop_sub_pel_16x16:
 @* @remarks
 @*
 @******************************************************************************
-@*/
+@*
 
 .text
 .p2align 2
@@ -710,51 +718,52 @@ ime_compute_sad_16x16_a9q:
 
 
     @STMFD       sp!,{r12,lr}
-    stmfd     sp!, {r12, r14}           @store register values to stack
+    stmfd         sp!, {r12, r14}       @ save r12, lr (2 words = 8 bytes)
 
     @for bringing buffer2 into cache..., dummy load instructions
     @ LDR         r12,[r1]
     @ LDR         r12,[sp,#12]
 
-    vld1.8    {d4, d5}, [r0], r2
-    vld1.8    {d6, d7}, [r1], r3
-
-    mov       r12, #14
-    vld1.8    {d8, d9}, [r0], r2
-    vabdl.u8  q0, d4, d6
-    vld1.8    {d10, d11}, [r1], r3
-    vabdl.u8  q1, d5, d7
+    vld1.8        {d4, d5}, [r0], r2
+    vld1.8        {d6, d7}, [r1], r3
+    vpush         {d8-d15}              @ d8-d11 are loaded as row buffers below; save d8-d15 first
+    mov           r12, #14              @ 14 more rows are loaded in the loop, 2 per pass
+    vld1.8        {d8, d9}, [r0], r2
+    vabdl.u8      q0, d4, d6
+    vld1.8        {d10, d11}, [r1], r3
+    vabdl.u8      q1, d5, d7
 
 loop_sad_16x16:
 
-    vld1.8    {d4, d5}, [r0], r2
-    vabal.u8  q0, d8, d10
-    vld1.8    {d6, d7}, [r1], r3
-    vabal.u8  q1, d9, d11
+    vld1.8        {d4, d5}, [r0], r2
+    vabal.u8      q0, d8, d10
+    vld1.8        {d6, d7}, [r1], r3
+    vabal.u8      q1, d9, d11
 
-    vld1.8    {d8, d9}, [r0], r2
-    vabal.u8  q0, d4, d6
-    subs      r12, #2
-    vld1.8    {d10, d11}, [r1], r3
-    vabal.u8  q1, d5, d7
+    vld1.8        {d8, d9}, [r0], r2
+    vabal.u8      q0, d4, d6
+    subs          r12, #2
+    vld1.8        {d10, d11}, [r1], r3
+    vabal.u8      q1, d5, d7
 
-    bne       loop_sad_16x16
+    bne           loop_sad_16x16
 
-    vabal.u8  q0, d8, d10
-    vabal.u8  q1, d9, d11
+    vabal.u8      q0, d8, d10
+    vabal.u8      q1, d9, d11
 
-    vadd.i16  q0, q0, q1
-    vadd.i16  d0, d1, d0
-    ldr       r12, [sp, #12]
+    vadd.i16      q0, q0, q1
+    vadd.i16      d0, d1, d0
+    vpop          {d8-d15}              @ d8-d11 are no longer needed; sp is back at saved r12/lr
+    ldr           r12, [sp, #12]        @ r12 = 2nd stack arg (8 + 4): destination of the result
 
-    vpaddl.u16 d0, d0
-    vpaddl.u16 d0, d0
-    vst1.32   {d0[0]}, [r12]
+    vpaddl.u16    d0, d0
+    vpaddl.u32    d0, d0
+    vst1.32       {d0[0]}, [r12]        @ store the low 32 bits of the total
 
-    ldmfd     sp!, {r12, pc}            @Restoring registers from stack
+    ldmfd         sp!, {r12, pc}        @Restoring registers from stack
 
 
-@/*
+@*
 @//---------------------------------------------------------------------------
 @// Function Name      : Calculate_Mad4_prog()
 @//
@@ -764,7 +773,7 @@ loop_sad_16x16:
 @// Platform           : CortexA8/NEON            .
 @//
 @//-----------------------------------------------------------------------------
-@*/
+@*
 
     .global ime_calculate_sad4_prog_a9q
 
@@ -775,20 +784,20 @@ ime_calculate_sad4_prog_a9q:
     @ r3    = CurBufferWidth <UWORD32>
     @ stack = psad           <UWORD32 *> {at 0x34}
 
-    stmfd     sp!, {r4-r7, lr}
+    stmfd         sp!, {r4-r7, lr}
 
     @UWORD8 *left_ptr       = temp_frame - 1;
     @UWORD8 *right_ptr      = temp_frame + 1;
     @UWORD8 *top_ptr        = temp_frame - RefBufferWidth;
     @UWORD8 *bot_ptr        = temp_frame + RefBufferWidth;
 
-    mov       r7, #14
-    sub       r4, r0, #0x01             @r4 = left_ptr
-    add       r5, r0, #0x1              @r5 = right_ptr
-    sub       r6, r0, r2                @r6 = top_ptr
-    add       r0, r0, r2                @r0 = bot_ptr
+    mov           r7, #14
+    sub           r4, r0, #0x01         @r4 = left_ptr
+    add           r5, r0, #0x1          @r5 = right_ptr
+    sub           r6, r0, r2            @r6 = top_ptr
+    add           r0, r0, r2            @r0 = bot_ptr
                                         @r1 = buffer_ptr
-
+    vpush         {d8-d15}
     @D0:D1  : buffer
     @D2:D3  : top
     @D4:D5  : left
@@ -796,94 +805,93 @@ ime_calculate_sad4_prog_a9q:
     @D8:D9  : bottom
 
     @Row 1
-    vld1.8    {d0, d1}, [r1], r3        @ load src Row 1
-    vld1.8    {d2, d3}, [r6], r2        @ load top Row 1
-    vld1.8    {d4, d5}, [r4], r2        @ load left Row 1
+    vld1.8        {d0, d1}, [r1], r3    @ load src Row 1
+    vld1.8        {d2, d3}, [r6], r2    @ load top Row 1
+    vld1.8        {d4, d5}, [r4], r2    @ load left Row 1
 
-    vabdl.u8  q5, d2, d0
-    vld1.8    {d6, d7}, [r5], r2        @ load right Row 1
-    vabdl.u8  q6, d3, d1
+    vabdl.u8      q5, d2, d0
+    vld1.8        {d6, d7}, [r5], r2    @ load right Row 1
+    vabdl.u8      q6, d3, d1
 
-    vabdl.u8  q7, d0, d4
-    vld1.8    {d8, d9}, [r0], r2        @ load bottom Row 1
-    vabdl.u8  q8, d1, d5
+    vabdl.u8      q7, d0, d4
+    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 1
+    vabdl.u8      q8, d1, d5
 
     @Row 2
-    vabdl.u8  q9, d0, d6
-    vld1.8    {d26, d27}, [r1], r3      @ load src Row 2
-    vabdl.u8  q10, d1, d7
+    vabdl.u8      q9, d0, d6
+    vld1.8        {d26, d27}, [r1], r3  @ load src Row 2
+    vabdl.u8      q10, d1, d7
 
-    vabdl.u8  q11, d0, d8
-    vld1.8    {d2, d3}, [r6], r2        @ load top Row 2
-    vabdl.u8  q12, d1, d9
+    vabdl.u8      q11, d0, d8
+    vld1.8        {d2, d3}, [r6], r2    @ load top Row 2
+    vabdl.u8      q12, d1, d9
 
 loop_sad4_prog:
 
-    vabal.u8  q5, d26, d2
-    vld1.8    {d4, d5}, [r4], r2        @ load left Row 2
-    vabal.u8  q6, d27, d3
+    vabal.u8      q5, d26, d2
+    vld1.8        {d4, d5}, [r4], r2    @ load left Row 2
+    vabal.u8      q6, d27, d3
 
-    vabal.u8  q7, d26, d4
-    vld1.8    {d6, d7}, [r5], r2        @ load right Row 2
-    vabal.u8  q8, d27, d5
+    vabal.u8      q7, d26, d4
+    vld1.8        {d6, d7}, [r5], r2    @ load right Row 2
+    vabal.u8      q8, d27, d5
 
-    vabal.u8  q9, d26, d6
-    vld1.8    {d8, d9}, [r0], r2        @ load bottom Row 2
-    vabal.u8  q10, d27, d7
+    vabal.u8      q9, d26, d6
+    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 2
+    vabal.u8      q10, d27, d7
 
     @Row 1
-    vabal.u8  q11, d26, d8
-    vld1.8    {d0, d1}, [r1], r3        @ load src Row 1
-    vabal.u8  q12, d27, d9
-
-    vld1.8    {d2, d3}, [r6], r2        @ load top Row 1
-    subs      r7, #2
-    vld1.8    {d4, d5}, [r4], r2        @ load left Row 1
+    vabal.u8      q11, d26, d8
+    vld1.8        {d0, d1}, [r1], r3    @ load src Row 1
+    vabal.u8      q12, d27, d9
 
-    vabal.u8  q5, d0, d2
+    vld1.8        {d2, d3}, [r6], r2    @ load top Row 1
+    subs          r7, #2
+    vld1.8        {d4, d5}, [r4], r2    @ load left Row 1
 
-    vld1.8    {d6, d7}, [r5], r2        @ load right Row 1
-    vabal.u8  q6, d1, d3
+    vabal.u8      q5, d0, d2
+    vld1.8        {d6, d7}, [r5], r2    @ load right Row 1
+    vabal.u8      q6, d1, d3
 
-    vabal.u8  q7, d0, d4
-    vld1.8    {d8, d9}, [r0], r2        @ load bottom Row 1
-    vabal.u8  q8, d1, d5
+    vabal.u8      q7, d0, d4
+    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 1
+    vabal.u8      q8, d1, d5
 
     @Row 2
-    vabal.u8  q9, d0, d6
-    vld1.8    {d26, d27}, [r1], r3      @ load src Row 2
-    vabal.u8  q10, d1, d7
+    vabal.u8      q9, d0, d6
+    vld1.8        {d26, d27}, [r1], r3  @ load src Row 2
+    vabal.u8      q10, d1, d7
 
-    vabal.u8  q11, d0, d8
-    vld1.8    {d2, d3}, [r6], r2        @ load top Row 2
-    vabal.u8  q12, d1, d9
+    vabal.u8      q11, d0, d8
+    vld1.8        {d2, d3}, [r6], r2    @ load top Row 2
+    vabal.u8      q12, d1, d9
 
-    bne       loop_sad4_prog
+    bne           loop_sad4_prog
 
-    vabal.u8  q5, d26, d2
-    vld1.8    {d4, d5}, [r4], r2        @ load left Row 2
-    vabal.u8  q6, d27, d3
+    vabal.u8      q5, d26, d2
+    vld1.8        {d4, d5}, [r4], r2    @ load left Row 2
+    vabal.u8      q6, d27, d3
 
-    vabal.u8  q7, d26, d4
-    vld1.8    {d6, d7}, [r5], r2        @ load right Row 2
-    vabal.u8  q8, d27, d5
+    vabal.u8      q7, d26, d4
+    vld1.8        {d6, d7}, [r5], r2    @ load right Row 2
+    vabal.u8      q8, d27, d5
 
-    vabal.u8  q9, d26, d6
-    vld1.8    {d8, d9}, [r0], r2        @ load bottom Row 2
-    vabal.u8  q10, d27, d7
+    vabal.u8      q9, d26, d6
+    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 2
+    vabal.u8      q10, d27, d7
 
-    vabal.u8  q11, d26, d8
-    vabal.u8  q12, d27, d9
+    vabal.u8      q11, d26, d8
+    vabal.u8      q12, d27, d9
 
     @;Q5:Q6   : sad_top
     @;Q7:Q8   : sad_left
     @;Q9:Q10  : sad_right
     @;Q11:Q12 : sad_bot
 
-    vadd.u16  q5, q5, q6
-    vadd.u16  q7, q7, q8
-    vadd.u16  q9, q9, q10
-    vadd.u16  q11, q11, q12
+    vadd.u16      q5, q5, q6
+    vadd.u16      q7, q7, q8
+    vadd.u16      q9, q9, q10
+    vadd.u16      q11, q11, q12
 
     @; Free :-
     @; Q6,Q8,Q10,Q12
@@ -893,10 +901,10 @@ loop_sad4_prog:
     @;Q9  -> D18:D19
     @;Q11 -> D22:D23
 
-    vadd.u16  d10, d10, d11
-    vadd.u16  d14, d14, d15
-    vadd.u16  d18, d18, d19
-    vadd.u16  d22, d22, d23
+    vadd.u16      d10, d10, d11
+    vadd.u16      d14, d14, d15
+    vadd.u16      d18, d18, d19
+    vadd.u16      d22, d22, d23
 
     @;D10  : sad_top
     @;D14  : sad_left
@@ -904,35 +912,35 @@ loop_sad4_prog:
     @;D22  : sad_bot
 
 
-    vpaddl.u16 d11, d10
-    vpaddl.u16 d15, d14
-    vpaddl.u16 d19, d18
-    vpaddl.u16 d23, d22
+    vpaddl.u16    d11, d10
+    vpaddl.u16    d15, d14
+    vpaddl.u16    d19, d18
+    vpaddl.u16    d23, d22
 
     @;D11  : sad_top
     @;D15  : sad_left
     @;D19  : sad_right
     @;D23  : sad_bot
 
-    vpaddl.u32 d10, d11
-    vpaddl.u32 d22, d23
-    vpaddl.u32 d14, d15
-    vpaddl.u32 d18, d19
+    vpaddl.u32    d10, d11
+    vpaddl.u32    d22, d23
+    vpaddl.u32    d14, d15
+    vpaddl.u32    d18, d19
 
     @;D10  : sad_top
     @;D14  : sad_left
     @;D18  : sad_right
     @;D22  : sad_bot
 
-    ldr       r4, [sp, #20]             @;Can be rearranged
-
-    vsli.64   d10, d22, #32
-    vsli.64   d14, d18, #32
+    ldr           r4, [sp, #84]         @;Can be rearranged
 
-    vst1.64   {d14}, [r4]!
-    vst1.64   {d10}, [r4]!
+    vsli.64       d10, d22, #32
+    vsli.64       d14, d18, #32
 
-    ldmfd     sp!, {r4-r7, pc}
+    vst1.64       {d14}, [r4]!
+    vst1.64       {d10}, [r4]!
+    vpop          {d8-d15}
+    ldmfd         sp!, {r4-r7, pc}
 
 
 
@@ -974,37 +982,37 @@ ime_compute_satqd_16x16_lumainter_a9q:
     @R5 :Distortion,ie SAD
     @R6 :is nonzero
 
-    push      {r4-r12, lr}              @push all the variables first
+    push          {r4-r12, lr}          @push all the variables first
     @ADD      SP,SP,#40         ;decrement stack pointer,to accomodate two variables
-    ldr       r4, [sp, #40]             @load the threshold address
-
-    mov       r8, #8                    @Number of 4x8 blocks to be processed
-    mov       r10, #0                   @Sad
-    mov       r7, #0                    @Nonzero info
+    ldr           r4, [sp, #40]         @load the threshold address
+    vpush         {d8-d15}
+    mov           r8, #8                @Number of 4x8 blocks to be processed
+    mov           r10, #0               @Sad
+    mov           r7, #0                @Nonzero info
     @----------------------------------------------------
 
-    vld1.u8   d30, [r0], r2             @I  load 8 pix src row 1
+    vld1.u8       d30, [r0], r2         @I  load 8 pix src row 1
 
-    vld1.u8   d31, [r1], r3             @I  load 8 pix pred row 1
+    vld1.u8       d31, [r1], r3         @I  load 8 pix pred row 1
 
-    vld1.u8   d28, [r0], r2             @I  load 8 pix src row 2
+    vld1.u8       d28, [r0], r2         @I  load 8 pix src row 2
 
-    vld1.u8   d29, [r1], r3             @I  load 8 pix pred row 2
+    vld1.u8       d29, [r1], r3         @I  load 8 pix pred row 2
 
-    vld1.u8   d26, [r0], r2             @I  load 8 pix src row 3
-    vabdl.u8  q0, d30, d31              @I  Abs diff r1 blk 12
+    vld1.u8       d26, [r0], r2         @I  load 8 pix src row 3
+    vabdl.u8      q0, d30, d31          @I  Abs diff r1 blk 12
 
-    vld1.u8   d27, [r1], r3             @I  load 8 pix pred row 3
+    vld1.u8       d27, [r1], r3         @I  load 8 pix pred row 3
 
-    vld1.u8   d24, [r0], r2             @I  load 8 pix src row 4
+    vld1.u8       d24, [r0], r2         @I  load 8 pix src row 4
 
-    vld1.u8   d25, [r1], r3             @I  load 8 pix pred row 4
-    vabdl.u8  q1, d28, d29              @I  Abs diff r1 blk 12
+    vld1.u8       d25, [r1], r3         @I  load 8 pix pred row 4
+    vabdl.u8      q1, d28, d29          @I  Abs diff r1 blk 12
 
-    vld1.u16  {q11}, [r4]               @I  load the threhold
-    vabdl.u8  q2, d26, d27              @I  Abs diff r1 blk 12
+    vld1.u16      {q11}, [r4]           @I  load the threhold
+    vabdl.u8      q2, d26, d27          @I  Abs diff r1 blk 12
 
-    vabdl.u8  q3, d24, d25              @I  Abs diff r1 blk 12
+    vabdl.u8      q3, d24, d25          @I  Abs diff r1 blk 12
 
 
 
@@ -1013,128 +1021,128 @@ core_loop:
                                         @S5  S6  S7  S8     A5  A6  A7  A8
                                         @S9  S10 S11 S12    A9  A10 A11 A12
                                         @S13 S14 S15 S16    A13 A14 A15 A16
-    ands      r11, r8, #1               @II See if we are at even or odd block
-    vadd.u16  q4 , q0, q3               @I  Add r1 r4
-    lsl       r11, r2, #2               @II Move back src 4 rows
+    ands          r11, r8, #1           @II See if we are at even or odd block
+    vadd.u16      q4 , q0, q3           @I  Add r1 r4
+    lsl           r11, r2, #2           @II Move back src 4 rows
 
-    subeq     r0, r0, r11               @II Move back src 4 rows if we are at even block
-    vadd.u16  q5 , q1, q2               @I  Add r2 r3
-    addeq     r0, r0, #8                @II Move src 8 cols forward if we are at even block
+    subeq         r0, r0, r11           @II Move back src 4 rows if we are at even block
+    vadd.u16      q5 , q1, q2           @I  Add r2 r3
+    addeq         r0, r0, #8            @II Move src 8 cols forward if we are at even block
 
-    lsl       r11, r3, #2               @II Move back pred 4 rows
-    vtrn.16   d8 , d10                  @I trnspse 1
-    subeq     r1, r1, r11               @II Move back pred 4 rows if we are at even block
+    lsl           r11, r3, #2           @II Move back pred 4 rows
+    vtrn.16       d8 , d10              @I trnspse 1
+    subeq         r1, r1, r11           @II Move back pred 4 rows if we are at even block
 
-    addeq     r1, r1, #8                @II Move pred 8 cols forward if we are at even block
-    vtrn.16   d9 , d11                  @I trnspse 2
-    subne     r0, r0, #8                @II Src 8clos back for odd rows
+    addeq         r1, r1, #8            @II Move pred 8 cols forward if we are at even block
+    vtrn.16       d9 , d11              @I trnspse 2
+    subne         r0, r0, #8            @II Src 8clos back for odd rows
 
-    subne     r1, r1, #8                @II Pred 8 cols back for odd rows
-    vtrn.32   d10, d11                  @I trnspse 4
+    subne         r1, r1, #8            @II Pred 8 cols back for odd rows
+    vtrn.32       d10, d11              @I trnspse 4
 
 
-    vtrn.32   d8 , d9                   @I trnspse 3
-    vswp      d10, d11                  @I rearrange so that the q4 and q5 add properly
+    vtrn.32       d8 , d9               @I trnspse 3
+    vswp          d10, d11              @I rearrange so that the q4 and q5 add properly
                                         @D8     S1 S4 A1 A4
                                         @D9     S2 S3 A2 A3
                                         @D11    S1 S4 A1 A4
                                         @D10    S2 S3 A2 A3
 
-    vadd.s16  q6, q4, q5                @I  Get s1 s4
-    vld1.u8   d30, [r0], r2             @II load first 8 pix src row 1
+    vadd.s16      q6, q4, q5            @I  Get s1 s4
+    vld1.u8       d30, [r0], r2         @II load first 8 pix src row 1
 
-    vtrn.s16  d12, d13                  @I  Get s2 s3
+    vtrn.s16      d12, d13              @I  Get s2 s3
                                         @D12 S1 S4 A1 A4
                                         @D13 S2 S3 A2 A3
 
-    vshl.s16  q7, q6 , #1               @I  si  = si<<1
-    vld1.u8   d31, [r1], r3             @II load first 8 pix pred row 1
+    vshl.s16      q7, q6 , #1           @I  si  = si<<1
+    vld1.u8       d31, [r1], r3         @II load first 8 pix pred row 1
 
-    vpadd.s16 d16, d12, d13             @I  (s1 + s4) (s2 + s3)
-    vld1.u8   d28, [r0], r2             @II load first 8 pix src row 2
+    vpadd.s16     d16, d12, d13         @I  (s1 + s4) (s2 + s3)
+    vld1.u8       d28, [r0], r2         @II load first 8 pix src row 2
                                         @   D16  S14 A14 S23 A23
-    vrev32.16 d0, d16                   @I
-    vuzp.s16  d16, d0                   @I
+    vrev32.16     d0, d16               @I
+    vuzp.s16      d16, d0               @I
                                         @D16  S14 S23 A14 A23
-    vadd.s16  d17, d12, d13             @I  (s1 + s2) (s3 + s4)
-    vld1.u8   d29, [r1], r3             @II load first 8 pix pred row 2
+    vadd.s16      d17, d12, d13         @I  (s1 + s2) (s3 + s4)
+    vld1.u8       d29, [r1], r3         @II load first 8 pix pred row 2
                                         @D17  S12 S34 A12 A34
 
-    vrev32.16 q9, q7                    @I  Rearrange si's
+    vrev32.16     q9, q7                @I  Rearrange si's
                                         @Q9  Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
 
                                         @D12    S1 S4 A1 A4
                                         @D19    Z3 Z2 Y3 Y2
-    vsub.s16  d8, d12, d19              @I  (s1 - (s3<<1)) (s4 - (s2<<1))
-    vld1.u8   d26, [r0], r2             @II load first 8 pix src row 3
+    vsub.s16      d8, d12, d19          @I  (s1 - (s3<<1)) (s4 - (s2<<1))
+    vld1.u8       d26, [r0], r2         @II load first 8 pix src row 3
                                         @D13    S2 S3 A2 A3
                                         @D18    Z4 Z1 Y4 Y1
-    vsub.s16  d9, d13, d18              @I  (s2 - (s4<<1)) (s3 - (s1<<1))
-    vld1.u8   d27, [r1], r3             @II load first 8 pix pred row 3
+    vsub.s16      d9, d13, d18          @I  (s2 - (s4<<1)) (s3 - (s1<<1))
+    vld1.u8       d27, [r1], r3         @II load first 8 pix pred row 3
                                         @Q10    S8 S5 A8 A5 S7 S4 A7 A4
 
                                         @D16  S14 S23 A14 A23
-    vpadd.s16 d10, d16, d17             @I  Get sad by adding s1 s2 s3 s4
-    vld1.u8   d24, [r0], r2             @II load first 8 pix src row 4
+    vpadd.s16     d10, d16, d17         @I  Get sad by adding s1 s2 s3 s4
+    vld1.u8       d24, [r0], r2         @II load first 8 pix src row 4
                                         @D22 SAD1 SAD2 junk junk
 
 
                                         @Q8     S2 S1 A2 A1 S6 S3 A6 A3
                                         @Q10    S8 S5 A8 A5 S7 S4 A7 A4
-    vtrn.32   q8, q4                    @I  Rearrange to make ls of each block togather
+    vtrn.32       q8, q4                @I  Rearrange to make ls of each block togather
                                         @Q8     S2 S1 S8 S5 S6 S3 S7 S4
                                         @Q10    A2 A1 A8 A5 A6 A3 A7 A4
 
 
-    ldrh      r11, [r4, #16]            @I  Load the threshold for DC val blk 1
-    vdup.s16  q6, d10[0]                @I  Get the sad blk 1
-    vabdl.u8  q0, d30, d31              @II Abs diff r1 blk 12
+    ldrh          r11, [r4, #16]        @I  Load the threshold for DC val blk 1
+    vdup.s16      q6, d10[0]            @I  Get the sad blk 1
+    vabdl.u8      q0, d30, d31          @II Abs diff r1 blk 12
 
-    vshl.s16  q7, q6, #1                @I  sad_2 = sad_1<<1
-    vmov.s16  r9, d10[0]                @I  Get the sad for block 1
+    vshl.s16      q7, q6, #1            @I  sad_2 = sad_1<<1
+    vmov.s16      r9, d10[0]            @I  Get the sad for block 1
 
-    vsub.s16  q9, q7, q8                @I  Add to the lss
-    vmov.s16  r5, d10[1]                @I  Get the sad for block 2
+    vsub.s16      q9, q7, q8            @I  Add to the lss
+    vmov.s16      r5, d10[1]            @I  Get the sad for block 2
 
-    vcle.s16  q7, q11, q9               @I  Add to the lss
-    vld1.u8   d25, [r1], r3             @II load first 8 pix pred row 4
+    vcle.s16      q7, q11, q9           @I  Add to the lss
+    vld1.u8       d25, [r1], r3         @II load first 8 pix pred row 4
 
-    vdup.s16  q15, d10[1]               @I  Get the sad blk 1
-    vabdl.u8  q1, d28, d29              @II Abs diff r1 blk 12
+    vdup.s16      q15, d10[1]           @I  Get the sad blk 1
+    vabdl.u8      q1, d28, d29          @II Abs diff r1 blk 12
 
 
-    vshl.s16  q14, q15, #1              @I  sad_2 = sad_1<<1
-    vsub.s16  q3, q14, q4               @I  Add to the lss
-    vcle.s16  q15, q11, q3              @I  Add to the lss
+    vshl.s16      q14, q15, #1          @I  sad_2 = sad_1<<1
+    vsub.s16      q3, q14, q4           @I  Add to the lss
+    vcle.s16      q15, q11, q3          @I  Add to the lss
 
-    ADD       R10, R10, R9              @I  Add to  the global sad blk 1
-    vtrn.u8   q15, q7                   @I  get all comparison bits to one reg
-    vabdl.u8  q2, d26, d27              @II Abs diff r1 blk 12
+    ADD           R10, R10, R9          @I  Add to  the global sad blk 1
+    vtrn.u8       q15, q7               @I  get all comparison bits to one reg
+    vabdl.u8      q2, d26, d27          @II Abs diff r1 blk 12
 
-    ADD       R10, R10, R5              @I  Add to  the global sad blk 2
-    vshr.u8   q14, q15, #7              @I  Shift the bits so that no  overflow occurs
-    cmp       r11, r9
+    ADD           R10, R10, R5          @I  Add to  the global sad blk 2
+    vshr.u8       q14, q15, #7          @I  Shift the bits so that no  overflow occurs
+    cmp           r11, r9
 
-    movle     r7, #0xf                  @I  If not met mark it by mvoing non zero val to R7 blk 1                   ;I  Compare with threshold blk 1
-    vadd.u8   d28, d28, d29             @I  Add the bits
-    cmp       r11, r5                   @I  Compare with threshold blk 2
+    movle         r7, #0xf              @I  If not met mark it by mvoing non zero val to R7 blk 1                   ;I  Compare with threshold blk 1
+    vadd.u8       d28, d28, d29         @I  Add the bits
+    cmp           r11, r5               @I  Compare with threshold blk 2
 
-    movle     r7, #0xf                  @I  If not met mark it by mvoing non zero val to R7 blk 2
-    vpadd.u8  d28, d28, d29             @I  Add the bits
+    movle         r7, #0xf              @I  If not met mark it by mvoing non zero val to R7 blk 2
+    vpadd.u8      d28, d28, d29         @I  Add the bits
 
-    vmov.u32  r11, d28[0]               @I  Since a set bit now represents a unstatisofrd contifon store it in r11
-    vabdl.u8  q3, d24, d25              @II Abs diff r1 blk 12
+    vmov.u32      r11, d28[0]           @I  Since a set bit now represents a unstatisofrd contifon store it in r11
+    vabdl.u8      q3, d24, d25          @II Abs diff r1 blk 12
 
-    orr       r7, r7, r11               @I  get the guy to r11
+    orr           r7, r7, r11           @I  get the guy to r11
 
 
-    sub       r8, r8, #1                @I  Decremrnt block count
+    sub           r8, r8, #1            @I  Decremrnt block count
 
-    cmp       r7, #0                    @I  If we have atlest one non zero block
-    bne       compute_sad_only          @I  if a non zero block is der,From now on compute sad only
+    cmp           r7, #0                @I  If we have atlest one non zero block
+    bne           compute_sad_only      @I  if a non zero block is der,From now on compute sad only
 
-    cmp       r8, #1                    @I  See if we are at the last block
-    bne       core_loop                 @I  If the blocks are zero, lets continue the satdq
+    cmp           r8, #1                @I  See if we are at the last block
+    bne           core_loop             @I  If the blocks are zero, lets continue the satdq
 
 
     @EPILOUGE for core loop
@@ -1142,94 +1150,94 @@ core_loop:
                                         @S5  S6  S7  S8     A5  A6  A7  A8
                                         @S9  S10 S11 S12    A9  A10 A11 A12
                                         @S13 S14 S15 S16    A13 A14 A15 A16
-    vadd.u16  q4 , q0, q3               @Add r1 r4
-    vadd.u16  q5 , q1, q2               @Add r2 r3
+    vadd.u16      q4 , q0, q3           @Add r1 r4
+    vadd.u16      q5 , q1, q2           @Add r2 r3
                                         @D8     S1 S2 S2 S1
                                         @D10    S4 S3 S3 S4
                                         @D9     A1 A2 A2 A1
                                         @D11    A4 A3 A3 A4
-    vtrn.16   d8 , d10                  @I trnspse 1
-    vtrn.16   d9 , d11                  @I trnspse 2
-    vtrn.32   d8 , d9                   @I trnspse 3
-    vtrn.32   d10, d11                  @I trnspse 4
+    vtrn.16       d8 , d10              @I trnspse 1
+    vtrn.16       d9 , d11              @I trnspse 2
+    vtrn.32       d8 , d9               @I trnspse 3
+    vtrn.32       d10, d11              @I trnspse 4
 
-    vswp      d10, d11                  @I rearrange so that the q4 and q5 add properly
+    vswp          d10, d11              @I rearrange so that the q4 and q5 add properly
                                         @D8     S1 S4 A1 A4
                                         @D9     S2 S3 A2 A3
                                         @D11    S1 S4 A1 A4
                                         @D10    S2 S3 A2 A3
-    vadd.s16  q6, q4, q5                @Get s1 s4
-    vtrn.s16  d12, d13                  @Get s2 s3
+    vadd.s16      q6, q4, q5            @Get s1 s4
+    vtrn.s16      d12, d13              @Get s2 s3
                                         @D12 S1 S4 A1 A4
                                         @D13 S2 S3 A2 A3
 
-    vshl.s16  q7, q6 , #1               @si  = si<<1
-    vmov.s16  r9, d10[0]                @Get the sad for block 1
+    vshl.s16      q7, q6 , #1           @si  = si<<1
+    vmov.s16      r9, d10[0]            @Get the sad for block 1
 
-    vpadd.s16 d16, d12, d13             @(s1 + s4) (s2 + s3)
-    vmov.s16  r5, d10[1]                @Get the sad for block 2
+    vpadd.s16     d16, d12, d13         @(s1 + s4) (s2 + s3)
+    vmov.s16      r5, d10[1]            @Get the sad for block 2
                                         @D16  S14 A14 S23 A23
-    vrev32.16 d30, d16                  @
-    vuzp.s16  d16, d30                  @
+    vrev32.16     d30, d16              @
+    vuzp.s16      d16, d30              @
                                         @D16  S14 S23 A14 A23
-    vadd.s16  d17, d12, d13             @(s1 + s2) (s3 + s4)
+    vadd.s16      d17, d12, d13         @(s1 + s2) (s3 + s4)
                                         @D17  S12 S34 A12 A34
 
-    vrev32.16 q9, q7                    @Rearrange si's
+    vrev32.16     q9, q7                @Rearrange si's
                                         @Q9  Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
 
                                         @D12    S1 S4 A1 A4
                                         @D19    Z3 Z2 Y3 Y2
-    vsub.s16  d8, d12, d19              @(s1 - (s3<<1)) (s4 - (s2<<1))
+    vsub.s16      d8, d12, d19          @(s1 - (s3<<1)) (s4 - (s2<<1))
                                         @D13    S2 S3 A2 A3
                                         @D18    Z4 Z1 Y4 Y1
-    vsub.s16  d9, d13, d18              @(s2 - (s4<<1)) (s3 - (s1<<1))
+    vsub.s16      d9, d13, d18          @(s2 - (s4<<1)) (s3 - (s1<<1))
                                         @Q10    S8 S5 A8 A5 S7 S4 A7 A4
 
                                         @D16  S14 S23 A14 A23
-    vpadd.s16 d10, d16, d17             @I  Get sad by adding s1 s2 s3 s4
+    vpadd.s16     d10, d16, d17         @I  Get sad by adding s1 s2 s3 s4
                                         @D22 SAD1 SAD2 junk junk
-    vmov.u16  r9, d10[0]                @Get the sad for block 1
-    vmov.u16  r5, d10[1]                @Get the sad for block 2
+    vmov.u16      r9, d10[0]            @Get the sad for block 1
+    vmov.u16      r5, d10[1]            @Get the sad for block 2
 
                                         @Q8     S2 S1 A2 A1 S6 S3 A6 A3
                                         @Q10    S8 S5 A8 A5 S7 S4 A7 A4
-    ldrh      r11, [r4, #16]            @Load the threshold for DC val blk 1
-    vtrn.32   q8, q4                    @Rearrange to make ls of each block togather
-    ADD       R10, R10, R9              @Add to  the global sad blk 1
+    ldrh          r11, [r4, #16]        @Load the threshold for DC val blk 1
+    vtrn.32       q8, q4                @Rearrange to make ls of each block togather
+    ADD           R10, R10, R9          @Add to  the global sad blk 1
 
                                         @Q8     S2 S1 S8 S5 S6 S3 S7 S4
                                         @Q10    A2 A1 A8 A5 A6 A3 A7 A4
 
-    vld1.u16  {q11}, [r4]               @load the threhold
-    ADD       R10, R10, R5              @Add to  the global sad blk 2
+    vld1.u16      {q11}, [r4]           @load the threhold
+    ADD           R10, R10, R5          @Add to  the global sad blk 2
 
-    vdup.u16  q6, d10[0]                @Get the sad blk 1
+    vdup.u16      q6, d10[0]            @Get the sad blk 1
 
-    cmp       r11, r9                   @Compare with threshold blk 1
-    vshl.u16  q7, q6, #1                @sad_2 = sad_1<<1
+    cmp           r11, r9               @Compare with threshold blk 1
+    vshl.u16      q7, q6, #1            @sad_2 = sad_1<<1
 
-    vsub.s16  q9, q7, q8                @Add to the lss
+    vsub.s16      q9, q7, q8            @Subtract the ls from sad_2
 
-    vcle.s16  q15, q11, q9              @Add to the lss
-    movle     r7, #0xf                  @If not met mark it by mvoing non zero val to R7 blk 1
+    vcle.s16      q15, q11, q9          @Flag lanes where threshold <= (sad_2 - ls)
+    movle         r7, #0xf              @If not met mark it by moving non zero val to R7 blk 1
 
-    cmp       r11, r5                   @Compare with threshold blk 2
-    vdup.u16  q14, d10[1]               @Get the sad blk 1
+    cmp           r11, r5               @Compare with threshold blk 2
+    vdup.u16      q14, d10[1]           @Get the sad blk 2
 
-    vshl.u16  q13, q14, #1              @sad_2 = sad_1<<1
-    vsub.s16  q12, q13, q4              @Add to the lss
-    vcle.s16  q14, q11, q12             @Add to the lss
-    movle     r7, #0xf                  @If not met mark it by mvoing non zero val to R7 blk 2
+    vshl.u16      q13, q14, #1          @sad_2 = sad_1<<1
+    vsub.s16      q12, q13, q4          @Subtract the ls from sad_2
+    vcle.s16      q14, q11, q12         @Flag lanes where threshold <= (sad_2 - ls)
+    movle         r7, #0xf              @If not met mark it by moving non zero val to R7 blk 2
 
-    vtrn.u8   q14, q15                  @get all comparison bits to one reg
-    vshr.u8   q14, q14, #7              @Shift the bits so that no  overflow occurs
-    vadd.u8   d28, d28, d29             @Add the bits
-    vpadd.u8  d28, d28, d29             @Add the bits
-    vmov.u32  r11, d28[0]               @Since a set bit now represents a unstatisofrd contifon store it in r11
-    orr       r7, r7, r11               @get the guy to r11
+    vtrn.u8       q14, q15              @get all comparison bits to one reg
+    vshr.u8       q14, q14, #7          @Shift the bits so that no  overflow occurs
+    vadd.u8       d28, d28, d29         @Add the bits
+    vpadd.u8      d28, d28, d29         @Add the bits
+    vmov.u32      r11, d28[0]           @Since a set bit now represents an unsatisfied condition, store it in r11
+    orr           r7, r7, r11           @Merge the comparison bits from r11 into r7
 
-    b         funcend_sad_16x16         @Since all blocks ar processed nw, got to end
+    b             funcend_sad_16x16     @Since all blocks are processed now, go to end
 
 compute_sad_only:                       @This block computes SAD only, so will be lighter
                                         @IT will start processign at n odd block
@@ -1237,117 +1245,119 @@ compute_sad_only:                       @This block computes SAD only, so will b
                                         @and then for two blocks at a time
                                         @The counter is r7, hence r7 blocks will be processed
 
-    and       r11, r8, #1               @Get the last bit of counter
-    cmp       r11, #0                   @See if we are at even or odd block
+    and           r11, r8, #1           @Get the last bit of counter
+    cmp           r11, #0               @See if we are at even or odd block
                                         @iif the blk is even we just have to set the pointer to the
                                         @start of current row
 
-    lsleq     r11, r2, #2               @I  Move back src 4 rows
-    subeq     r0, r0, r11               @I  Move back src 4 rows if we are at even block
+    lsleq         r11, r2, #2           @I  Move back src 4 rows
+    subeq         r0, r0, r11           @I  Move back src 4 rows if we are at even block
 
-    lsleq     r11, r3, #2               @I  Move back pred 4 rows
-    subeq     r1, r1, r11               @I  Move back pred 4 rows if we are at even block
+    lsleq         r11, r3, #2           @I  Move back pred 4 rows
+    subeq         r1, r1, r11           @I  Move back pred 4 rows if we are at even block
     @ADDEQ R8,R8,#2         ;Inc counter
-    beq       skip_odd_blk              @If the blk is odd we have to compute sad
+    beq           skip_odd_blk          @If the blk is odd we have to compute sad
 
 
-    vadd.u16  q4, q0, q1                @Add SAD of row1 and row2
-    vadd.u16  q5, q2, q3                @Add SAD of row3 and row4
-    vadd.u16  q6, q4, q5                @Add SAD of row 1-4
-    vadd.u16  d14, d12, d13             @Add Blk1 and blk2
-    vpadd.u16 d16, d14, d15             @Add col 1-2 and 3-4
-    vpadd.u16 d18, d16, d17             @Add col 12-34
+    vadd.u16      q4, q0, q1            @Add SAD of row1 and row2
+    vadd.u16      q5, q2, q3            @Add SAD of row3 and row4
+    vadd.u16      q6, q4, q5            @Add SAD of row 1-4
+    vadd.u16      d14, d12, d13         @Add Blk1 and blk2
+    vpadd.u16     d16, d14, d15         @Add col 1-2 and 3-4
+    vpadd.u16     d18, d16, d17         @Add col 12-34
 
-    vmov.u16  r9, d18[0]                @Move sad to arm
-    ADD       R10, R10, R9              @Add to  the global sad
+    vmov.u16      r9, d18[0]            @Move sad to arm
+    ADD           R10, R10, R9          @Add to  the global sad
 
-    sub       r8, r8, #1                @Dec counter
-    cmp       r8, #0                    @See if we processed last block
-    beq       funcend_sad_16x16         @if lprocessed last block goto end of func
+    sub           r8, r8, #1            @Dec counter
+    cmp           r8, #0                @See if we processed last block
+    beq           funcend_sad_16x16     @If we processed the last block go to end of func
 
-    sub       r0, r0, #8                @Since we processed od block move back src by 8 cols
-    sub       r1, r1, #8                @Since we processed od block move back pred by 8 cols
+    sub           r0, r0, #8            @Since we processed odd block move back src by 8 cols
+    sub           r1, r1, #8            @Since we processed odd block move back pred by 8 cols
 
 skip_odd_blk:
 
-    vmov.s16  q0, #0                    @Initialize the accumulator
-    vmov.s16  q1, #0                    @Initialize the accumulator
+    vmov.s16      q0, #0                @Initialize the accumulator
+    vmov.s16      q1, #0                @Initialize the accumulator
 
-    vld1.u8   {q15}, [r0], r2           @load src r1
-    vld1.u8   {q14}, [r1], r3           @load pred r1
+    vld1.u8       {q15}, [r0], r2       @load src r1
+    vld1.u8       {q14}, [r1], r3       @load pred r1
 
-    vld1.u8   {q13}, [r0], r2           @load src r2
-    vld1.u8   {q12}, [r1], r3           @load pred r2
+    vld1.u8       {q13}, [r0], r2       @load src r2
+    vld1.u8       {q12}, [r1], r3       @load pred r2
 
-    vld1.u8   {q11}, [r0], r2           @load src r3
-    vld1.u8   {q10}, [r1], r3           @load pred r2
+    vld1.u8       {q11}, [r0], r2       @load src r3
+    vld1.u8       {q10}, [r1], r3       @load pred r3
 
-    vld1.u8   {q9}, [r0], r2            @load src r4
-    vld1.u8   {q8}, [r1], r3            @load pred r4
+    vld1.u8       {q9}, [r0], r2        @load src r4
+    vld1.u8       {q8}, [r1], r3        @load pred r4
 
-    cmp       r8, #2
-    beq       sad_epilouge
+    cmp           r8, #2
+    beq           sad_epilouge
 
 sad_loop:
 
-    vabal.u8  q0, d30, d28              @I  accumulate Abs diff R1
-    vabal.u8  q1, d31, d29              @I  accumulate Abs diff R1
+    vabal.u8      q0, d30, d28          @I  accumulate Abs diff R1
+    vabal.u8      q1, d31, d29          @I  accumulate Abs diff R1
 
-    vld1.u8   {q15}, [r0], r2           @II load r1 src
-    vabal.u8  q0, d26, d24              @I  accumulate Abs diff R2
+    vld1.u8       {q15}, [r0], r2       @II load r1 src
+    vabal.u8      q0, d26, d24          @I  accumulate Abs diff R2
 
-    vld1.u8   {q14}, [r1], r3           @II load r1 pred
-    vabal.u8  q1, d27, d25              @I  accumulate Abs diff R2
+    vld1.u8       {q14}, [r1], r3       @II load r1 pred
+    vabal.u8      q1, d27, d25          @I  accumulate Abs diff R2
 
-    vld1.u8   {q13}, [r0], r2           @II load r3 src
-    vabal.u8  q0, d22, d20              @I  accumulate Abs diff R3
+    vld1.u8       {q13}, [r0], r2       @II load r2 src
+    vabal.u8      q0, d22, d20          @I  accumulate Abs diff R3
 
-    vld1.u8   {q12}, [r1], r3           @II load r2 pred
-    vabal.u8  q1, d23, d21              @I  accumulate Abs diff R3
+    vld1.u8       {q12}, [r1], r3       @II load r2 pred
+    vabal.u8      q1, d23, d21          @I  accumulate Abs diff R3
 
-    vld1.u8   {q11}, [r0], r2           @II load r3 src
-    vabal.u8  q0, d18, d16              @I  accumulate Abs diff R4
+    vld1.u8       {q11}, [r0], r2       @II load r3 src
+    vabal.u8      q0, d18, d16          @I  accumulate Abs diff R4
 
 
-    sub       r8, r8, #2                @Since we processe 16 pix @a time, dec by 2
-    vld1.u8   {q10}, [r1], r3           @II load r3 pred
-    vabal.u8  q1, d19, d17              @I  accumulate Abs diff R4
+    sub           r8, r8, #2            @Since we process 16 pix at a time, dec by 2
+    vld1.u8       {q10}, [r1], r3       @II load r3 pred
+    vabal.u8      q1, d19, d17          @I  accumulate Abs diff R4
 
-    cmp       r8, #2                    @Check if last loop
-    vld1.u8   {q9}, [r0], r2            @II load r4 src
-    vld1.u8   {q8}, [r1], r3            @II load r4 pred
+    cmp           r8, #2                @Check if last loop
+    vld1.u8       {q9}, [r0], r2        @II load r4 src
+    vld1.u8       {q8}, [r1], r3        @II load r4 pred
 
-    bne       sad_loop                  @Go back to SAD computation
+    bne           sad_loop              @Go back to SAD computation
 
 sad_epilouge:
-    vabal.u8  q0, d30, d28              @Accumulate Abs diff R1
-    vabal.u8  q1, d31, d29              @Accumulate Abs diff R1
+    vabal.u8      q0, d30, d28          @Accumulate Abs diff R1
+    vabal.u8      q1, d31, d29          @Accumulate Abs diff R1
 
-    vabal.u8  q0, d26, d24              @Accumulate Abs diff R2
-    vabal.u8  q1, d27, d25              @Accumulate Abs diff R2
+    vabal.u8      q0, d26, d24          @Accumulate Abs diff R2
+    vabal.u8      q1, d27, d25          @Accumulate Abs diff R2
 
-    vabal.u8  q0, d22, d20              @Accumulate Abs diff R3
-    vabal.u8  q1, d23, d21              @Aaccumulate Abs diff R3
+    vabal.u8      q0, d22, d20          @Accumulate Abs diff R3
+    vabal.u8      q1, d23, d21          @Accumulate Abs diff R3
 
-    vabal.u8  q0, d18, d16              @Accumulate Abs diff R4
-    vabal.u8  q1, d19, d17              @Accumulate Abs diff R4
+    vabal.u8      q0, d18, d16          @Accumulate Abs diff R4
+    vabal.u8      q1, d19, d17          @Accumulate Abs diff R4
 
-    vadd.u16  q2, q0, q1                @ADD two accumulators
-    vadd.u16  d6, d4, d5                @Add two blk sad
-    vpadd.u16 d8, d6, d7                @Add col 1-2 and 3-4 sad
-    vpadd.u16 d10, d8, d9               @Add col 12-34 sad
+    vadd.u16      q2, q0, q1            @ADD two accumulators
+    vadd.u16      d6, d4, d5            @Add two blk sad
+    vpadd.u16     d8, d6, d7            @Add col 1-2 and 3-4 sad
+    vpadd.u16     d10, d8, d9           @Add col 12-34 sad
 
-    vmov.u16  r9, d10[0]                @move SAD to ARM
-    ADD       R10, R10, R9              @Add to  the global sad
+    vmov.u16      r9, d10[0]            @move SAD to ARM
+    ADD           R10, R10, R9          @Add to  the global sad
 
 funcend_sad_16x16:                      @End of fucntion process
-    ldr       r5, [sp, #44]
-    ldr       r6, [sp, #48]
 
-    str       r7, [r6]                  @Store the is zero reg
-    str       r10, [r5]                 @Store sad
+    vpop          {d8-d15}
+    ldr           r5, [sp, #44]
+    ldr           r6, [sp, #48]
+
+    str           r7, [r6]              @Store the is zero reg
+    str           r10, [r5]             @Store sad
 
     @SUB SP,SP,#40
-    pop       {r4-r12, pc}
+    pop           {r4-r12, pc}
 
 
diff --git a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
index c442077..e768c21 100644
--- a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
+++ b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
@@ -76,7 +76,7 @@
 .p2align 2
 .include "ih264_neon_macros.s"
 
-.globl ih264e_evaluate_intra16x16_modes_av8
+.global ih264e_evaluate_intra16x16_modes_av8
 
 ih264e_evaluate_intra16x16_modes_av8:
 
diff --git a/encoder/armv8/ih264e_half_pel_av8.s b/encoder/armv8/ih264e_half_pel_av8.s
index 6dbd8f8..817faa6 100644
--- a/encoder/armv8/ih264e_half_pel_av8.s
+++ b/encoder/armv8/ih264e_half_pel_av8.s
@@ -1015,10 +1015,3 @@ filter_2dvh_skip_row:
 
 
 ///*****************************************
-
-
-
-
-
-
-    .section .note.gnu-stack,"",%progbits
diff --git a/encoder/armv8/ime_distortion_metrics_av8.s b/encoder/armv8/ime_distortion_metrics_av8.s
index 99ebc8a..47c3425 100644
--- a/encoder/armv8/ime_distortion_metrics_av8.s
+++ b/encoder/armv8/ime_distortion_metrics_av8.s
@@ -975,4 +975,3 @@ satdq_end_func:
     ldp       d8, d9, [sp], #16
     pop_v_regs
     ret
-    .section .note.gnu-stack,"",%progbits
diff --git a/encoder/x86/ih264e_intra_modes_eval_ssse3.c b/encoder/x86/ih264e_intra_modes_eval_ssse3.c
index 657921f..0f4a9ad 100644
--- a/encoder/x86/ih264e_intra_modes_eval_ssse3.c
+++ b/encoder/x86/ih264e_intra_modes_eval_ssse3.c
@@ -487,7 +487,7 @@ void ih264e_evaluate_intra_4x4_modes_ssse3(UWORD8 *pu1_src,
                               INT_MAX, INT_MAX, INT_MAX, INT_MAX };
 
     WORD32 min_cost;
-    WORD32 lambda4 = u4_lambda << 2;
+    UWORD32 lambda4 = u4_lambda << 2;
     WORD32 dst_strd2, dst_strd3;
 
     __m128i left_top_16x8b, src_16x8b, pred0_16x8b, sad_8x16b;
diff --git a/encoder/x86/ime_distortion_metrics_sse42.c b/encoder/x86/ime_distortion_metrics_sse42.c
index 0876788..baf18a4 100644
--- a/encoder/x86/ime_distortion_metrics_sse42.c
+++ b/encoder/x86/ime_distortion_metrics_sse42.c
@@ -110,6 +110,7 @@ void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src,
     __m128i res_r0, res_r1, res_r2, res_r3;
     __m128i sad_val;
     int val1, val2;
+    UNUSED (i4_max_sad);
 
     // Row 0-3 sad calculation
     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
@@ -248,6 +249,7 @@ void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src,
                     WORD32 i4_max_sad,
                     WORD32 *pi4_mb_distortion)
 {
+    UNUSED (i4_max_sad);
     __m128i src_r0, src_r1, src_r2, src_r3;
     __m128i est_r0, est_r1, est_r2, est_r3;
     __m128i res_r0, res_r1, res_r2, res_r3;
@@ -498,6 +500,7 @@ void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src,
                                 WORD32 i4_max_sad,
                                 WORD32 *pi4_mb_distortion)
 {
+    UNUSED (i4_max_sad);
     __m128i src_r0, src_r1, src_r2, src_r3;
     __m128i est_r0, est_r1, est_r2, est_r3;
     __m128i res_r0, res_r1, res_r2, res_r3;
diff --git a/test/encoder/main.c b/test/encoder/main.c
index 2a9635d..bb9cabf 100644
--- a/test/encoder/main.c
+++ b/test/encoder/main.c
@@ -29,7 +29,10 @@
 #include <assert.h>
 #include <string.h>
 #include <sys/time.h>
+
+#ifndef IOS
 #include <malloc.h>
+#endif
 
 #ifdef WINDOWS_TIMER
 #include "windows.h"
@@ -1989,7 +1992,7 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
 /*         20 11 2013   100189          Initial Version                      */
 /*****************************************************************************/
 #ifdef IOS
-int h264enc_main(char * homedir)
+int h264enc_main(char * homedir,char *documentdir, int screen_wd, int screen_ht)
 #else
 int main(int argc, char *argv[])
 #endif
@@ -2036,6 +2039,9 @@ int main(int argc, char *argv[])
         strcpy(ac_cfg_fname, argv[1]);
     }
 
+#else
+    strcpy(ac_cfg_fname, "test.cfg");
+
 #endif
 
     /*************************************************************************/
@@ -2406,22 +2412,22 @@ int main(int argc, char *argv[])
 
 #ifdef IOS
     /* Correct file paths */
-    sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_ip_fname);
+    sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_ip_fname);
     strcpy (s_app_ctxt.ac_ip_fname, filename_with_path);
 
-    sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_op_fname);
+    sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_op_fname);
     strcpy (s_app_ctxt.ac_op_fname, filename_with_path);
 
-    sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_recon_fname);
+    sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_recon_fname);
     strcpy (s_app_ctxt.ac_recon_fname, filename_with_path);
 
-    sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_chksum_fname);
+    sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_chksum_fname);
     strcpy (s_app_ctxt.ac_chksum_fname, filename_with_path);
 
-    sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_mb_info_fname);
+    sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_mb_info_fname);
     strcpy (s_app_ctxt.ac_mb_info_fname, filename_with_path);
 
-    sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_pic_info_fname);
+    sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_pic_info_fname);
     strcpy (s_app_ctxt.ac_pic_info_fname, filename_with_path);
 #endif