author    Hamsalekha S <hamsalekha.s@ittiam.com>    2015-03-13 21:24:58 +0530
committer Hamsalekha S <hamsalekha.s@ittiam.com>    2015-04-02 15:59:02 +0530
commit    8d3d303c7942ced6a987a52db8977d768dc3605f (patch)
tree      cc806c96794356996b13ba9970941d0aed74a97e /common/arm
parent    3956d913d37327dcb340f836e604b04bd478b158 (diff)
download  android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.gz
          android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.bz2
          android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.zip
Initial version
Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
Diffstat (limited to 'common/arm')
-rwxr-xr-x  common/arm/ih264_arm_memory_barrier.s  77
-rwxr-xr-x  common/arm/ih264_deblk_chroma_a9.s  1337
-rwxr-xr-x  common/arm/ih264_deblk_luma_a9.s  1092
-rwxr-xr-x  common/arm/ih264_default_weighted_pred_a9q.s  359
-rwxr-xr-x  common/arm/ih264_ihadamard_scaling_a9.s  250
-rwxr-xr-x  common/arm/ih264_inter_pred_chroma_a9q.s  254
-rwxr-xr-x  common/arm/ih264_inter_pred_filters_luma_horz_a9q.s  245
-rwxr-xr-x  common/arm/ih264_inter_pred_filters_luma_vert_a9q.s  301
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_bilinear_a9q.s  398
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_copy_a9q.s  253
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s  441
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s  1044
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s  266
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s  505
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s  355
-rwxr-xr-x  common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s  330
-rwxr-xr-x  common/arm/ih264_intra_pred_chroma_a9q.s  551
-rwxr-xr-x  common/arm/ih264_intra_pred_luma_16x16_a9q.s  520
-rwxr-xr-x  common/arm/ih264_intra_pred_luma_4x4_a9q.s  842
-rwxr-xr-x  common/arm/ih264_intra_pred_luma_8x8_a9q.s  1037
-rwxr-xr-x  common/arm/ih264_iquant_itrans_recon_a9.s  871
-rwxr-xr-x  common/arm/ih264_iquant_itrans_recon_dc_a9.s  399
-rwxr-xr-x  common/arm/ih264_itrans_recon_a9.s  216
-rwxr-xr-x  common/arm/ih264_mem_fns_neon.s  268
-rwxr-xr-x  common/arm/ih264_padding_neon.s  646
-rwxr-xr-x  common/arm/ih264_platform_macros.h  152
-rwxr-xr-x  common/arm/ih264_resi_trans_a9.s  604
-rwxr-xr-x  common/arm/ih264_resi_trans_quant_a9.s  694
-rwxr-xr-x  common/arm/ih264_weighted_bi_pred_a9q.s  642
-rwxr-xr-x  common/arm/ih264_weighted_pred_a9q.s  479
30 files changed, 15428 insertions, 0 deletions
diff --git a/common/arm/ih264_arm_memory_barrier.s b/common/arm/ih264_arm_memory_barrier.s
new file mode 100755
index 0000000..523218f
--- /dev/null
+++ b/common/arm/ih264_arm_memory_barrier.s
@@ -0,0 +1,77 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@*******************************************************************************
+@* @file
+@* ih264_arm_memory_barrier.s
+@*
+@* @brief
+@* Contains function definitions for data synchronization.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
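+@ A brief note on the two routines below: each simply issues the corresponding
+@ ARM barrier instruction (DSB or DMB) and returns, so that C code can request
+@ a data synchronization or data memory barrier through a plain function call.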
+
+.text
+.p2align 2
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_arm_dsb
+@* Description : Adds DSB
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 03 07 2008 100355 First version
+@*
+@*****************************************************************************
+
+ .global ih264_arm_dsb
+ih264_arm_dsb:
+ dsb
+ bx lr
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_arm_dmb
+@* Description : Adds DMB
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 03 07 2008 100355 First version
+@*
+@*****************************************************************************
+
+ .global ih264_arm_dmb
+
+ih264_arm_dmb:
+ dmb
+ bx lr
+
+
+
diff --git a/common/arm/ih264_deblk_chroma_a9.s b/common/arm/ih264_deblk_chroma_a9.s
new file mode 100755
index 0000000..66102a7
--- /dev/null
+++ b/common/arm/ih264_deblk_chroma_a9.s
@@ -0,0 +1,1337 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/*****************************************************************************/
+@/* */
+@/* File Name : ih264_deblk_chroma_a9.s */
+@/* */
+@/* Description : Contains function definitions for deblocking chroma */
+@/* edge. Functions are coded in NEON assembly and can */
+@/* be compiled using ARM RVDS. */
+@/* */
+@/* List of Functions : ih264_deblk_chroma_vert_bs4_bp_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_bp_a9() */
+@/* ih264_deblk_chroma_horz_bs4_bp_a9() */
+@/* ih264_deblk_chroma_horz_bslt4_bp_a9() */
+@/* ih264_deblk_chroma_vert_bs4_mbaff_bp_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9() */
+@/* ih264_deblk_chroma_vert_bs4_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_a9() */
+@/* ih264_deblk_chroma_horz_bs4_a9() */
+@/* ih264_deblk_chroma_horz_bslt4_a9() */
+@/* ih264_deblk_chroma_vert_bs4_mbaff_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_mbaff_a9() */
+@/* */
+@/* Issues / Problems : None */
+@/* */
+@/* Revision History : */
+@/* */
+@/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+@/* 28 11 2013 Ittiam Draft */
+@/* 05 01 2015 Kaushik Added double-call functions for */
+@/* Senthoor vertical deblocking, and high */
+@/* profile functions. */
+@/* */
+@/*****************************************************************************/
+
+
+.text
+.p2align 2
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
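+@ A scalar sketch of the bS=4 chroma filter implemented below, pieced together
+@ from the in-line comments (pixel names follow the parameter docs above):
+@     if (|p0 - q0| < alpha && |q1 - q0| < beta && |p1 - p0| < beta) {
+@         p0' = (2*p1 + p0 + q1 + 2) >> 2
+@         q0' = (2*q1 + q0 + p1 + 2) >> 2
+@     }
+@ The NEON code evaluates this for the U and V samples of each row together.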
+
+ .global ih264_deblk_chroma_horz_bs4_bp_a9
+
+ih264_deblk_chroma_horz_bs4_bp_a9:
+
+ stmfd sp!, {r4, lr} @
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma
+ vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v
+ mov r4, r0 @Keeping a backup of the pointer p0 of chroma
+ vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v
+ vdup.8 q10, r2 @Q10 contains alpha
+ vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v
+ vaddl.u8 q4, d6, d0 @
+ vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1
+ vmov.i8 d31, #2 @
+ vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vmlal.u8 q4, d2, d31 @
+ vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U)
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vaddl.u8 q7, d4, d2 @
+ vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1
+ vdup.8 q8, r3 @Q8 contains beta
+ vmlal.u8 q7, d6, d31 @
+ vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vrshrn.u16 d8, q4, #2 @
+ vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vrshrn.u16 d10, q7, #2 @
+ vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vbit q5, q2, q9 @
+ vbit q4, q0, q9 @
+ vst2.8 {d10, d11}, [r4], r1 @
+ vst2.8 {d8, d9}, [r4] @
+ vpop {d8 - d15}
+ ldmfd sp!, {r4, pc} @
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bs4_bp_a9
+
+ih264_deblk_chroma_vert_bs4_bp_a9:
+
+ stmfd sp!, {r12, r14}
+ vpush {d8 - d15}
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+ vdup.8 q11, r2 @Q11 = alpha
+ vdup.8 q12, r3 @Q12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vaddl.u8 q7, d2, d6
+ vaddl.u8 q8, d3, d7 @(p0 + q1)
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vmlal.u8 q7, d0, d31
+ vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q9, d0, d4
+ vaddl.u8 q10, d1, d5 @(p1 + q0)
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q9, d6, d31
+ vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d14, q7, #2
+ vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d18, q9, #2
+ vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit q1, q7, q4
+ vbit q2, q9, q4
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
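+@ A scalar sketch of the bS<4 chroma filter implemented below, pieced together
+@ from the in-line comments (tC0 is read from pu1_cliptab using u4_bs):
+@     C     = tC0 + 1
+@     delta = clip3(-C, C, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+@     p0'   = clip_u8(p0 + delta)
+@     q0'   = clip_u8(q0 - delta)
+@ applied only where bS > 0 and |p0-q0| < alpha, |q1-q0| < beta, |p1-p0| < beta.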
+
+ .global ih264_deblk_chroma_horz_bslt4_bp_a9
+
+ih264_deblk_chroma_horz_bslt4_bp_a9:
+
+ stmfd sp!, {r4-r6, lr} @
+
+ ldrd r4, r5, [sp, #0x10] @r4 = u4_bs , r5 = pu1_cliptab
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p1 of chroma U
+ rev r4, r4 @
+ vmov.32 d12[0], r4 @d12[0] = ui_Bs
+ vld1.32 d16[0], [r5] @D16[0] contains cliptab
+ vld2.8 {d6, d7}, [r0], r1 @Q3=p1
+ vtbl.8 d14, {d16}, d12 @
+ vmovl.u8 q6, d12 @q6 = uc_Bs in each 16 bit scalar
+ mov r6, r0 @Keeping a backup of the pointer to chroma U P0
+ vld2.8 {d4, d5}, [r0], r1 @Q2=p0
+ vmov.i8 d30, #1 @
+ vdup.8 q10, r2 @Q10 contains alpha
+ vld2.8 {d0, d1}, [r0], r1 @Q0=q0
+ vmovl.u8 q7, d14 @
+ vld2.8 {d2, d3}, [r0] @Q1=q1
+ vsubl.u8 q5, d1, d5 @
+ vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0)
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2
+ vsli.16 q7, q7, #8 @
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L
+ vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H
+ vdup.8 q8, r3 @Q8 contains beta
+ vadd.i16 q4, q4, q10 @
+ vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcgt.s16 d12, d12, #0 @Q6 = (us_Bs > 0)
+ vqrshrn.s16 d8, q4, #3 @
+ vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
+ vadd.i8 d14, d14, d30 @Q7 = C = C0+1
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vabs.s8 q3, q4 @Q4 = ABS (i_macro)
+ vmov.i8 d15, d14 @
+ vmov.i8 d13, d12 @
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
+ vbic q6, q6, q9 @final condition
+ vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0)
+ vand q7, q7, q6 @Making delta zero in places where values shouldn't be filtered
+ vqadd.u8 q8, q2, q7 @Q8 = p0 + delta
+ vqsub.u8 q2, q2, q7 @Q2 = p0 - delta
+ vqadd.u8 q9, q0, q7 @Q9 = q0 + delta
+ vqsub.u8 q0, q0, q7 @Q0 = q0 - delta
+ vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
+ vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
+ vst2.8 {d16, d17}, [r6], r1 @
+ vst2.8 {d0, d1}, [r6] @
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r6, pc} @
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bslt4_bp_a9
+
+ih264_deblk_chroma_vert_bslt4_bp_a9:
+
+ stmfd sp!, {r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ ldr r11, [sp, #16] @r11 = ui_Bs
+
+ ldr r10, [sp, #20] @r10 = puc_ClipTab
+ mov r12, r0 @keep a back up of r0 for buffer write
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+
+ vdup.8 q11, r2 @Q11 = alpha
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vdup.8 q12, r3 @Q12 = beta
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vsubl.u8 q7, d0, d6
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vsubl.u8 q8, d1, d7 @(p1 - q1)
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vsubl.u8 q9, d4, d2
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q10, d5, d3 @(q0 - p0)
+ vmov.u16 q14, #4
+ vld1.32 {d24[0]}, [r10] @Load ClipTable
+ rev r11, r11 @Blocking strengths
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+
+ vmov.32 d10[0], r11
+
+ vmla.s16 q7, q9, q14
+ vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1)
+
+ vmovl.u8 q5, d10
+
+
+ vsli.u16 d10, d10, #8
+ vmovl.u16 q5, d10
+ vsli.u32 q5, q5, #16
+ vtbl.8 d12, {d24}, d10
+ vtbl.8 d13, {d24}, d11 @tC0
+ vmov.u8 q12, #1
+ vadd.u8 q6, q6, q12 @tC0 + 1
+ vcge.u8 q5, q5, q12 @u4_bS > 0 ?
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ Q0 - Q3(inputs),
+ @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ Q6 (tC)
+
+ vrshr.s16 q7, q7, #3
+ vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q9, q7, #0
+ vcgt.s16 q10, q8, #0
+ vmovn.i16 d18, q9
+ vmovn.i16 d19, q10 @Q9 = sign(delta)
+ vabs.s16 q7, q7
+ vabs.s16 q8, q8
+ vmovn.u16 d14, q7
+ vmovn.u16 d15, q8
+ vmin.u8 q7, q7, q6 @Q7 = |delta|
+
+ vqadd.u8 q10, q1, q7 @p0+|delta|
+ vqadd.u8 q11, q2, q7 @q0+|delta|
+ vqsub.u8 q12, q1, q7 @p0-|delta|
+ vqsub.u8 q13, q2, q7 @q0-|delta|
+
+ vbit q12, q10, q9 @p0 + delta
+ vbit q11, q13, q9 @q0 - delta
+
+ vbit q1, q12, q4
+ vbit q2, q11, q4
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r10-r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4 on calling twice
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
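+@ As the brief above notes, this _mbaff variant is called twice per edge: it
+@ loads and filters only four rows per call (one vld4/vst4 group) instead of
+@ the eight rows handled by ih264_deblk_chroma_vert_bs4_bp_a9.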
+
+ .global ih264_deblk_chroma_vert_bs4_mbaff_bp_a9
+
+ih264_deblk_chroma_vert_bs4_mbaff_bp_a9:
+
+ stmfd sp!, {r12, r14}
+ vpush {d8 - d15}
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.8 d11, r2 @D11 = alpha
+ vdup.8 d12, r3 @D12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vaddl.u8 q14, d1, d3 @(p0 + q1)
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q13, d0, d2 @(p1 + q0)
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit d1, d7, d4
+ vbit d2, d9, d4
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4 on calling twice
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9
+
+ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9:
+
+ stmfd sp!, {r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ ldr r11, [sp, #16] @r11 = ui_Bs
+
+ ldr r10, [sp, #20] @r10 = puc_ClipTab
+ mov r12, r0 @keep a back up of r0 for buffer write
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.8 d11, r2 @D11 = alpha
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vdup.8 d12, r3 @D12 = beta
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vsubl.u8 q14, d0, d3 @(p1 - q1)
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q12, d2, d1 @(q0 - p0)
+ vmov.u16 q10, #4
+
+ vld1.32 {d31[0]}, [r10] @Load ClipTable
+ rev r11, r11 @Blocking strengths
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vmov.32 d22[0], r11
+ vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1)
+ vmovl.u8 q11, d22
+ vsli.u16 d22, d22, #8
+ vtbl.8 d6, {d31}, d22 @tC0
+ vmov.u8 d12, #1
+ vadd.u8 d6, d6, d12 @tC0 + 1
+ vcge.u8 d5, d22, d12 @u4_bS > 0 ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ D0 - D3(inputs),
+ @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ D6 (tC)
+
+ vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q13, q14, #0
+ vmovn.i16 d9, q13 @D9 = sign(delta)
+ vabs.s16 q14, q14
+ vmovn.u16 d7, q14
+ vmin.u8 d7, d7, d6 @D7 = |delta|
+
+ vqadd.u8 d10, d1, d7 @p0+|delta|
+ vqadd.u8 d11, d2, d7 @q0+|delta|
+ vqsub.u8 d12, d1, d7 @p0-|delta|
+ vqsub.u8 d13, d2, d7 @q0-|delta|
+
+ vbit d12, d10, d9 @p0 + delta
+ vbit d11, d13, d9 @q0 - delta
+
+ vbit d1, d12, d4
+ vbit d2, d11, d4
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r10-r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge when the
+@* boundary strength is set to 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_horz_bs4_a9
+
+ih264_deblk_chroma_horz_bs4_a9:
+
+ stmfd sp!, {r4-r6, lr} @
+
+ ldr r5, [sp, #16] @R5 = alpha_cr
+ ldr r6, [sp, #20] @R6 = beta_cr
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma
+ vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v
+ mov r4, r0 @Keeping a backup of the pointer p0 of chroma
+ vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v
+ vdup.8 d20, r2 @D20 contains alpha_cb
+ vdup.8 d21, r5 @D21 contains alpha_cr
+ vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v
+ vaddl.u8 q4, d6, d0 @
+ vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1
+ vmov.i8 d31, #2 @
+ vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vmlal.u8 q4, d2, d31 @
+ vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U)
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vaddl.u8 q7, d4, d2 @
+ vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1
+ vdup.8 d16, r3 @D16 contains beta_cb
+ vdup.8 d17, r6 @D17 contains beta_cr
+ vmlal.u8 q7, d6, d31 @
+ vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vrshrn.u16 d8, q4, #2 @
+ vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vrshrn.u16 d10, q7, #2 @
+ vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vbit q5, q2, q9 @
+ vbit q4, q0, q9 @
+ vst2.8 {d10, d11}, [r4], r1 @
+ vst2.8 {d8, d9}, [r4] @
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r6, pc} @
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
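+@ For this high-profile vertical routine the U and V thresholds are packed
+@ into single registers (r2 = (alpha_cr << 8) | alpha_cb, r3 likewise for
+@ beta) and replicated with vdup.16, so even byte lanes carry the Cb value
+@ and odd byte lanes the Cr value of the interleaved chroma samples.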
+
+ .global ih264_deblk_chroma_vert_bs4_a9
+
+ih264_deblk_chroma_vert_bs4_a9:
+
+ stmfd sp!, {r4, r5, r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ ldr r4, [sp, #16] @r4 = alpha_cr
+ ldr r5, [sp, #20] @r5 = beta_cr
+ add r2, r2, r4, lsl #8 @r2 = (alpha_cr,alpha_cb)
+ add r3, r3, r5, lsl #8 @r3 = (beta_cr,beta_cb)
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+ vdup.16 q11, r2 @Q11 = alpha
+ vdup.16 q12, r3 @Q12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vaddl.u8 q7, d2, d6
+ vaddl.u8 q8, d3, d7 @(p0 + q1)
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vmlal.u8 q7, d0, d31
+ vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q9, d0, d4
+ vaddl.u8 q10, d1, d5 @(p1 + q0)
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q9, d6, d31
+ vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d14, q7, #2
+ vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d18, q9, #2
+ vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit q1, q7, q4
+ vbit q2, q9, q4
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4, r5, r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge for cases where the
+@* boundary strength is less than 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @param[in] sp(8) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(12) - pu1_cliptab_cb
+@* tc0_table for U
+@*
+@* @param[in] sp(16) - pu1_cliptab_cr
+@* tc0_table for V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_horz_bslt4_a9
+
+ih264_deblk_chroma_horz_bslt4_a9:
+
+ stmfd sp!, {r4-r9, lr} @
+
+ ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr
+ ldr r7, [sp, #36] @R7 = u4_bs
+ ldrd r8, r9, [sp, #40] @R8 = pu1_cliptab_cb , R9 = pu1_cliptab_cr
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p1 of chroma U
+ vpush {d8 - d15}
+ rev r7, r7 @
+ vmov.32 d12[0], r7 @D12[0] = ui_Bs
+
+ vld1.32 d16[0], [r8] @D16[0] contains cliptab_cb
+ vld1.32 d17[0], [r9] @D17[0] contains cliptab_cr
+ vld2.8 {d6, d7}, [r0], r1 @Q3=p1
+ vtbl.8 d14, {d16}, d12 @Retrieving cliptab values for U
+ vtbl.8 d28, {d17}, d12 @Retrieving cliptab values for V
+ vmovl.u8 q6, d12 @Q6 = uc_Bs in each 16 bit scalar
+ mov r6, r0 @Keeping a backup of the pointer to chroma U P0
+ vld2.8 {d4, d5}, [r0], r1 @Q2=p0
+ vmov.i8 d30, #1 @
+ vdup.8 d20, r2 @D20 contains alpha_cb
+ vdup.8 d21, r4 @D21 contains alpha_cr
+ vld2.8 {d0, d1}, [r0], r1 @Q0=q0
+ vmovl.u8 q7, d14 @
+ vmovl.u8 q14, d28 @
+ vmov.i16 d15, d28 @D14 has cliptab values for U, D15 for V
+ vld2.8 {d2, d3}, [r0] @Q1=q1
+ vsubl.u8 q5, d1, d5 @
+ vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0)
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2
+ vsli.16 q7, q7, #8 @
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L
+ vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H
+ vdup.8 d16, r3 @Q8 contains beta_cb
+ vdup.8 d17, r5 @Q8 contains beta_cr
+ vadd.i16 q4, q4, q10 @
+ vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcgt.s16 d12, d12, #0 @Q6 = (us_Bs > 0)
+ vqrshrn.s16 d8, q4, #3 @
+ vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
+ vadd.i8 d14, d14, d30 @D14 = C = C0+1 for U
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vabs.s8 q3, q4 @Q4 = ABS (i_macro)
+ vadd.i8 d15, d15, d30 @D15 = C = C0+1 for V
+ vmov.i8 d13, d12 @
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
+ vbic q6, q6, q9 @final condition
+ vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0)
+ vand q7, q7, q6 @Making delta zero in places where values shouldn't be filtered
+ vqadd.u8 q8, q2, q7 @Q8 = p0 + delta
+ vqsub.u8 q2, q2, q7 @Q2 = p0 - delta
+ vqadd.u8 q9, q0, q7 @Q9 = q0 + delta
+ vqsub.u8 q0, q0, q7 @Q0 = q0 - delta
+ vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
+ vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
+ vst2.8 {d16, d17}, [r6], r1 @
+ vst2.8 {d0, d1}, [r6] @
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r9, pc} @
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @param[in] sp(8) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(12) - pu1_cliptab_cb
+@* tc0_table for U
+@*
+@* @param[in] sp(16) - pu1_cliptab_cr
+@* tc0_table for V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
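+@ Unlike the base-profile routine, separate tC0 tables are supplied for Cb and
+@ Cr: both are looked up with the same packed boundary strengths and the two
+@ results are interleaved (vzip) so that every byte lane uses the clip value
+@ of its own plane in the common delta computation below.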
+
+ .global ih264_deblk_chroma_vert_bslt4_a9
+
+ih264_deblk_chroma_vert_bslt4_a9:
+
+ stmfd sp!, {r4-r7, r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ ldrd r4, r5, [sp, #32] @R4 = alpha_cr , R5 = beta_cr
+ add r2, r2, r4, lsl #8
+ add r3, r3, r5, lsl #8
+ ldr r6, [sp, #40] @R6 = u4_bs
+ ldrd r10, r11, [sp, #44] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
+ vpush {d8 - d15}
+ mov r12, r0 @keep a back up of R0 for buffer write
+
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+
+ vdup.16 q11, r2 @Q11 = alpha
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vdup.16 q12, r3 @Q12 = beta
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vsubl.u8 q7, d0, d6
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vsubl.u8 q8, d1, d7 @(p1 - q1)
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vsubl.u8 q9, d4, d2
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q10, d5, d3 @(q0 - p0)
+ vmov.u16 q14, #4
+ vld1.32 {d24[0]}, [r10] @Load ClipTable for U
+ vld1.32 {d25[0]}, [r11] @Load ClipTable for V
+ rev r6, r6 @Blocking strengths
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+
+ vmov.32 d10[0], r6
+
+ vmla.s16 q7, q9, q14
+ vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1)
+
+ vmovl.u8 q5, d10
+ vsli.u16 d10, d10, #8
+ vtbl.8 d12, {d24}, d10 @tC0 for U
+ vtbl.8 d13, {d25}, d10 @tC0 for V
+ vzip.8 d12, d13
+ vmovl.u16 q5, d10
+ vsli.u32 q5, q5, #16
+ vmov.u8 q12, #1
+ vadd.u8 q6, q6, q12 @tC0 + 1
+ vcge.u8 q5, q5, q12 @u4_bS > 0 ?
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ Q0 - Q3(inputs),
+ @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ Q6 (tC)
+
+ vrshr.s16 q7, q7, #3
+ vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q9, q7, #0
+ vcgt.s16 q10, q8, #0
+ vmovn.i16 d18, q9
+ vmovn.i16 d19, q10 @Q9 = sign(delta)
+ vabs.s16 q7, q7
+ vabs.s16 q8, q8
+ vmovn.u16 d14, q7
+ vmovn.u16 d15, q8
+ vmin.u8 q7, q7, q6 @Q7 = |delta|
+
+ vqadd.u8 q10, q1, q7 @p0+|delta|
+ vqadd.u8 q11, q2, q7 @q0+|delta|
+ vqsub.u8 q12, q1, q7 @p0-|delta|
+ vqsub.u8 q13, q2, q7 @q0-|delta|
+
+ vbit q12, q10, q9 @p0 + delta
+ vbit q11, q13, q9 @q0 - delta
+
+ vbit q1, q12, q4
+ vbit q2, q11, q4
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r7, r10-r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4 on calling twice in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bs4_mbaff_a9
+
+ih264_deblk_chroma_vert_bs4_mbaff_a9:
+
+ stmfd sp!, {r4, r5, r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+ ldrd r4, r5, [sp, #16] @R4 = alpha_cr , R5 = beta_cr
+ add r2, r2, r4, lsl #8
+ add r3, r3, r5, lsl #8
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.16 d11, r2 @D11 = alpha
+ vdup.16 d12, r3 @D12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vaddl.u8 q14, d1, d3 @(p0 + q1)
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q13, d0, d2 @(p1 + q0)
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit d1, d7, d4
+ vbit d2, d9, d4
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4, r5, r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4 on calling twice in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @param[in] sp(8) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(12) - pu1_cliptab_cb
+@* tc0_table for U
+@*
+@* @param[in] sp(16) - pu1_cliptab_cr
+@* tc0_table for V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bslt4_mbaff_a9
+
+ih264_deblk_chroma_vert_bslt4_mbaff_a9:
+
+ stmfd sp!, {r4-r6, r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr
+ add r2, r2, r4, lsl #8
+ add r3, r3, r5, lsl #8
+ ldr r6, [sp, #36] @R6 = u4_bs
+ ldrd r10, r11, [sp, #40] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.16 d11, r2 @D11 = alpha
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vdup.16 d12, r3 @D12 = beta
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vsubl.u8 q14, d0, d3 @(p1 - q1)
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q12, d2, d1 @(q0 - p0)
+ vmov.u16 q10, #4
+
+ vld1.32 {d31[1]}, [r10] @Load ClipTable for U
+ vld1.32 {d31[0]}, [r11] @Load ClipTable for V
+ rev r6, r6 @Blocking strengths
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vmov.32 d22[0], r6
+ vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1)
+ vmovl.u8 q11, d22
+ vsli.u16 d22, d22, #8
+ vmov.u16 d13, #4
+ vadd.u8 d22, d22, d13
+ vtbl.8 d6, {d31}, d22 @tC0
+ vmov.u8 d12, #1
+ vsub.u8 d22, d22, d13
+ vadd.u8 d6, d6, d12 @tC0 + 1
+ vcge.u8 d5, d22, d12 @u4_bS > 0 ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ D0 - D3(inputs),
+ @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ D6 (tC)
+
+ vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q13, q14, #0
+ vmovn.i16 d9, q13 @D9 = sign(delta)
+ vabs.s16 q14, q14
+ vmovn.u16 d7, q14
+ vmin.u8 d7, d7, d6 @D7 = |delta|
+
+ vqadd.u8 d10, d1, d7 @p0+|delta|
+ vqadd.u8 d11, d2, d7 @q0+|delta|
+ vqsub.u8 d12, d1, d7 @p0-|delta|
+ vqsub.u8 d13, d2, d7 @q0-|delta|
+
+ vbit d12, d10, d9 @p0 + delta
+ vbit d11, d13, d9 @q0 - delta
+
+ vbit d1, d12, d4
+ vbit d2, d11, d4
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r6, r10-r12, pc}
+
+
+
diff --git a/common/arm/ih264_deblk_luma_a9.s b/common/arm/ih264_deblk_luma_a9.s
new file mode 100755
index 0000000..3e6a4d9
--- /dev/null
+++ b/common/arm/ih264_deblk_luma_a9.s
@@ -0,0 +1,1092 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/*****************************************************************************/
+@/* */
+@/* File Name : ih264_deblk_luma_a9.s */
+@/* */
+@/* Description : Contains function definitions for deblocking luma */
+@/* edge. Functions are coded in NEON assembly and can */
+@/* be compiled using ARM RVDS. */
+@/* */
+@/* List of Functions : ih264_deblk_luma_vert_bs4_a9() */
+@/* ih264_deblk_luma_vert_bslt4_a9() */
+@/* ih264_deblk_luma_horz_bs4_a9() */
+@/* ih264_deblk_luma_horz_bslt4_a9() */
+@/* ih264_deblk_luma_vert_bs4_mbaff_a9() */
+@/* ih264_deblk_luma_vert_bslt4_mbaff_a9() */
+@/* */
+@/* Issues / Problems : None */
+@/* */
+@/* Revision History : */
+@/* */
+@/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+@/* 28 11 2013 Ittiam Draft */
+@/* 05 01 2015 Kaushik Added double-call functions for */
+@/* Senthoor vertical deblocking. */
+@/* */
+@/*****************************************************************************/
+
+
+.text
+.p2align 2
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block horizontal edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
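+@ A scalar sketch of the bS<4 luma filter implemented below, pieced together
+@ from the in-line comments (C0 = tC0 from pu1_cliptab, Ap = |p2 - p0|,
+@ Aq = |q2 - q0|):
+@     C     = C0 + (Ap < beta) + (Aq < beta)
+@     delta = clip3(-C, C, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+@     p0'   = clip_u8(p0 + delta)
+@     q0'   = clip_u8(q0 - delta)
+@     if (Ap < beta) p1' = p1 + clip3(-C0, C0, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1)
+@     if (Aq < beta) q1' = q1 + clip3(-C0, C0, (q2 + ((p0 + q0 + 1) >> 1) - 2*q1) >> 1)
+@ applied only where bS > 0 and |p0-q0| < alpha, |q1-q0| < beta, |p1-p0| < beta.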
+
+ .global ih264_deblk_luma_horz_bslt4_a9
+
+ih264_deblk_luma_horz_bslt4_a9:
+
+ stmfd sp!, {r4-r7, lr}
+
+ ldrd r4, r5, [sp, #0x14] @r4 = ui_Bs , r5 = *puc_ClpTab
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @r0 pointer to p1
+ sub r0, r0, r1 @r0 pointer to p2
+ rev r4, r4 @
+ vld1.8 {q5}, [r0], r1 @p2 values are loaded into q5
+ vmov.32 d12[0], r4 @d12[0] = ui_Bs
+ mov r6, r0 @keeping backup of pointer to p1
+ vld1.8 {q4}, [r0], r1 @p1 values are loaded into q4
+ mov r7, r0 @keeping backup of pointer to p0
+ vld1.8 {q3}, [r0], r1 @p0 values are loaded into q3
+ vmovl.u8 q6, d12 @q6 = uc_Bs in each 16 bit scalar
+ vld1.8 {q0}, [r0], r1 @q0 values are loaded into q0
+ vabd.u8 q13, q4, q3 @Q13 = ABS(p1 - p0)
+ vld1.8 {q1}, [r0], r1 @q1 values are loaded into q1
+ vabd.u8 q11, q3, q0 @Q11 = ABS(p0 - q0)
+ vld1.32 d16[0], [r5] @D16[0] contains cliptab
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vld1.8 {q2}, [r0], r1 @q2 values are loaded into q2
+ vtbl.8 d14, {d16}, d12 @
+ vdup.8 q10, r2 @Q10 contains alpha
+ vdup.8 q8, r3 @Q8 contains beta
+ vmovl.u16 q6, d12 @
+ vmovl.u16 q7, d14 @
+ vabd.u8 q14, q5, q3 @Q14 = Ap = ABS(p2 - p0)
+ vabd.u8 q15, q2, q0 @Q15 = Aq = ABS(q2 - q0)
+ vcgt.s32 q6, q6, #0 @Q6 = (us_Bs > 0)
+ vsli.32 q7, q7, #8 @
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vcge.u8 q12, q12, q8 @Q12=( ABS(q1 - q0) >= Beta )
+ vcge.u8 q13, q13, q8 @Q13=( ABS(p1 - p0) >= Beta )
+ vcgt.u8 q10, q8, q14 @Q10=(Ap<Beta)
+ vcgt.u8 q11, q8, q15 @Q11=(Aq<Beta)
+ vsli.32 q7, q7, #16 @Q7 = C0
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vsubl.u8 q15, d1, d7 @
+ vsubl.u8 q12, d0, d6 @Q15,Q12 = (q0 - p0)
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vsubl.u8 q14, d8, d2 @Q14 = (p1 - q1)L
+ vshl.i16 q13, q15, #2 @Q13 = (q0 - p0)<<2
+ vshl.i16 q12, q12, #2 @Q12 = (q0 - p0)<<2
+ vsubl.u8 q15, d9, d3 @Q15 = (p1 - q1)H
+ vbic q6, q6, q9 @final condition
+ vadd.i16 q12, q12, q14 @
+ vadd.i16 q13, q13, q15 @Q13,Q12 = [ (q0 - p0)<<2 ] + (p1 - q1)
+ vsub.i8 q9, q7, q10 @Q9 = C0 + (Ap < Beta)
+ vrhadd.u8 q8, q3, q0 @Q8 = ((p0+q0+1) >> 1)
+ vqrshrn.s16 d24, q12, #3 @
+ vqrshrn.s16 d25, q13, #3 @Q12 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
+ vsub.i8 q9, q9, q11 @Q9 = C0 + (Ap < Beta) + (Aq < Beta)
+ vand.i8 q10, q10, q6 @
+ vand.i8 q11, q11, q6 @
+ vabs.s8 q13, q12 @Q13 = ABS (i_macro)
+ vaddl.u8 q14, d17, d11 @
+ vaddl.u8 q5, d16, d10 @Q14,Q5 = p2 + (p0+q0+1)>>1
+ vaddl.u8 q15, d17, d5 @
+ vmin.u8 q9, q13, q9 @Q9 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
+ vshll.u8 q13, d9, #1 @
+ vaddl.u8 q2, d16, d4 @Q15,Q2 = q2 + (p0+q0+1)>>1
+ vshll.u8 q8, d8, #1 @Q13,Q8 = (p1<<1)
+ vand q9, q9, q6 @Making delta zero in places where values shouldn't be filtered
+ vsub.i16 q14, q14, q13 @Q14,Q5 = [p2 + (p0+q0+1)>>1] - (p1<<1)
+ vsub.i16 q5, q5, q8 @
+ vshll.u8 q8, d2, #1 @
+ vshll.u8 q13, d3, #1 @Q13,Q8 = (q1<<1)
+ vqshrn.s16 d29, q14, #1 @
+ vqshrn.s16 d28, q5, #1 @Q14 = i_macro_p1
+ vsub.i16 q2, q2, q8 @
+ vsub.i16 q15, q15, q13 @Q15,Q2 = [q2 + (p0+q0+1)>>1] - (q1<<1)
+ vneg.s8 q13, q7 @Q13 = -C0
+ vmin.s8 q14, q14, q7 @Q14 = min(C0,i_macro_p1)
+ vcge.s8 q12, q12, #0 @Q12 = (i_macro >= 0)
+ vqshrn.s16 d31, q15, #1 @
+ vqshrn.s16 d30, q2, #1 @Q15 = i_macro_q1
+ vmax.s8 q14, q14, q13 @Q14 = max( - C0 , min(C0, i_macro_p1) )
+ vqadd.u8 q8, q3, q9 @Q8 = p0 + delta
+ vqsub.u8 q3, q3, q9 @Q3 = p0 - delta
+ vmin.s8 q15, q15, q7 @Q15 = min(C0,i_macro_q1)
+ vand.i8 q14, q10, q14 @condition check Ap<beta
+ vqadd.u8 q7, q0, q9 @Q7 = q0 + delta
+ vqsub.u8 q0, q0, q9 @Q0 = q0 - delta
+ vmax.s8 q15, q15, q13 @Q15 = max( - C0 , min(C0, i_macro_q1) )
+ vbif q8, q3, q12 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
+ vbif q0, q7, q12 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
+ vadd.i8 q14, q14, q4 @
+ vand.i8 q15, q11, q15 @condition check Aq<beta
+ vst1.8 {q8}, [r7], r1 @writing back filtered value of p0
+ vadd.i8 q15, q15, q1 @
+ vst1.8 {q0}, [r7], r1 @writing back filtered value of q0
+ vst1.8 {q14}, [r6] @writing back filtered value of p1
+ vst1.8 {q15}, [r7], r1 @writing back filtered value of q1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r7, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block horizontal edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
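+@ A scalar sketch of the bS=4 luma filter implemented below, pieced together
+@ from the in-line comments; the q side is shown, the p side is symmetric
+@ with Ap = |p2 - p0| in place of Aq = |q2 - q0|:
+@     if (Aq < beta && |p0 - q0| < ((alpha >> 2) + 2)) {
+@         q0' = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
+@         q1' = (p0 + q0 + q1 + q2 + 2) >> 2
+@         q2' = (p0 + q0 + q1 + 3*q2 + 2*q3 + 4) >> 3
+@     } else {
+@         q0' = (2*q1 + q0 + p1 + 2) >> 2
+@     }
+@ applied only where |p0-q0| < alpha, |q1-q0| < beta and |p1-p0| < beta.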
+
+ .global ih264_deblk_luma_horz_bs4_a9
+
+ih264_deblk_luma_horz_bs4_a9:
+
+ @ Back up necessary registers on stack
+ stmfd sp!, {r12, r14}
+ vpush {d8 - d15}
+ @ Init
+ vdup.8 q0, r2 @duplicate alpha
+ sub r12, r0, r1 @pointer to p0 = q0 - src_strd
+ vdup.8 q1, r3 @duplicate beta
+ sub r14, r0, r1, lsl#1 @pointer to p1 = q0 - src_strd*2
+ sub r2, r0, r1, lsl#2 @pointer to p3 = q0 - src_strd*4
+ sub r3, r14, r1 @pointer to p2 = p1 - src_strd
+
+ @ Load Data
+ vld1.8 {d4, d5}, [r0], r1 @load q0 to Q2, q0 = q0 + src_strd
+ vld1.8 {d6, d7}, [r12] @load p0 to Q3
+ vld1.8 {d8, d9}, [r0], r1 @load q1 to Q4, q0 = q0 + src_strd
+ vld1.8 {d10, d11}, [r14] @load p1 to Q5
+
+ @ Filter Decision
+ vabd.u8 q6, q2, q3 @ABS(p0 - q0)
+ vabd.u8 q7, q4, q2 @ABS(q1 - q0)
+ vabd.u8 q8, q5, q3 @ABS(p1 - p0)
+ vcge.u8 q9, q6, q0 @ABS(p0 - q0) >= Alpha
+ vcge.u8 q7, q7, q1 @ABS(q1 - q0) >= Beta
+ vcge.u8 q8, q8, q1 @ABS(p1 - p0) >= Beta
+ vmov.i8 q10, #2
+ vorr q9, q9, q7 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta
+ vld1.8 {d14, d15}, [r0], r1 @load q2 to Q7, q0 = q0 + src_strd
+ vorr q9, q9, q8 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta
+ vsra.u8 q10, q0, #2 @((Alpha >> 2) + 2)
+ vabd.u8 q11, q7, q2 @Aq = ABS(q2 - q0)
+ vaddl.u8 q12, d4, d6 @p0+q0 L
+ vaddl.u8 q13, d5, d7 @p0+q0 H
+ vclt.u8 q11, q11, q1 @Aq < Beta
+ vclt.u8 q10, q6, q10 @(ABS(p0 - q0) <((Alpha >>2) + 2))
+
+ @ Deblock Filtering q0', q1', q2'
+ vaddw.u8 q14, q12, d8 @p0+q0+q1 L
+ vaddw.u8 q15, q13, d9 @p0+q0+q1 H
+ vand q11, q11, q10 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ @ q0' if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) TRUE
+ vadd.i16 q8, q14, q14 @2*(p0+q0+q1)L
+ vadd.i16 q0, q15, q15 @2*(p0+q0+q1)H
+ vaddw.u8 q8, q8, d14 @2*(p0+q0+q1)+q2 L
+ vaddw.u8 q0, q0, d15 @2*(p0+q0+q1)+q2 H
+ vaddw.u8 q8, q8, d10 @2*(p0+q0+q1)+q2 +p1 L
+ vaddw.u8 q0, q0, d11 @2*(p0+q0+q1)+q2 +p1 H
+ vrshrn.u16 d12, q8, #3 @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0']
+ vrshrn.u16 d13, q0, #3 @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0']
+ @ q0" if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) FALSE
+ vaddl.u8 q8, d8, d8 @2*q1 L
+ vaddl.u8 q0, d9, d9 @2*q1 H
+ vaddw.u8 q8, q8, d4 @2*q1+q0 L
+ vaddw.u8 q0, q0, d5 @2*q1+q0 H
+ vaddw.u8 q8, q8, d10 @2*q1+q0+p1 L
+ vaddw.u8 q0, q0, d11 @2*q1+q0+p1 H
+ vrshrn.u16 d16, q8, #2 @(2*q1+q0+p1+2)>>2 L [q0"]
+ vrshrn.u16 d17, q0, #2 @(2*q1+q0+p1+2)>>2 H [q0"]
+ @ q1'
+ vaddw.u8 q14, q14, d14 @p0+q0+q1+q2 L
+ vaddw.u8 q15, q15, d15 @p0+q0+q1+q2 H
+ vld1.8 {q0}, [r0], r1 @load q3 to Q0, q0 = q0 + src_strd
+ vbit q8, q6, q11 @choosing between q0' and q0" depending on condn
+ sub r0, r0, r1, lsl #2 @pointer to q0
+ vbic q11, q11, q9 @((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
+ @ && (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ vrshrn.u16 d12, q14, #2 @(p0+q0+q1+q2+2)>>2 L [q1']
+ vrshrn.u16 d13, q15, #2 @(p0+q0+q1+q2+2)>>2 H [q1']
+ vbif q2, q8, q9 @choose q0 or filtered q0
+ @ q2'
+ vaddl.u8 q8, d14, d0 @q2+q3,L
+ vaddl.u8 q0, d15, d1 @q2+q3,H
+ vadd.i16 q14, q14, q8 @p0+q0+q1+2*q2+q3 L
+ vst1.8 {d4, d5}, [r0], r1 @store q0
+ vadd.i16 q15, q15, q0 @p0+q0+q1+2*q2+q3 H
+ vadd.i16 q14, q14, q8 @p0+q0+q1+3*q2+2*q3 L
+ vadd.i16 q15, q15, q0 @p0+q0+q1+3*q2+2*q3 H
+ vrshrn.u16 d0, q14, #3 @(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2']
+ vrshrn.u16 d1, q15, #3 @(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2']
+ vld1.8 {d30, d31}, [r3] @load p2 to Q15
+ vbif q6, q4, q11 @choose q1 or filtered value of q1
+
+ vabd.u8 q8, q15, q3 @Ap,ABS(p2 - p0)
+ vaddw.u8 q12, q12, d10 @p0+q0+p1 L
+ vbif q0, q7, q11 @choose q2 or filtered q2
+ vaddw.u8 q13, q13, d11 @p0+q0+p1 H
+ vst1.8 {d12, d13}, [r0], r1 @store q1
+ vclt.u8 q8, q8, q1 @Ap < Beta
+ vadd.i16 q14, q12, q12 @2*(p0+q0+p1) L
+ vadd.i16 q2, q13, q13 @2*(p0+q0+p1) H
+ vst1.8 {d0, d1}, [r0], r1 @store q2
+ vand q10, q10, q8 @((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2)))
+ vaddw.u8 q14, q14, d30 @2*(p0+q0+p1)+p2 l
+ vaddw.u8 q2, q2, d31 @2*(p0+q0+p1)+p2 H
+ vaddw.u8 q14, q14, d8 @2*(p0+q0+p1)+p2+q1 L
+ vaddw.u8 q2, q2, d9 @2*(p0+q0+p1)+p2+q1 H
+ vrshrn.u16 d28, q14, #3 @(2*(p0+q0+p1)+p2+q1+4)>>3 L,p0'
+ vrshrn.u16 d29, q2, #3 @(2*(p0+q0+p1)+p2+q1+4)>>3 H,p0'
+ vmov.i8 d0, #2
+ vmov.i16 d1, #2
+ vaddl.u8 q1, d6, d8 @p0+q1 L
+ vmlal.u8 q1, d10, d0 @2*p1+p0+q1 L
+ vaddl.u8 q8, d7, d9 @p0+q1 H
+ vmlal.u8 q8, d11, d0 @2*p1+p0+q1 H
+ vaddw.u8 q6, q12, d30 @(p0+q0+p1) +p2 L
+ vld1.8 {d24, d25}, [r2] @load p3,Q12
+ vaddw.u8 q2, q13, d31 @(p0+q0+p1) +p2 H
+ vaddl.u8 q4, d30, d24 @p2+p3 L
+ vrshrn.u16 d26, q6, #2 @((p0+q0+p1)+p2 +2)>>2,p1' L
+ vrshrn.u16 d2, q1, #2 @(2*p1+p0+q1+2)>>2,p0"L
+ vrshrn.u16 d27, q2, #2 @((p0+q0+p1)+p2 +2)>>2,p1' H
+ vrshrn.u16 d3, q8, #2 @(2*p1+p0+q1+2)>>2,p0" H
+ vaddl.u8 q8, d31, d25 @p2+p3 H
+ vmla.u16 q6, q4, d1[0] @(p0+q0+p1)+3*p2+2*p3 L
+ vmla.u16 q2, q8, d1[0] @(p0+q0+p1)+3*p2+2*p3 H
+ vbic q8, q10, q9 @((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
+ @&& (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ vbit q1, q14, q10 @choosing between p0' and p0"
+ vrshrn.u16 d12, q6, #3 @((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2'
+ vrshrn.u16 d13, q2, #3 @((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2'
+ vbif q3, q1, q9 @choosing between p0 and filtered value of p0
+ vbit q5, q13, q8 @choosing between p1 and p1'
+ vbit q15, q6, q8 @choosing between p2 and p2'
+ vst1.8 {d6, d7}, [r12] @store p0
+ vst1.8 {d10, d11}, [r14] @store p1
+ vst1.8 {d30, d31}, [r3] @store p2
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
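+@/* For reference, a rough C sketch of the per-pixel bS<4 filter that this
+@   routine vectorises (illustrative only; helper names such as CLIP3 and
+@   CLIP_U8 are ours and not taken from the library sources):
+@
+@       tc0   = pu1_cliptab[bs];
+@       tc    = tc0 + (ABS(p2 - p0) < beta) + (ABS(q2 - q0) < beta);
+@       delta = CLIP3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
+@       p0'   = CLIP_U8(p0 + delta);
+@       q0'   = CLIP_U8(q0 - delta);
+@       if(ABS(p2 - p0) < beta)
+@           p1' = p1 + CLIP3(-tc0, tc0, (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1);
+@       if(ABS(q2 - q0) < beta)
+@           q1' = q1 + CLIP3(-tc0, tc0, (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1);
+@
+@   The filter is applied only when bs != 0, ABS(p0 - q0) < alpha,
+@   ABS(p1 - p0) < beta and ABS(q1 - q0) < beta.
+@*/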
+
+ .global ih264_deblk_luma_vert_bslt4_a9
+
+ih264_deblk_luma_vert_bslt4_a9:
+
+ stmfd sp!, {r12, lr}
+
+ sub r0, r0, #4 @pointer uc_edgePixel-4
+ ldr r12, [sp, #8] @r12 = ui_Bs
+ ldr r14, [sp, #12] @r14 = *puc_ClpTab
+ vpush {d8 - d15}
+ @loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
+ vld1.8 {d0}, [r0], r1 @row1
+ vld1.8 d2, [r0], r1 @row2
+ vld1.8 d4, [r0], r1 @row3
+ rev r12, r12 @reversing ui_bs
+ vld1.8 d6, [r0], r1 @row4
+ vmov.32 d18[0], r12 @d18[0] = ui_Bs
+ vld1.32 d16[0], [r14] @D16[0] contains cliptab
+ vld1.8 d8, [r0], r1 @row5
+ vmovl.u8 q9, d18 @q9 = uc_Bs in each 16 bit scalar
+ vld1.8 d10, [r0], r1 @row6
+ vld1.8 d12, [r0], r1 @row7
+ vtbl.8 d16, {d16}, d18 @puc_ClipTab[uc_Bs]
+ vld1.8 d14, [r0], r1 @row8
+ vld1.8 d1, [r0], r1 @row9
+ vmovl.u16 q8, d16 @
+ vld1.8 d3, [r0], r1 @row10
+ vld1.8 d5, [r0], r1 @row11
+ vld1.8 d7, [r0], r1 @row12
+ vsli.32 q8, q8, #8 @
+ vld1.8 d9, [r0], r1 @row13
+ vld1.8 d11, [r0], r1 @row14
+ vld1.8 d13, [r0], r1 @row15
+ vsli.32 q8, q8, #16 @Q8 = C0
+ vld1.8 d15, [r0], r1 @row16
+
+ @taking two 8x8 transposes
+ @2X2 transposes
+ vtrn.8 d0, d2 @row1 &2
+ vtrn.8 d4, d6 @row3&row4
+ vtrn.8 d8, d10 @row5&6
+ vtrn.8 d12, d14 @row7 & 8
+ vtrn.8 d1, d3 @row9 &10
+ vtrn.8 d5, d7 @row11 & 12
+ vtrn.8 d9, d11 @row13 &14
+ vtrn.8 d13, d15 @row15 & 16
+ @4x4 transposes
+ vtrn.16 d2, d6 @row2 & row4
+ vtrn.16 d10, d14 @row6 & row8
+ vtrn.16 d3, d7 @row10 & 12
+ vtrn.16 d11, d15 @row14 & row16
+ vtrn.32 d6, d14 @row4 & 8
+ vtrn.32 d7, d15 @row 12 & 16
+
+ @now Q3 ->p0 and Q7->q3
+ vtrn.16 d0, d4 @row1 & 3
+ vtrn.16 d8, d12 @row 5 & 7
+ vtrn.16 d1, d5 @row9 & row11
+ vtrn.16 d9, d13 @row13 & row15
+ vtrn.32 d0, d8 @row1 & row5
+ vtrn.32 d1, d9 @row9 & 13
+
+ @now Q0->p3 & Q4->q0
+ @starting processing as p0 and q0 are now ready
+ vtrn.32 d2, d10 @row2 &6
+ vrhadd.u8 q10, q3, q4 @((p0 + q0 + 1) >> 1)
+ vtrn.32 d3, d11 @row10&row14
+ vmov.i8 d19, #2
+ @now Q1->p2 & Q5->q1
+ vtrn.32 d4, d12 @row3 & 7
+ vabd.u8 q11, q3, q4 @ABS(p0 - q0)
+ vtrn.32 d5, d13 @row11 & row15
+ vaddl.u8 q12, d20, d2 @(p2 + ((p0 + q0 + 1) >> 1) L
+ @now Q2->p1,Q6->q2
+ vaddl.u8 q13, d21, d3 @(p2 + ((p0 + q0 + 1) >> 1) H
+ vmlsl.u8 q12, d4, d19 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L
+ vmlsl.u8 q13, d5, d19 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H
+ vdup.8 q14, r2 @alpha
+ vcle.u8 q11, q14, q11 @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
+ vdup.i8 q14, r3 @beta
+ vabd.u8 q15, q5, q4 @ABS(q1 - q0)
+ vqshrn.s16 d24, q12, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L
+ vqshrn.s16 d25 , q13, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H
+ vcge.u8 q15, q15, q14 @ABS(q1 - q0) >= Beta
+ vabd.u8 q13, q2, q3 @ABS(p1 - p0)
+ vmin.s8 q12, q12, q8 @min(deltap1 ,C0)
+ vorr q11, q11, q15 @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
+ vneg.s8 q15, q8 @-C0
+ vcge.u8 q13, q13, q14 @ABS(p1 - p0) >= Beta
+ vmax.s8 q12, q12, q15 @max(deltap1,-C0)
+ vorr q11, q11, q13 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
+ vmovl.u16 q13, d18 @ui_bs
+ vaddl.u8 q9, d20, d12 @q2 + ((p0 + q0 + 1) >> 1) L
+ vceq.u32 q13, q13, #0 @ui_bs == 0
+ vsubw.u8 q9, q9, d10 @(q2 + ((p0 + q0 + 1) >> 1) - q1) L
+ vaddl.u8 q10, d21, d13 @q2 + ((p0 + q0 + 1) >> 1) H
+ vsubw.u8 q9, q9, d10 @(q2 + ((p0 + q0 + 1) >> 1) - 2*q1)L
+ vsubw.u8 q10, q10, d11 @(q2 + ((p0 + q0 + 1) >> 1) - q1) H
+ vorr q13, q13, q11 @(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) &&(ui_bs)
+ vsubw.u8 q10, q10, d11 @(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H
+ vqshrn.s16 d18, q9, #1 @((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L
+ vabd.u8 q11, q1, q3 @Ap = ABS(p2 - p0)
+ vqshrn.s16 d19, q10, #1 @((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H
+ vabd.u8 q10, q6, q4 @Aq= ABS(q2 - q0)
+ vclt.u8 q11, q11, q14 @Ap < Beta
+ vmin.s8 q9, q9, q8 @min(deltaq1,C0)
+ vclt.u8 q10, q10, q14 @Aq <Beta
+ vsubl.u8 q14, d8, d6 @(q0 - p0) L
+ vmax.s8 q9, q9, q15 @max(deltaq1,-C0)
+ vsubl.u8 q15, d9, d7 @(q0 - p0) H
+ vshl.s16 q14, q14, #2 @(q0 - p0)<<2 L
+ vsub.u8 q8, q8, q11 @C0 + (Ap < Beta)
+ vshl.s16 q15, q15, #2 @(q0 - p0) << 2) H
+ vaddw.u8 q14, q14, d4 @((q0 - p0) << 2) + (p1 L
+ vaddw.u8 q15, q15, d5 @((q0 - p0) << 2) + (p1 H
+ vsubw.u8 q14, q14, d10 @((q0 - p0) << 2) + (p1 - q1) L
+ vsubw.u8 q15, q15, d11 @((q0 - p0) << 2) + (p1 - q1) H
+ vbic q11, q11, q13 @final condition for p1
+ vrshrn.s16 d28, q14, #3 @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L
+ vrshrn.s16 d29, q15, #3 @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H
+ vsub.u8 q8, q8, q10 @C0 + (Ap < Beta) + (Aq < Beta)
+ vbic q10, q10, q13 @final condition for q1
+ vabs.s8 q15, q14 @abs(delta)
+ vand q12, q12, q11 @deltap1
+ vand q9, q9, q10 @delta q1
+ vmin.u8 q15, q15, q8 @min((abs(delta),C)
+ vadd.i8 q2, q2, q12 @p1+deltap1
+ vadd.i8 q5, q5, q9 @q1+deltaq1
+ vbic q15, q15, q13 @abs(delta) of pixels to be changed only
+ vcge.s8 q14, q14, #0 @sign(delta)
+ vqsub.u8 q11, q3, q15 @clip(p0-delta)
+ vtrn.8 d0, d2 @row1 &2
+ vqadd.u8 q3, q3, q15 @clip(p0+delta)
+ vtrn.8 d1, d3 @row9 &10
+ vqadd.u8 q12, q4, q15 @clip(q0+delta)
+ vtrn.8 d12, d14 @row7 & 8
+ vqsub.u8 q4, q4, q15 @clip(q0-delta)
+ vtrn.8 d13, d15 @row15 & 16
+ vbif q3, q11, q14 @p0
+ vbif q4, q12, q14 @q0
+ vtrn.8 d4, d6 @row3&row4
+ vtrn.8 d8, d10 @row5&6
+ vtrn.8 d5, d7 @row11 & 12
+ vtrn.8 d9, d11 @row13 &14
+ vtrn.16 d2, d6 @row2 & row4
+ vtrn.16 d10, d14 @row6 & row8
+ vtrn.16 d3, d7 @row10 & 12
+ vtrn.16 d11, d15 @row14 & row16
+ vtrn.32 d6, d14 @row4 & 8
+ vtrn.32 d7, d15 @row 12 & 16
+ @now Q3 ->p0 and Q7->q3
+ vtrn.16 d0, d4 @row1 & 3
+ vtrn.16 d8, d12 @row 5 & 7
+ vtrn.16 d1, d5 @row9 & row11
+ vtrn.16 d9, d13 @row13 & row15
+ sub r0, r0, r1, lsl#4 @restore pointer
+ vtrn.32 d0, d8 @row1 & row5
+ vtrn.32 d1, d9 @row9 & 13
+ vtrn.32 d2, d10 @row2 &6
+ vtrn.32 d3, d11 @row10&row14
+ vtrn.32 d4, d12 @row3 & 7
+ vtrn.32 d5, d13 @row11 & row15
+ vst1.8 {d0}, [r0], r1 @row1
+ vst1.8 d2, [r0], r1 @row2
+ vst1.8 d4, [r0], r1 @row3
+ vst1.8 d6, [r0], r1 @row4
+ vst1.8 d8, [r0], r1 @row5
+ vst1.8 d10, [r0], r1 @row6
+ vst1.8 d12, [r0], r1 @row7
+ vst1.8 d14, [r0], r1 @row8
+ vst1.8 d1, [r0], r1 @row9
+ vst1.8 d3, [r0], r1 @row10
+ vst1.8 d5, [r0], r1 @row11
+ vst1.8 d7, [r0], r1 @row12
+ vst1.8 d9, [r0], r1 @row13
+ vst1.8 d11, [r0], r1 @row14
+ vst1.8 d13, [r0], r1 @row15
+ vst1.8 d15, [r0], r1 @row16
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
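+@/* For reference, a rough C sketch of the per-pixel bS=4 (strong) filter
+@   that this routine vectorises, shown for the p-side only (the q-side is
+@   symmetric); illustrative only, not the library's reference code:
+@
+@       if((ABS(p2 - p0) < beta) && (ABS(p0 - q0) < ((alpha >> 2) + 2)))
+@       {
+@           p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
+@           p1' = (p2 + p1 + p0 + q0 + 2) >> 2;
+@           p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
+@       }
+@       else
+@       {
+@           p0' = (2*p1 + p0 + q1 + 2) >> 2;
+@       }
+@
+@   The filter is applied only when ABS(p0 - q0) < alpha,
+@   ABS(p1 - p0) < beta and ABS(q1 - q0) < beta.
+@*/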
+
+ .global ih264_deblk_luma_vert_bs4_a9
+
+ih264_deblk_luma_vert_bs4_a9:
+
+ stmfd sp!, {r12, lr}
+ vpush {d8 - d15}
+ sub r0, r0, #4 @pointer uc_edgePixel-4
+ @loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
+ vld1.8 d0, [r0], r1 @row1
+ vld1.8 d2, [r0], r1 @row2
+ vld1.8 d4, [r0], r1 @row3
+ vld1.8 d6, [r0], r1 @row4
+ vld1.8 d8, [r0], r1 @row5
+ vld1.8 d10, [r0], r1 @row6
+ vld1.8 d12, [r0], r1 @row7
+ vld1.8 d14, [r0], r1 @row8
+ vld1.8 d1, [r0], r1 @row9
+ vld1.8 d3, [r0], r1 @row10
+ vld1.8 d5, [r0], r1 @row11
+ vld1.8 d7, [r0], r1 @row12
+ vld1.8 d9, [r0], r1 @row13
+ vld1.8 d11, [r0], r1 @row14
+ vld1.8 d13, [r0], r1 @row15
+ vld1.8 d15, [r0], r1 @row16
+ @taking two 8x8 transposes
+ @2X2 transposes
+ vtrn.8 d0, d2 @row1 &2
+ vtrn.8 d4, d6 @row3&row4
+ vtrn.8 d8, d10 @row5&6
+ vtrn.8 d12, d14 @row7 & 8
+ vtrn.8 d1, d3 @row9 &10
+ vtrn.8 d5, d7 @row11 & 12
+ vtrn.8 d9, d11 @row13 &14
+ vtrn.8 d13, d15 @row15 & 16
+ @4x4 transposes
+ vtrn.16 d2, d6 @row2 & row4
+ vtrn.16 d10, d14 @row6 & row8
+ vtrn.16 d3, d7 @row10 & 12
+ vtrn.16 d11, d15 @row14 & row16
+ vtrn.32 d6, d14 @row4 & 8
+ vtrn.32 d7, d15 @row 12 & 16
+ @now Q3 ->p0 and Q7->q3
+ vtrn.16 d0, d4 @row1 & 3
+ vtrn.16 d8, d12 @row 5 & 7
+ vtrn.16 d1, d5 @row9 & row11
+ vtrn.16 d9, d13 @row13 & row15
+ vtrn.32 d0, d8 @row1 & row5
+ vtrn.32 d1, d9 @row9 & 13
+ @now Q0->p3 & Q4->q0
+ @starting processing as p0 and q0 are now ready
+ @now Q1->p2 & Q5->q1
+ vpush {q7} @saving in stack
+ vtrn.32 d4, d12 @row3 & 7
+ vmov.i16 q14, #2
+ vtrn.32 d5, d13 @row11 & row15
+ vaddl.u8 q8, d6, d8 @p0+q0 L
+ vtrn.32 d2, d10 @row2 &6
+ vaddl.u8 q9, d7, d9 @p0+q0 H
+ vtrn.32 d3, d11 @row10&row14
+ vaddw.u8 q10, q8, d4 @p0+q0+p1 L
+ vaddw.u8 q11, q9, d5 @p0+q0+p1 H
+ vaddl.u8 q12, d2, d10 @p2+q1 L
+ vaddl.u8 q13, d3, d11 @p2+q1 H
+ vmla.u16 q12, q10, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 L
+ vmla.u16 q13, q11, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 H
+ vmov.i8 q14, #2
+ vaddw.u8 q8, q10, d2 @p0+q0+p1+p2 L
+ vaddw.u8 q9, q11, d3 @p0+q0+p1+p2 H
+ vdup.i8 q15, r2 @duplicate alpha
+ vrshrn.u16 d20, q8, #2 @(p2 + p1 + p0 + q0 + 2) >> 2)L p1'
+ vrshrn.u16 d21, q9, #2 @(p2 + p1 + p0 + q0 + 2) >> 2)H p1'
+ vabd.u8 q11, q3, q4 @ABD(p0-q0)
+ vsra.u8 q14, q15, #2 @alpha >>2 +2
+ vabd.u8 q15, q1, q3 @Ap = ABD(p2-p0)
+ vrshrn.u16 d24, q12, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0'
+ vrshrn.u16 d25, q13, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0'
+ vdup.i8 q13, r3 @beta
+ vcgt.u8 q14, q14, q11 @ABS(p0 - q0) <((Alpha >>2) + 2)
+ vaddl.u8 q11, d6, d10 @p0+q1 L
+ vcgt.u8 q7, q13, q15 @beta>Ap
+ vaddl.u8 q15, d7, d11 @p0+q1 H
+ vaddw.u8 q11, q11, d4 @p0+q1+p1 L
+ vaddw.u8 q15, q15, d5 @p0+q1+p1 H
+ vaddw.u8 q11, q11, d4 @p0+q1+2*p1 L
+ vaddw.u8 q15, q15, d5 @p0+q1+2*p1 H
+ vand q7, q7, q14 @(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
+ vrshrn.u16 d22, q11, #2 @((X2(p1) + p0 + q1 + 2) >> 2) L p0"
+ vrshrn.u16 d23, q15, #2 @((X2(p1) + p0 + q1 + 2) >> 2) H p0"
+ vaddl.u8 q15, d2, d0 @p2+p3 L
+ vbif q12, q11, q7 @p0' or p0 "
+ vaddl.u8 q11, d3, d1 @p2+p3 H
+ vadd.u16 q15, q15, q15 @2*(p2+p3) L
+ vadd.u16 q11, q11, q11 @2*(p2+p3)H
+ vadd.u16 q8, q8, q15 @(X2(p3) + X3(p2) + p1 + p0 + q0) L
+ vadd.u16 q9, q9, q11 @(X2(p3) + X3(p2) + p1 + p0 + q0) H
+ vabd.u8 q15, q6, q4 @Aq = abs(q2-q0)
+ vabd.u8 q11, q5, q4 @ABS(Q1-Q0)
+ vrshrn.u16 d16, q8, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2'
+ vrshrn.u16 d17, q9, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2'
+ vabd.u8 q9, q2, q3 @ABS(p1-p0)
+ vcgt.u8 q15, q13, q15 @Aq < Beta
+ vcge.u8 q11, q11, q13 @ABS(q1 - q0) >= Beta
+ vcge.u8 q9, q9, q13 @ABS(p1 - p0) >= beta
+ vdup.i8 q13, r2 @duplicate alpha
+ vand q15, q15, q14 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ vabd.u8 q14, q3, q4 @abs(p0-q0)
+ vorr q11, q11, q9 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
+ vaddl.u8 q9, d6, d8 @p0+q0 L
+ vcge.u8 q14, q14, q13 @ABS(p0 - q0) >= Alpha
+ vaddl.u8 q13, d7, d9 @p0+q0 H
+ vaddw.u8 q9, q9, d10 @p0+q0+q1 L
+ vorr q11, q11, q14 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
+ vaddw.u8 q13, q13, d11 @p0+q0+q1 H
+ vbic q7, q7, q11 @final condn for p's
+ vmov.i8 q14, #2
+ vbif q3, q12, q11 @final p0
+ vbit q1, q8, q7 @final p2
+ vbif q10, q2, q7 @final p1
+ vaddl.u8 q12, d8, d4 @q0+p1 L
+ vmlal.u8 q12, d10, d28 @X2(q1) + q0 + p1 L
+ vaddl.u8 q8, d9, d5 @q0+p1 H
+ vmlal.u8 q8, d11, d28 @X2(q1) + q0 + p1 H
+ vmov.i16 q14, #2
+ vaddl.u8 q7, d4, d12 @p1+q2 L
+ vmla.u16 q7, q9, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2L
+ vaddl.u8 q2, d5, d13 @p1+q2H
+ vmla.u16 q2, q13, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2H
+ vrshrn.u16 d24, q12, #2 @(X2(q1) + q0 + p1 + 2) >> 2; L q0'
+ vrshrn.u16 d25, q8, #2 @(X2(q1) + q0 + p1 + 2) >> 2; H q0'
+ vaddw.u8 q9, q9, d12 @p0 + q0 + q1 + q2 L
+ vaddw.u8 q13, q13, d13 @p0 + q0 + q1 + q2 H
+ vrshrn.u16 d16, q7, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L q0"
+ vpop {q7}
+ vrshrn.u16 d17, q2, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H q0"
+ vrshrn.u16 d4, q9, #2 @p0 + q0 + q1 + q2 + 2)>>2 L q1'
+ vrshrn.u16 d5, q13, #2 @p0 + q0 + q1 + q2 + 2)>>2 H q1'
+ vbit q12, q8, q15 @q0' or q0"
+ vbic q15, q15, q11 @final condn for q's
+ vtrn.8 d0, d2 @row1 &2
+ vbit q5, q2, q15 @final q1
+ vtrn.8 d1, d3 @row9 &10
+ vaddl.u8 q8, d12, d14 @q2+q3 L
+ vtrn.8 d20, d6 @row3&row4
+ vaddl.u8 q2, d13, d15 @q2+q3 H
+ vtrn.8 d21, d7 @row11 & 12
+ vmla.u16 q9, q8, q14 @X2(q3) + X3(q2) + q1 + q0 + p0 L
+ vtrn.16 d2, d6 @row2 & row4
+ vmla.u16 q13, q2, q14 @X2(q3) + X3(q2) + q1 + q0 + p0 H
+ vtrn.16 d3, d7 @row10 & 12
+ vbif q4, q12, q11 @final q0
+ vtrn.16 d0, d20 @row1 & 3
+ vrshrn.u16 d18, q9, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L
+ vtrn.16 d1, d21 @row9 & row11
+ vrshrn.u16 d19, q13, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H
+ vtrn.8 d8, d10 @row5&6
+ vbit q6, q9, q15 @final q2
+ vtrn.8 d9, d11 @row13 &14
+ vtrn.8 d12, d14 @row7 & 8
+ vtrn.8 d13, d15 @row15 & 16
+ vtrn.16 d10, d14 @row6 & row8
+ vtrn.16 d11, d15 @row14 & row16
+ @now Q3 ->p0 and Q7->q3
+ vtrn.16 d8, d12 @row 5 & 7
+ vtrn.16 d9, d13 @row13 & row15
+ sub r0, r0, r1, lsl#4 @restore pointer
+ vtrn.32 d6, d14 @row4 & 8
+ vtrn.32 d7, d15 @row 12 & 16
+ vtrn.32 d0, d8 @row1 & row5
+ vtrn.32 d1, d9 @row9 & 13
+ vtrn.32 d2, d10 @row2 &6
+ vtrn.32 d3, d11 @row10&row14
+ vtrn.32 d20, d12 @row3 & 7
+ vtrn.32 d21, d13 @row11 & row15
+ vst1.8 d0, [r0], r1 @row1
+ vst1.8 d2, [r0], r1 @row2
+ vst1.8 d20, [r0], r1 @row3
+ vst1.8 d6, [r0], r1 @row4
+ vst1.8 d8, [r0], r1 @row5
+ vst1.8 d10, [r0], r1 @row6
+ vst1.8 d12, [r0], r1 @row7
+ vst1.8 d14, [r0], r1 @row8
+ vst1.8 d1, [r0], r1 @row9
+ vst1.8 d3, [r0], r1 @row10
+ vst1.8 d21, [r0], r1 @row11
+ vst1.8 d7, [r0], r1 @row12
+ vst1.8 d9, [r0], r1 @row13
+ vst1.8 d11, [r0], r1 @row14
+ vst1.8 d13, [r0], r1 @row15
+ vst1.8 d15, [r0], r1 @row16
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge when the boundary
+@* strength is set to 4; this variant processes 8 rows and is hence called twice (MBAFF)
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_luma_vert_bs4_mbaff_a9
+
+ih264_deblk_luma_vert_bs4_mbaff_a9:
+
+ stmfd sp!, {lr}
+
+ sub r0, r0, #4 @pointer uc_edgePixel-4
+ vpush {d8 - d15}
+ @loading [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] for every row
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+ vuzp.8 d0, d1 @D0->p3, D1->p2
+ vuzp.8 d2, d3 @D2->p1, D3->p0
+ vuzp.8 d4, d5 @D4->q0, D5->q1
+ vuzp.8 d6, d7 @D6->q2, D7->q3
+
+ vmov.i16 q14, #2
+ vaddl.u8 q4, d3, d4 @p0+q0
+ vaddw.u8 q5, q4, d2 @p0+q0+p1
+ vaddl.u8 q6, d1, d5 @p2+q1
+ vmla.u16 q6, q5, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1
+
+ vmov.i8 d14, #2
+ vaddw.u8 q4, q5, d1 @p0+q0+p1+p2
+ vdup.i8 d15, r2 @duplicate alpha
+ vrshrn.u16 d10, q4, #2 @(p2 + p1 + p0 + q0 + 2) >> 2) p1'
+ vabd.u8 d11, d3, d4 @ABD(p0-q0)
+ vsra.u8 d14, d15, #2 @alpha >>2 +2
+ vabd.u8 d15, d1, d3 @Ap = ABD(p2-p0)
+ vrshrn.u16 d12, q6, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) p0'
+ vdup.i8 d13, r3 @beta
+ vcgt.u8 d14, d14, d11 @ABS(p0 - q0) <((Alpha >>2) + 2)
+ vaddl.u8 q8, d3, d5 @p0+q1
+ vcgt.u8 d26, d13, d15 @beta>Ap
+ vaddw.u8 q8, q8, d2 @p0+q1+p1
+ vaddw.u8 q8, q8, d2 @p0+q1+2*p1
+ vand d26, d26, d14 @(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
+ vrshrn.u16 d11, q8, #2 @((X2(p1) + p0 + q1 + 2) >> 2) p0"
+ vbif d12, d11, d26 @p0' or p0 "
+ vaddl.u8 q9, d1, d0 @p2+p3
+ vadd.u16 q9, q9, q9 @2*(p2+p3)
+ vadd.u16 q4, q4, q9 @(X2(p3) + X3(p2) + p1 + p0 + q0)
+ vabd.u8 d15, d6, d4 @Aq = abs(q2-q0)
+ vabd.u8 d11, d5, d4 @ABS(q1-q0)
+ vrshrn.u16 d8, q4, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); p2'
+ vabd.u8 d9, d2, d3 @ABS(p1-p0)
+ vcgt.u8 d15, d13, d15 @Aq < Beta
+ vcge.u8 d11, d11, d13 @ABS(q1 - q0) >= Beta
+ vcge.u8 d9, d9, d13 @ABS(p1 - p0) >= beta
+ vdup.i8 d13, r2 @duplicate alpha
+ vand d15, d15, d14 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ vabd.u8 d14, d3, d4 @abs(p0-q0)
+ vorr d11, d11, d9 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
+ vcge.u8 d14, d14, d13 @ABS(p0 - q0) >= Alpha
+ vaddl.u8 q10, d3, d4 @p0+q0
+ vorr d11, d11, d14 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
+ vaddw.u8 q10, q10, d5 @p0+q0+q1
+ vbic d26, d26, d11 @final condn for p's
+ vmov.i8 d14, #2
+ vbif d3, d12, d11 @final p0
+ vbit d1, d8, d26 @final p2
+ vbif d10, d2, d26 @final p1
+ vaddl.u8 q6, d4, d2 @q0+p1
+ vmlal.u8 q6, d5, d14 @X2(q1) + q0 + p1
+
+ vaddl.u8 q11, d2, d6 @p1+q2
+ vmla.u16 q11, q10, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2
+ vrshrn.u16 d12, q6, #2 @(X2(q1) + q0 + p1 + 2) >> 2; q0'
+ vaddw.u8 q10, q10, d6 @p0 + q0 + q1 + q2
+ vrshrn.u16 d8, q11, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 qo"
+
+ vrshrn.u16 d2, q10, #2 @p0 + q0 + q1 + q2 + 2)>>2 q1'
+ vbit d12, d8, d15 @q0' or q0"
+ vbic d15, d15, d11 @final condn for q's
+ vbit d5, d2, d15 @final q1
+ vaddl.u8 q12, d6, d7 @q2+q3
+ vmla.u16 q10, q12, q14 @X2(q3) + X3(q2) + q1 + q0 + p0
+ vbif d4, d12, d11 @final q0
+ vrshrn.u16 d9, q10, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3;
+ vbit d6, d9, d15 @final q2
+ vand d2, d10, d10 @D0->p3, D1->p2, D2->p1, D3->p0, D4->q0, D5->q1, D6->q2, D7->q3
+
+ vzip.8 d0, d1 @D0,D1 -> [p3:p2]
+ vzip.8 d2, d3 @D2,D3 -> [p1:p0]
+ vzip.8 d4, d5 @D4,D5 -> [q0:q1]
+ vzip.8 d6, d7 @D6,D7 -> [q2:q3]
+
+ sub r0, r0, r1, lsl#3 @restore pointer
+
+ @storing [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] in every row
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge for cases where the
+@* boundary strength is less than 4; this variant processes 8 rows and is hence called twice (MBAFF)
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_luma_vert_bslt4_mbaff_a9
+
+ih264_deblk_luma_vert_bslt4_mbaff_a9:
+
+ stmfd sp!, {r12, lr}
+
+ sub r0, r0, #4 @pointer uc_edgePixel-4
+ ldr r12, [sp, #8] @r12 = ui_Bs
+ ldr r14, [sp, #12] @r14 = pu1_ClipTab
+ vpush {d8 - d15}
+ @loading [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] for every row
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+ vuzp.8 d0, d1 @D0->p3, D1->p2
+ vuzp.8 d2, d3 @D2->p1, D3->p0
+ vuzp.8 d4, d5 @D4->q0, D5->q1
+ vuzp.8 d6, d7 @D6->q2, D7->q3
+
+ rev r12, r12 @reversing ui_bs
+ vmov.32 d8[0], r12 @D8[0] = ui_Bs
+ vld1.32 d9[0], [r14] @D9[0] contains cliptab
+ vmovl.u8 q15, d8 @D30 = ui_Bs in each 16 bit scalar
+ vtbl.8 d8, {d9}, d30 @puc_ClipTab[ui_Bs]
+ vsli.16 d8, d8, #8 @D8 = C0
+
+ vrhadd.u8 d10, d3, d4 @((p0 + q0 + 1) >> 1)
+ vmov.i8 d31, #2
+ vabd.u8 d11, d3, d4 @ABS(p0 - q0)
+ vaddl.u8 q6, d10, d1 @(p2 + ((p0 + q0 + 1) >> 1)
+ vmlsl.u8 q6, d2, d31 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1))
+ vdup.8 d14, r2 @alpha
+ vcle.u8 d11, d14, d11 @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
+ vdup.i8 d14, r3 @beta
+ vabd.u8 d15, d5, d4 @ABS(q1 - q0)
+ vqshrn.s16 d12, q6, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1)
+ vcge.u8 d15, d15, d14 @ABS(q1 - q0) >= Beta
+ vabd.u8 d13, d2, d3 @ABS(p1 - p0)
+ vmin.s8 d12, d12, d8 @min(deltap1 ,C0)
+ vorr d11, d11, d15 @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
+ vneg.s8 d15, d8 @-C0
+ vcge.u8 d13, d13, d14 @ABS(p1 - p0) >= Beta
+ vmax.s8 d12, d12, d15 @max(deltap1,-C0)
+ vorr d11, d11, d13 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
+ vceq.u16 d13, d30, #0 @ui_bs == 0
+ vaddl.u8 q14, d10, d6 @q2 + ((p0 + q0 + 1) >> 1)
+ vsubw.u8 q14, q14, d5 @q2 + ((p0 + q0 + 1) >> 1) - q1
+ vsubw.u8 q14, q14, d5 @q2 + ((p0 + q0 + 1) >> 1) - 2*q1
+ vorr d13, d13, d11 @(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
+ @|| (ui_bs == 0)
+ vqshrn.s16 d9, q14, #1 @(q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1
+ vabd.u8 d11, d1, d3 @Ap = ABS(p2 - p0)
+ vabd.u8 d10, d6, d4 @Aq= ABS(q2 - q0)
+ vclt.u8 d11, d11, d14 @Ap < Beta
+ vmin.s8 d9, d9, d8 @min(deltaq1,C0)
+ vclt.u8 d10, d10, d14 @Aq < Beta
+ vmax.s8 d9, d9, d15 @max(deltaq1,-C0)
+ vsubl.u8 q7, d4, d3 @q0 - p0
+ vshl.s16 q7, q7, #2 @(q0 - p0) << 2
+ vsub.u8 d8, d8, d11 @C0 + (Ap < Beta)
+ vaddw.u8 q7, q7, d2 @((q0 - p0) << 2) + p1
+ vsubw.u8 q7, q7, d5 @((q0 - p0) << 2) + (p1 - q1)
+ vbic d11, d11, d13 @final condition for p1
+ vrshr.s16 q15, q7, #3 @delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
+ vsub.u8 d8, d8, d10 @C0 + (Ap < Beta) + (Aq < Beta)
+ vbic d10, d10, d13 @final condition for q1
+ vabs.s16 q14, q15
+ vmovn.i16 d15, q14 @abs(delta)
+ vand d12, d12, d11 @deltap1
+ vand d9, d9, d10 @deltaq1
+ vmin.u8 d15, d15, d8 @min((abs(delta),C)
+ vadd.i8 d2, d2, d12 @p1+deltap1
+ vadd.i8 d5, d5, d9 @q1+deltaq1
+ vbic d15, d15, d13 @abs(delta) of pixels to be changed only
+ vcge.s16 q14, q15, #0
+ vmovn.i16 d14, q14 @sign(delta)
+ vqsub.u8 d11, d3, d15 @clip(p0-delta)
+ vqadd.u8 d3, d3, d15 @clip(p0+delta)
+ vqadd.u8 d12, d4, d15 @clip(q0+delta)
+ vqsub.u8 d4, d4, d15 @clip(q0-delta)
+ vbif d3, d11, d14 @p0
+ vbif d4, d12, d14 @q0
+
+ sub r0, r0, r1, lsl#3 @restore pointer
+ @D0->p3, D1->p2, D2->p1, D3->p0, D4->q0, D5->q1, D6->q2, D7->q3
+ vzip.8 d0, d1 @D0,D1 -> [p3:p2]
+ vzip.8 d2, d3 @D2,D3 -> [p1:p0]
+ vzip.8 d4, d5 @D4,D5 -> [q0:q1]
+ vzip.8 d6, d7 @D6,D7 -> [q2:q3]
+
+ @storing [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] in every row
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
diff --git a/common/arm/ih264_default_weighted_pred_a9q.s b/common/arm/ih264_default_weighted_pred_a9q.s
new file mode 100755
index 0000000..94cda46
--- /dev/null
+++ b/common/arm/ih264_default_weighted_pred_a9q.s
@@ -0,0 +1,359 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_default_weighted_pred_a9q.s
+@*
+@* @brief
+@* Contains function definitions for default weighted prediction.
+@* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
+@*
+@* @author
+@* Kaushik Senthoor R
+@*
+@* @par List of Functions:
+@*
+@* - ih264_default_weighted_pred_luma_a9q()
+@* - ih264_default_weighted_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@*******************************************************************************
+@* @function
+@* ih264_default_weighted_pred_luma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates their rounded-average and
+@* stores it in the destination block.
+@*
+@* @param[in] pu1_src1:
+@* UWORD8 Pointer to the buffer containing the first input block.
+@*
+@* @param[in] pu1_src2:
+@* UWORD8 Pointer to the buffer containing the second input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the first input buffer
+@*
+@* @param[in] src_strd2
+@* Stride of the second input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => ht (r6)
+@ [sp+12] => wd (r7)
+@
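+@ The whole routine reduces to a per-pixel rounded average; a minimal C
+@ sketch of what each vrhadd.u8 lane computes (illustrative only):
+@
+@     for(i = 0; i < ht; i++)
+@         for(j = 0; j < wd; j++)
+@             pu1_dst[i * dst_strd + j] = (pu1_src1[i * src_strd1 + j] +
+@                                          pu1_src2[i * src_strd2 + j] + 1) >> 1;
+@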
+.text
+.p2align 2
+
+ .global ih264_default_weighted_pred_luma_a9q
+
+ih264_default_weighted_pred_luma_a9q:
+
+ stmfd sp!, {r4-r7, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #32] @Load wd
+ ldr r4, [sp, #20] @Load src_strd2
+ ldr r5, [sp, #24] @Load dst_strd
+ cmp r7, #16
+ ldr r6, [sp, #28] @Load ht
+ vpush {d8-d15}
+ beq loop_16 @branch if wd is 16
+ cmp r7, #8
+ beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+ vld1.32 d0[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d0[1], [r0], r3 @load row 2 in source 1
+ vld1.32 d2[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d2[1], [r1], r4 @load row 2 in source 2
+
+ vld1.32 d1[0], [r0], r3 @load row 3 in source 1
+ vld1.32 d1[1], [r0], r3 @load row 4 in source 1
+ vrhadd.u8 d0, d0, d2
+ vld1.32 d3[0], [r1], r4 @load row 3 in source 2
+ vld1.32 d3[1], [r1], r4 @load row 4 in source 2
+
+ subs r6, r6, #4 @decrement ht by 4
+ vst1.32 d0[0], [r2], r5 @store row 1 in destination
+ vst1.32 d0[1], [r2], r5 @store row 2 in destination
+ vrhadd.u8 d1, d1, d3
+ vst1.32 d1[0], [r2], r5 @store row 3 in destination
+ vst1.32 d1[1], [r2], r5 @store row 4 in destination
+
+ bgt loop_4 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: @each iteration processes four rows
+
+ vld1.8 d0, [r0], r3 @load row 1 in source 1
+ vld1.8 d4, [r1], r4 @load row 1 in source 2
+ vld1.8 d1, [r0], r3 @load row 2 in source 1
+ vld1.8 d5, [r1], r4 @load row 2 in source 2
+ vld1.8 d2, [r0], r3 @load row 3 in source 1
+ vrhadd.u8 q0, q0, q2
+ vld1.8 d6, [r1], r4 @load row 3 in source 2
+ vld1.8 d3, [r0], r3 @load row 4 in source 1
+ vrhadd.u8 d2, d2, d6
+ vld1.8 d7, [r1], r4 @load row 4 in source 2
+
+ subs r6, r6, #4 @decrement ht by 4
+ vst1.8 d0, [r2], r5 @store row 1 in destination
+ vrhadd.u8 d3, d3, d7
+ vst1.8 d1, [r2], r5 @store row 2 in destination
+ vst1.8 d2, [r2], r5 @store row 3 in destination
+ vst1.8 d3, [r2], r5 @store row 4 in destination
+
+ bgt loop_8 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: @each iteration processes eight rows
+
+ vld1.8 {q0}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q8}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q1}, [r0], r3 @load row 2 in source 1
+ vld1.8 {q9}, [r1], r4 @load row 2 in source 2
+ vrhadd.u8 q0, q0, q8
+ vld1.8 {q2}, [r0], r3 @load row 3 in source 1
+ vld1.8 {q10}, [r1], r4 @load row 3 in source 2
+ vrhadd.u8 q1, q1, q9
+ vld1.8 {q3}, [r0], r3 @load row 4 in source 1
+ vld1.8 {q11}, [r1], r4 @load row 4 in source 2
+ vrhadd.u8 q2, q2, q10
+ vld1.8 {q4}, [r0], r3 @load row 5 in source 1
+ vld1.8 {q12}, [r1], r4 @load row 5 in source 2
+ vrhadd.u8 q3, q3, q11
+ vld1.8 {q5}, [r0], r3 @load row 6 in source 1
+ vld1.8 {q13}, [r1], r4 @load row 6 in source 2
+ vrhadd.u8 q4, q4, q12
+ vld1.8 {q6}, [r0], r3 @load row 7 in source 1
+ vld1.8 {q14}, [r1], r4 @load row 7 in source 2
+ vrhadd.u8 q5, q5, q13
+ vld1.8 {q7}, [r0], r3 @load row 8 in source 1
+ vld1.8 {q15}, [r1], r4 @load row 8 in source 2
+
+ vrhadd.u8 q6, q6, q14
+ vst1.8 {q0}, [r2], r5 @store row 1 in destination
+ vst1.8 {q1}, [r2], r5 @store row 2 in destination
+ vrhadd.u8 q7, q7, q15
+ vst1.8 {q2}, [r2], r5 @store row 3 in destination
+ vst1.8 {q3}, [r2], r5 @store row 4 in destination
+ subs r6, r6, #8 @decrement ht by 8
+ vst1.8 {q4}, [r2], r5 @store row 5 in destination
+ vst1.8 {q5}, [r2], r5 @store row 6 in destination
+ vst1.8 {q6}, [r2], r5 @store row 7 in destination
+ vst1.8 {q7}, [r2], r5 @store row 8 in destination
+
+ bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r7, r15} @Reload the registers from sp
+
+
+@*******************************************************************************
+@* @function
+@* ih264_default_weighted_pred_chroma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates their rounded-average and
+@* stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src1:
+@* UWORD8 Pointer to the buffer containing the first input block.
+@*
+@* @param[in] pu1_src2:
+@* UWORD8 Pointer to the buffer containing the second input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the first input buffer
+@*
+@* @param[in] src_strd2
+@* Stride of the second input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+@*
+@*******************************************************************************
+@*/
+@void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => ht (r6)
+@ [sp+12] => wd (r7)
+@
+
+
+ .global ih264_default_weighted_pred_chroma_a9q
+
+ih264_default_weighted_pred_chroma_a9q:
+
+ stmfd sp!, {r4-r7, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #32] @Load wd
+ ldr r4, [sp, #20] @Load src_strd2
+ ldr r5, [sp, #24] @Load dst_strd
+ cmp r7, #8
+ ldr r6, [sp, #28] @Load ht
+ vpush {d8-d15}
+ beq loop_8_uv @branch if wd is 8
+ cmp r7, #4
+ beq loop_4_uv @branch if wd is 4
+
+loop_2_uv: @each iteration processes two rows
+
+ vld1.32 d0[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d0[1], [r0], r3 @load row 2 in source 1
+
+ vld1.32 d1[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d1[1], [r1], r4 @load row 2 in source 2
+
+ vrhadd.u8 d0, d0, d1
+
+ subs r6, r6, #2 @decrement ht by 2
+ vst1.32 d0[0], [r2], r5 @store row 1 in destination
+ vst1.32 d0[1], [r2], r5 @store row 2 in destination
+
+ bgt loop_2_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_4_uv: @each iteration processes two rows
+
+ vld1.8 d0, [r0], r3 @load row 1 in source 1
+ vld1.8 d2, [r1], r4 @load row 1 in source 2
+ vld1.8 d1, [r0], r3 @load row 2 in source 1
+ vrhadd.u8 d0, d0, d2
+ vld1.8 d3, [r1], r4 @load row 2 in source 2
+
+ vrhadd.u8 d1, d1, d3
+ vst1.8 d0, [r2], r5 @store row 1 in destination
+ subs r6, r6, #2 @decrement ht by 2
+ vst1.8 d1, [r2], r5 @store row 2 in destination
+
+ bgt loop_4_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_8_uv: @each iteration processes four rows
+
+ vld1.8 {q0}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q4}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q1}, [r0], r3 @load row 2 in source 1
+ vrhadd.u8 q0, q0, q4
+ vld1.8 {q5}, [r1], r4 @load row 2 in source 2
+ vld1.8 {q2}, [r0], r3 @load row 3 in source 1
+ vrhadd.u8 q1, q1, q5
+ vld1.8 {q6}, [r1], r4 @load row 3 in source 2
+ vld1.8 {q3}, [r0], r3 @load row 4 in source 1
+ vrhadd.u8 q2, q2, q6
+ vld1.8 {q7}, [r1], r4 @load row 4 in source 2
+
+ vst1.8 {q0}, [r2], r5 @store row 1 in destination
+ vrhadd.u8 q3, q3, q7
+ vst1.8 {q1}, [r2], r5 @store row 2 in destination
+ subs r6, r6, #4 @decrement ht by 4
+ vst1.8 {q2}, [r2], r5 @store row 3 in destination
+ vst1.8 {q3}, [r2], r5 @store row 4 in destination
+
+ bgt loop_8_uv @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r7, r15} @Reload the registers from sp
+
+
diff --git a/common/arm/ih264_ihadamard_scaling_a9.s b/common/arm/ih264_ihadamard_scaling_a9.s
new file mode 100755
index 0000000..687099a
--- /dev/null
+++ b/common/arm/ih264_ihadamard_scaling_a9.s
@@ -0,0 +1,250 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_ihadamard_scaling_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for inverse hadamard transform on 4x4 DC outputs
+@ * of 16x16 intra-prediction
+@ *
+@ * @author
+@ * Mohit
+@ *
+@ * @par List of Functions:
+@ * - ih264_ihadamard_scaling_4x4_a9()
+@ * - ih264_ihadamard_scaling_2x2_uv_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
+@ * of a 16x16 intra prediction macroblock, and then performs scaling.
+@ *
+@ * @par Description:
+@ * The DC coefficients pass through a 2-stage inverse hadamard transform.
+@ * This inverse transformed content is then scaled based on the Qp value.
+@ *
+@ * @param[in] pi2_src
+@ * input 4x4 block of DC coefficients
+@ *
+@ * @param[out] pi2_out
+@ * output 4x4 block
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * pointer to scaling list
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * pointer to weight matrix
+@ *
+@ * @param[in] u4_qp_div_6
+@ * Floor (qp/6)
+@ *
+@ * @param[in] pi4_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *
+@ *******************************************************************************
+@ */
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
+@ WORD16* pi2_out,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32* pi4_tmp)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pi2_out
+@r2 => *pu2_iscal_mat
+@r3 => *pu2_weigh_mat
+@r4 => u4_qp_div_6
+
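+@ For reference, a compact C sketch of what this routine computes
+@ (illustrative only; t[] is the output of the 2-D inverse Hadamard and the
+@ variable names are ours):
+@
+@     // row butterfly on each row s[0..3] of the 4x4 DC block
+@     x0 = s[0] + s[3];  x1 = s[1] + s[2];
+@     x2 = s[1] - s[2];  x3 = s[0] - s[3];
+@     t[0] = x0 + x1;  t[1] = x3 + x2;  t[2] = x0 - x1;  t[3] = x3 - x2;
+@     // the same butterfly is then applied down the columns, followed by
+@     // scaling of every transformed coefficient:
+@     out[i] = (((t[i] * pu2_iscal_mat[0] * pu2_weigh_mat[0]) << u4_qp_div_6) + 32) >> 6;
+@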
+.text
+.p2align 2
+
+ .global ih264_ihadamard_scaling_4x4_a9
+
+ih264_ihadamard_scaling_4x4_a9:
+
+@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
+@If the macro value changes, the instruction needs to be changed accordingly.
+@Only one shift is done in the horizontal inverse because:
+@if u4_qp_div_6 is less than 4, the shift value is negative and the left shift becomes a right shift; in this case rnd_factor is non-zero.
+@if u4_qp_div_6 is greater than 4, the shift value is positive and a left shift is done; here rnd_factor is 0.
+
+ stmfd sp!, {r4-r12, r14} @ stack stores the values of the arguments
+ ldr r4, [sp, #40] @ Loads u4_qp_div_6
+ vdup.s32 q10, r4 @ Populate the u4_qp_div_6 in Q10
+ ldrh r6, [r3] @ load pu2_weight_mat[0] , H for unsigned halfword load
+ ldrh r7, [r2] @ load pu2_iscal_mat[0] , H for unsigned halfword load
+ mul r6, r6, r7 @ pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ vdup.s32 q9, r6 @ Populate pu2_iscal_mat[0]*pu2_weigh_mat[0] 32-bit in Q9
+ vpush {d8-d15}
+@=======================INVERSE HADAMARD TRANSFORM================================
+
+ vld4.s16 {d0, d1, d2, d3}, [r0] @load x4,x5,x6,x7
+ vaddl.s16 q12, d0, d3 @x0 = x4 + x7
+ vaddl.s16 q13, d1, d2 @x1 = x5 + x6
+ vsubl.s16 q14, d1, d2 @x2 = x5 - x6
+ vsubl.s16 q15, d0, d3 @x3 = x4 - x7
+
+ vadd.s32 q2, q12, q13 @pi4_tmp_ptr[0] = x0 + x1
+ vadd.s32 q3, q15, q14 @pi4_tmp_ptr[1] = x3 + x2
+ vsub.s32 q4, q12, q13 @pi4_tmp_ptr[2] = x0 - x1
+ vsub.s32 q5, q15, q14 @pi4_tmp_ptr[3] = x3 - x2
+
+ vtrn.32 q2, q3 @Transpose the register for vertical transform
+ vtrn.32 q4, q5
+
+ vswp d5, d8 @Q2 = x4, Q4 = x6
+ vswp d7, d10 @Q3 = x5, Q5 = x7
+
+
+ vadd.s32 q12, q2, q5 @x0 = x4+x7
+ vadd.s32 q13, q3, q4 @x1 = x5+x6
+ vsub.s32 q14, q3, q4 @x2 = x5-x6
+ vsub.s32 q15, q2, q5 @x3 = x4-x7
+
+ vadd.s32 q0, q12, q13 @pi4_tmp_ptr[0] = x0 + x1
+ vadd.s32 q1, q15, q14 @pi4_tmp_ptr[1] = x3 + x2
+ vsub.s32 q2, q12, q13 @pi4_tmp_ptr[2] = x0 - x1
+ vsub.s32 q3, q15, q14 @pi4_tmp_ptr[3] = x3 - x2
+
+
+ vmul.s32 q0, q0, q9 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ vmul.s32 q1, q1, q9 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ vmul.s32 q2, q2, q9 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ vmul.s32 q3, q3, q9 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+ vshl.s32 q0, q0, q10 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+ vshl.s32 q1, q1, q10 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+ vshl.s32 q2, q2, q10 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+ vshl.s32 q3, q3, q10 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+
+ vqrshrn.s32 d0, q0, #0x6 @ D0 = c[i] = ((q[i] + 32) >> 6) where i = 0..3
+ vqrshrn.s32 d1, q1, #0x6 @ D1 = c[i] = ((q[i] + 32) >> 6) where i = 4..7
+ vqrshrn.s32 d2, q2, #0x6 @ D2 = c[i] = ((q[i] + 32) >> 6) where i = 8..11
+ vqrshrn.s32 d3, q3, #0x6 @ D3 = c[i] = ((q[i] + 32) >> 6) where i = 12..15
+
+ vst1.s16 {d0, d1, d2, d3}, [r1] @store the 4x4 output block
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+
+@ *******************************************************************************
+@ */
+@ * @brief This function performs a 2x2 inverse hadamard transform for chroma block
+@ *
+@ * @par Description:
+@ * The DC coefficients pass through a 2-stage inverse hadamard transform.
+@ * This inverse transformed content is then scaled based on the Qp value.
+@ * Both the U and V DC blocks are processed.
+@ *
+@ * @param[in] pi2_src
+@ * input 1x8 block of coefficients; the first 4 are from U and the next 4 from V
+@ *
+@ * @param[out] pi2_out
+@ * output 1x8 block
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * pointer to scaling list
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * pointer to weight matrix
+@ *
+@ * @param[in] u4_qp_div_6
+@ * Floor (qp/6)
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *
+@ *******************************************************************************
+@ */
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
+@ WORD16* pi2_out,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+
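+@ For reference, a rough C sketch for one 2x2 chroma DC block (illustrative
+@ only; the same steps run on the U and V halves in parallel):
+@
+@     x0 = c00 + c01;  x1 = c00 - c01;      // horizontal butterfly
+@     x2 = c10 + c11;  x3 = c10 - c11;
+@     y00 = x0 + x2;   y10 = x0 - x2;       // vertical butterfly
+@     y01 = x1 + x3;   y11 = x1 - x3;
+@     out = (y * pu2_iscal_mat[0] * pu2_weigh_mat[0]) << (u4_qp_div_6 - 5);
+@
+@ i.e. the ((dc * scale) << (qP / 6)) >> 5 chroma DC scaling folded into a
+@ single signed shift by (qP / 6 - 5).
+@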
+ .global ih264_ihadamard_scaling_2x2_uv_a9
+ih264_ihadamard_scaling_2x2_uv_a9:
+
+@Registers used
+@ r0 : *pi2_src
+@ r1 : *pi2_out
+@ r2 : *pu2_iscal_mat
+@ r3 : *pu2_weigh_mat
+
+ vld1.u16 d26[0], [r2]
+ vld1.u16 d27[0], [r3]
+ vmull.u16 q15, d26, d27 @pu2_iscal_mat[0] * pu2_weigh_mat[0]
+ vdup.u32 q15, d30[0]
+
+ vld1.u16 d28[0], [sp] @load qp/6
+
+ vpush {d8-d15}
+
+ vmov.u16 d29, #5
+ vsubl.u16 q14, d28, d29 @qp/6 - 5
+ vdup.s32 q14, d28[0]
+
+ vld2.s16 {d0, d1}, [r0] @load 8 dc coeffs
+ @i2_x4,i2_x6,i2_y4,i2_y6 -> d0
+ @i2_x5,i2_x7,i2_y5,i2_y7 -> d1
+
+ vaddl.s16 q1, d0, d1 @ i4_x0 = i4_x4 + i4_x5;...x2
+ vsubl.s16 q2, d0, d1 @ i4_x1 = i4_x4 - i4_x5;...x3
+
+ vtrn.s32 q1, q2 @i4_x0 i4_x1 -> q1
+
+ vadd.s32 q3, q1, q2 @i4_x4 = i4_x0+i4_x2;.. i4_x5
+ vsub.s32 q1, q1, q2 @i4_x6 = i4_x0-i4_x2;.. i4_x7
+
+ vmul.s32 q5, q3, q15
+ vmul.s32 q6, q1, q15
+
+ vshl.s32 q7, q5, q14
+ vshl.s32 q8, q6, q14
+
+ vmovn.s32 d18, q7 @i4_x4 i4_x5 i4_y4 i4_y5
+ vmovn.s32 d19, q8 @i4_x6 i4_x7 i4_y6 i4_y7
+
+ vst2.s32 {d18-d19}, [r1]
+
+ vpop {d8-d15}
+ bx lr
+
+
diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s
new file mode 100755
index 0000000..afd2860
--- /dev/null
+++ b/common/arm/ih264_inter_pred_chroma_a9q.s
@@ -0,0 +1,254 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_chroma_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Interprediction chroma filter
+@*
+@* @par Description:
+@* Applies filtering to chroma samples as mentioned in
+@* sec 8.4.2.2.2 titled "chroma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] u1_dx
+@* dx value where the sample is to be produced (refer sec 8.4.2.2.2)
+@*
+@* @param[in] u1_dy
+@* dy value where the sample is to be produced (refer sec 8.4.2.2.2)
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_chroma(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ UWORD8 u1_dx,
+@ UWORD8 u1_dy,
+@ WORD32 ht,
+@ WORD32 wd)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => u1_dx
+@ r5 => u1_dy
+@ r6 => height
+@ r7 => width
+@
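+@ For reference, a rough C sketch of the bilinear chroma interpolation this
+@ routine implements (illustrative only; A, B, C and D are the four
+@ neighbouring samples of the same chroma component, which sit two bytes
+@ apart horizontally since U and V are interleaved):
+@
+@     pred = ((8 - u1_dx) * (8 - u1_dy) * A
+@             +      u1_dx * (8 - u1_dy) * B
+@             + (8 - u1_dx) *      u1_dy * C
+@             +      u1_dx *      u1_dy * D + 32) >> 6;
+@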
+.text
+.p2align 2
+
+ .global ih264_inter_pred_chroma_a9q
+
+ih264_inter_pred_chroma_a9q:
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104]
+ ldr r5, [sp, #108]
+ ldr r6, [sp, #112]
+ ldr r7, [sp, #116]
+
+ rsb r8, r4, #8 @8-u1_dx
+ rsb r9, r5, #8 @8-u1_dy
+ mul r10, r8, r9
+ mul r11, r4, r9
+
+ vdup.u8 d28, r10
+ vdup.u8 d29, r11
+
+ mul r10, r8, r5
+ mul r11, r4, r5
+
+ vdup.u8 d30, r10
+ vdup.u8 d31, r11
+
+ subs r12, r7, #2 @if wd=2 branch to loop_2
+ beq loop_2
+ subs r12, r7, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_8:
+ sub r6, #1
+ vld1.8 {d0, d1, d2}, [r0], r2 @ Load row0
+ vld1.8 {d5, d6, d7}, [r0], r2 @ Load row1
+ vext.8 d3, d0, d1, #2
+ vext.8 d8, d5, d6, #2
+
+ vmull.u8 q5, d0, d28
+ vmlal.u8 q5, d5, d30
+ vmlal.u8 q5, d3, d29
+ vmlal.u8 q5, d8, d31
+ vext.8 d9, d6, d7, #2
+ vext.8 d4, d1, d2, #2
+
+inner_loop_8:
+ vmull.u8 q6, d6, d30
+ vmlal.u8 q6, d1, d28
+ vmlal.u8 q6, d9, d31
+ vmlal.u8 q6, d4, d29
+ vmov d0, d5
+ vmov d3, d8
+
+ vqrshrun.s16 d14, q5, #6
+ vmov d1, d6
+ vmov d4, d9
+
+ vld1.8 {d5, d6, d7}, [r0], r2 @ Load row1
+ vqrshrun.s16 d15, q6, #6
+
+ vext.8 d8, d5, d6, #2
+ subs r6, #1
+ vext.8 d9, d6, d7, #2
+ vst1.8 {q7}, [r1], r3 @ Store dest row
+
+ vmull.u8 q5, d0, d28
+ vmlal.u8 q5, d5, d30
+ vmlal.u8 q5, d3, d29
+ vmlal.u8 q5, d8, d31
+ bne inner_loop_8
+
+ vmull.u8 q6, d6, d30
+ vmlal.u8 q6, d1, d28
+ vmlal.u8 q6, d9, d31
+ vmlal.u8 q6, d4, d29
+
+ vqrshrun.s16 d14, q5, #6
+ vqrshrun.s16 d15, q6, #6
+
+ vst1.8 {q7}, [r1], r3 @ Store dest row
+
+ b end_func
+
+loop_4:
+ sub r6, #1
+ vld1.8 {d0, d1}, [r0], r2 @ Load row0
+ vld1.8 {d2, d3}, [r0], r2 @ Load row1
+ vext.8 d1, d0, d1, #2
+ vext.8 d3, d2, d3, #2
+
+ vmull.u8 q2, d2, d30
+ vmlal.u8 q2, d0, d28
+ vmlal.u8 q2, d3, d31
+ vmlal.u8 q2, d1, d29
+
+inner_loop_4:
+ subs r6, #1
+ vmov d0, d2
+ vmov d1, d3
+
+ vld1.8 {d2, d3}, [r0], r2 @ Load row1
+ vqrshrun.s16 d6, q2, #6
+
+ vext.8 d3, d2, d3, #2
+ vst1.8 {d6}, [r1], r3 @ Store dest row
+
+ vmull.u8 q2, d0, d28
+ vmlal.u8 q2, d2, d30
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q2, d3, d31
+ bne inner_loop_4
+
+ vqrshrun.s16 d6, q2, #6
+ vst1.8 {d6}, [r1], r3 @ Store dest row
+
+ b end_func
+
+loop_2:
+ vld1.8 {d0}, [r0], r2 @ Load row0
+ vext.8 d1, d0, d0, #2
+ vld1.8 {d2}, [r0], r2 @ Load row1
+ vext.8 d3, d2, d2, #2
+ vmull.u8 q2, d0, d28
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q2, d2, d30
+ vmlal.u8 q2, d3, d31
+ vld1.8 {d6}, [r0] @ Load row2
+ vqrshrun.s16 d4, q2, #6
+ vext.8 d7, d6, d6, #2
+ vst1.32 d4[0], [r1], r3 @ Store dest row0
+ vmull.u8 q4, d2, d28
+ vmlal.u8 q4, d3, d29
+ vmlal.u8 q4, d6, d30
+ vmlal.u8 q4, d7, d31
+ subs r6, #2
+ vqrshrun.s16 d8, q4, #6
+ vst1.32 d8[0], [r1], r3 @ Store dest row1
+ bne loop_2 @ loop until all rows are processed
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @ Restoring registers from stack
+
diff --git a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
new file mode 100755
index 0000000..ea6bba0
--- /dev/null
+++ b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
@@ -0,0 +1,245 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Interprediction luma filter for horizontal input
+@*
+@* @par Description:
+@* Applies a 6 tap horizontal filter .The output is clipped to 8 bits
+@* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@ @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_horz (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd )
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+
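+@ For reference, a rough C sketch of the per-pixel 6-tap half-pel filter this
+@ routine implements (illustrative only; a0..a5 are six consecutive
+@ horizontal neighbours centred between a2 and a3):
+@
+@     i2_tmp   = a0 - 5 * a1 + 20 * a2 + 20 * a3 - 5 * a4 + a5;
+@     *pu1_dst = CLIP_U8((i2_tmp + 16) >> 5);
+@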
+.text
+.p2align 2
+
+
+ .global ih264_inter_pred_luma_horz_a9q
+
+ih264_inter_pred_luma_horz_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r5, [sp, #104] @Loads ht
+ sub r0, r0, #2 @pu1_src-2
+ ldr r6, [sp, #108] @Loads wd
+ vmov.i8 d0, #5 @filter coeff
+ subs r12, r6, #8 @if wd=8 branch to loop_8
+ vmov.i8 d1, #20 @filter coeff
+ beq loop_8
+
+ subs r12, r6, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_16: @when wd=16
+ @// Processing row0 and row1
+ vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0 ;for checking loop
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1
+ vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0)
+ vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0)
+ vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1)
+ vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vext.8 d27, d6, d7, #2 @//extract a[2] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1)
+ vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0)
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1)
+ vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vext.8 d27, d6, d7, #1 @//extract a[1] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
+ vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+ vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
+ vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row2)
+ vst1.8 {d20, d21}, [r1], r3 @//Store dest row0
+ vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row2)
+ vqrshrun.s16 d24, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
+ vst1.8 {d23, d24}, [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+
+ beq end_func
+ b loop_16 @ loop if height == 8 or 16
+
+loop_8:
+@// Processing row0 and row1
+ vld1.8 {d5, d6}, [r0], r2 @// Load row0
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row0)
+ vld1.8 {d2, d3}, [r0], r2 @// Load row1
+ vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row0)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row1)
+ vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row0)
+ vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row0)
+ vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row0)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row0)
+ vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row1)
+ vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row1)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row1)
+ vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row1)
+ vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row1)
+ vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a3 (column1,row1)
+ vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vst1.8 {d23}, [r1], r3 @//Store dest row0
+ vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vst1.8 {d20}, [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+
+ beq end_func @ Branch if height==4
+
+ b loop_8 @looping if height =8 or 16
+
+loop_4:
+ vld1.8 {d5, d6}, [r0], r2 @// Load row0
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row0)
+ vld1.8 {d2, d3}, [r0], r2 @// Load row1
+ vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row0)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row1)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row0)
+ vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row0)
+ vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row0)
+ vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row0)
+ vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row1)
+ vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row1)
+ vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row1)
+ vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row1)
+ vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row1)
+ vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a3 (column1,row1)
+ vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vst1.32 d23[0], [r1], r3 @//Store dest row0
+ vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vst1.32 d20[0], [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+ beq end_func
+
+ b loop_4
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
new file mode 100755
index 0000000..5b29e02
--- /dev/null
+++ b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
@@ -0,0 +1,301 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_filters_luma_vert_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_vert_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * Interprediction luma filter for vertical input
+@ *
+@ * @par Description:
+@ * Applies a 6 tap vertical filter. The output is clipped to 8 bits
+@ * sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@ *
+@ * @param[in] pu1_src
+@ * UWORD8 pointer to the source
+@ *
+@ * @param[out] pu1_dst
+@ * UWORD8 pointer to the destination
+@ *
+@ * @param[in] src_strd
+@ * integer source stride
+@ *
+@ * @param[in] dst_strd
+@ * integer destination stride
+@ *
+@ * @param[in] ht
+@ * integer height of the array
+@ *
+@ * @param[in] wd
+@ * integer width of the array
+@ *
+@ * @returns
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+
+@void ih264_inter_pred_luma_vert (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd )
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+
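+@ Equivalent operation in C (illustrative sketch only, not the exact code from
+@ ih264_inter_pred_filters.c; CLIP_U8() denotes clipping to 0..255, and the
+@ indexing below is relative to the unadjusted pu1_src, whereas the assembly
+@ pre-decrements the pointer by 2*src_strd):
+@
+@ for(row = 0; row < ht; row++)
+@ {
+@     for(col = 0; col < wd; col++)
+@     {
+@         WORD32 sum =      pu1_src[col - 2 * src_strd] - 5 * pu1_src[col - src_strd]
+@                    + 20 * pu1_src[col]                + 20 * pu1_src[col + src_strd]
+@                    -  5 * pu1_src[col + 2 * src_strd] +      pu1_src[col + 3 * src_strd];
+@         pu1_dst[col] = CLIP_U8((sum + 16) >> 5);
+@     }
+@     pu1_src += src_strd;
+@     pu1_dst += dst_strd;
+@ }
+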
+.text
+.p2align 2
+
+
+ .global ih264_inter_pred_luma_vert_a9q
+
+ih264_inter_pred_luma_vert_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r5, [sp, #104] @Loads ht
+ sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd
+ ldr r6, [sp, #108] @Loads wd
+ vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11
+
+ subs r12, r6, #8 @if wd=8 branch to loop_8
+ vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12
+ beq loop_8
+
+ subs r12, r6, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_16: @when wd=16
+
+ vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0]
+ vaddl.u8 q6, d4, d6 @ temp1 = src[2_0] + src[3_0]
+ vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q7, d0, d10 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q7, q6, q11 @ temp += temp1 * 20
+ vaddl.u8 q10, d1, d11 @ temp4 = src[0_8] + src[5_8]
+ vaddl.u8 q9, d5, d7 @ temp3 = src[2_8] + src[3_8]
+ vmla.u16 q10, q9, q11 @ temp4 += temp3 * 20
+ vld1.u32 {q0}, [r0], r2
+ vaddl.u8 q13, d3, d9 @ temp5 = src[1_8] + src[4_8]
+ vaddl.u8 q6, d6, d8
+ vmls.u16 q7, q8, q12 @ temp -= temp2 * 5
+ vaddl.u8 q8, d2, d0
+ vaddl.u8 q9, d4, d10
+ vmla.u16 q8, q6, q11
+ vmls.u16 q10, q13, q12 @ temp4 -= temp5 * 5
+ vaddl.u8 q13, d5, d11
+ vaddl.u8 q6, d7, d9
+ vqrshrun.s16 d30, q7, #5 @ dst[0_0] = CLIP_U8((temp +16) >> 5)
+ vaddl.u8 q7, d3, d1
+ vld1.u32 {q1}, [r0], r2
+ vmla.u16 q7, q6, q11
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d31, q10, #5 @ dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+ vaddl.u8 q9, d4, d2
+ vaddl.u8 q6, d8, d10
+
+ vst1.u32 {q15}, [r1], r3 @ Vector store to dst[0_0]
+ vmla.u16 q9, q6, q11
+ vaddl.u8 q10, d6, d0
+ vmls.u16 q7, q13, q12
+ vqrshrun.s16 d30, q8, #5
+ vaddl.u8 q6, d9, d11
+ vaddl.u8 q8, d5, d3
+ vaddl.u8 q13, d7, d1
+ vmla.u16 q8, q6, q11
+ vmls.u16 q9, q10, q12
+ vld1.u32 {q2}, [r0], r2
+
+ vqrshrun.s16 d31, q7, #5
+ vaddl.u8 q6, d10, d0
+ vaddl.u8 q7, d6, d4
+ vaddl.u8 q10, d8, d2
+ vmla.u16 q7, q6, q11
+ vmls.u16 q8, q13, q12
+ vst1.u32 {q15}, [r1], r3 @store row 1
+ vqrshrun.s16 d30, q9, #5
+ vaddl.u8 q9, d7, d5
+ vaddl.u8 q6, d11, d1
+ vmla.u16 q9, q6, q11
+ vaddl.u8 q13, d9, d3
+ vmls.u16 q7, q10, q12
+
+ vqrshrun.s16 d31, q8, #5
+ vmls.u16 q9, q13, q12
+ vaddl.u8 q6, d0, d2 @ temp1 = src[2_0] + src[3_0]
+ vst1.u32 {q15}, [r1], r3 @store row 2
+ vaddl.u8 q8, d10, d4 @ temp2 = src[1_0] + src[4_0]
+ vaddl.u8 q10, d9, d7 @ temp4 = src[0_8] + src[5_8]
+ vqrshrun.s16 d30, q7, #5
+ vaddl.u8 q13, d5, d11 @ temp5 = src[1_8] + src[4_8]
+ vaddl.u8 q7, d8, d6 @ temp = src[0_0] + src[5_0]
+ vqrshrun.s16 d31, q9, #5
+ vmla.u16 q7, q6, q11 @ temp += temp1 * 20
+ vaddl.u8 q9, d1, d3 @ temp3 = src[2_8] + src[3_8]
+ vst1.u32 {q15}, [r1], r3 @store row 3
+ subs r5, r5, #4 @ 4 rows processed, decrement by 4
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_16 @ looping if height = 8 or 16
+
+loop_8:
+@// Processing row0 and row1
+
+ vld1.u32 d0, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 d1, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 d2, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 d3, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 d4, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 d5, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q4, q3, q11 @ temp += temp1 * 20
+ vld1.u32 d6, [r0], r2
+ vaddl.u8 q7, d3, d4
+ vaddl.u8 q8, d1, d6
+ vaddl.u8 q9, d2, d5
+ vmls.u16 q4, q5, q12 @ temp -= temp2 * 5
+ vmla.u16 q8, q7, q11
+ vld1.u32 d7, [r0], r2
+ vaddl.u8 q10, d4, d5
+ vaddl.u8 q6, d2, d7
+ vaddl.u8 q5, d3, d6
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+ vmla.u16 q6, q10, q11
+ vld1.u32 d0, [r0], r2
+ vaddl.u8 q7, d5, d6
+ vqrshrun.s16 d27, q8, #5
+ vaddl.u8 q10, d3, d0
+ vmls.u16 q6, q5, q12
+ vst1.u32 d26, [r1], r3 @ Vector store to dst[0_0]
+ vaddl.u8 q9, d4, d7
+ vmla.u16 q10, q7, q11
+ vst1.u32 d27, [r1], r3
+ vqrshrun.s16 d28, q6, #5
+ vst1.u32 d28, [r1], r3
+ vmls.u16 q10, q9, q12
+ vqrshrun.s16 d29, q10, #5
+ vst1.u32 d29, [r1], r3 @store row 3
+
+ subs r5, r5, #4 @ 4 rows processed, decrement by 4
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_8 @looping if height == 8 or 16
+
+
+loop_4:
+@// Processing row0 and row1
+
+ vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 d2[0], [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 d3[0], [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 d4[0], [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 d5[0], [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q4, q3, q11 @ temp += temp1 * 20
+ vld1.u32 d6[0], [r0], r2
+ vaddl.u8 q7, d3, d4
+ vaddl.u8 q8, d1, d6
+ vaddl.u8 q9, d2, d5
+ vmls.u16 q4, q5, q12 @ temp -= temp2 * 5
+ vld1.u32 d7[0], [r0], r2
+ vmla.u16 q8, q7, q11
+ vaddl.u8 q10, d4, d5
+ vaddl.u8 q6, d2, d7
+ vaddl.u8 q5, d3, d6
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+ vmla.u16 q6, q10, q11
+ vld1.u32 d0[0], [r0], r2
+ vaddl.u8 q7, d5, d6
+ vqrshrun.s16 d27, q8, #5
+ vaddl.u8 q10, d3, d0
+ vmls.u16 q6, q5, q12
+ vst1.u32 d26[0], [r1], r3 @ Vector store to dst[0_0]
+ vaddl.u8 q9, d4, d7
+ vmla.u16 q10, q7, q11
+ vst1.u32 d27[0], [r1], r3
+ vqrshrun.s16 d28, q6, #5
+ vst1.u32 d28[0], [r1], r3
+ vmls.u16 q10, q9, q12
+ vqrshrun.s16 d29, q10, #5
+ vst1.u32 d29[0], [r1], r3 @store row 3
+
+ subs r5, r5, #8
+ subeq r0, r0, r2, lsl #2
+ subeq r0, r0, r2
+ beq loop_4 @ Loop if height==8
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_bilinear_a9q.s b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
new file mode 100755
index 0000000..6a3c83d
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
@@ -0,0 +1,398 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_bilinear_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_bilinear_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+
+@/**
+@ *******************************************************************************
+@ * function:ih264_inter_pred_luma_bilinear
+@ *
+@* @brief
+@* This routine applies the bilinear filter to the predictors.
+@* The filtering operation is described in
+@* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @par Description:
+@* @note
+@* This function is called to obtain pixels lying at the following
+@* locations: (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2), (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
+@* The function averages corresponding samples from the two input arrays.
+@*
+@*
+@* @param[in] pu1_src1:
+@* UWORD8 Pointer to the buffer containing the first input array.
+@*
+@* @param[in] pu1_src2:
+@* UWORD8 Pointer to the buffer containing the second input array.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output of bilinear filter is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the first input buffer
+@*
+@* @param[in] src_strd2
+@* Stride of the second input buffer
+@*
+@* @param[in] dst_strd
+@* integer destination stride of pu1_dst
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 height,
+@ WORD32 width)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src1
+@ r1 => *pu1_src2
+@ r2 => *pu1_dst
+@ r3 => src_strd1
+@ r4 => src_strd2
+@ r5 => dst_strd
+@ r6 => height
+@ r7 => width
+@
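+@ Equivalent operation in C (illustrative sketch only, not the exact code from
+@ ih264_inter_pred_filters.c): each output sample is the rounded average of the
+@ corresponding samples of the two input arrays, matching the vaddl/vqrshrun #1
+@ pairs used below.
+@
+@ for(row = 0; row < height; row++)
+@ {
+@     for(col = 0; col < width; col++)
+@         pu1_dst[col] = (pu1_src1[col] + pu1_src2[col] + 1) >> 1;
+@     pu1_src1 += src_strd1;
+@     pu1_src2 += src_strd2;
+@     pu1_dst  += dst_strd;
+@ }
+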
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_bilinear_a9q
+
+ih264_inter_pred_luma_bilinear_a9q:
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104]
+ ldr r5, [sp, #108] @
+ ldr r6, [sp, #112]
+ ldr r7, [sp, #116]
+
+ subs r12, r7, #4 @if wd=4 branch to loop_4
+ beq loop_4
+ subs r12, r7, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+loop_16: @when wd=16
+
+ vld1.8 {q0}, [r0], r3 @// Load row0 ;src1
+ vld1.8 {q2}, [r1], r4 @// Load row0 ;src2
+ vld1.8 {q1}, [r0], r3 @// Load row1 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.8 {q3}, [r1], r4 @// Load row1 ;src2
+ vaddl.u8 q11, d1, d5
+ vld1.8 {q4}, [r0], r3 @// Load row2 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q5}, [r0], r3 @// Load row3 ;src1
+ vaddl.u8 q13, d3, d7
+ vld1.8 {q6}, [r1], r4 @// Load row2 ;src2
+ vaddl.u8 q8, d8, d12
+ vld1.8 {q7}, [r1], r4 @// Load row3 ;src2
+ vaddl.u8 q9, d9, d13
+ vqrshrun.s16 d28, q10, #1
+ vqrshrun.s16 d29, q11, #1
+ vaddl.u8 q10, d10, d14
+ vqrshrun.s16 d30, q12, #1
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row0
+ vaddl.u8 q11, d11, d15
+ vst1.8 {q15}, [r2], r5 @//Store dest row1
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {q0}, [r0], r3 @// Load row4 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {q1}, [r0], r3 @// Load row5 ;src1
+ vqrshrun.s16 d30, q10, #1
+ vld1.8 {q2}, [r1], r4 @// Load row4 ;src2
+ vqrshrun.s16 d31, q11, #1
+ vld1.8 {q3}, [r1], r4 @// Load row5 ;src2
+ vaddl.u8 q10, d0, d4
+ vst1.8 {q14}, [r2], r5 @//Store dest row2
+ vaddl.u8 q13, d3, d7
+ vst1.8 {q15}, [r2], r5 @//Store dest row3
+ vaddl.u8 q11, d1, d5
+ vld1.8 {q4}, [r0], r3 @// Load row6 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q5}, [r0], r3 @// Load row7 ;src1
+ vqrshrun.s16 d28, q10, #1
+ vld1.8 {q6}, [r1], r4 @// Load row6 ;src2
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {q7}, [r1], r4 @// Load row7 ;src2
+ vaddl.u8 q8, d8, d12
+ vaddl.u8 q9, d9, d13
+ vaddl.u8 q10, d10, d14
+ vqrshrun.s16 d30, q12, #1
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row4
+ vaddl.u8 q11, d11, d15
+ vst1.8 {q15}, [r2], r5 @//Store dest row5
+ vqrshrun.s16 d28, q8, #1
+ vqrshrun.s16 d30, q10, #1
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {q2}, [r1], r4 @// Load row8 ;src2
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row6
+ subs r12, r6, #8
+ vst1.8 {q15}, [r2], r5 @//Store dest row7
+
+ beq end_func @ end function if ht=8
+
+ vld1.8 {q0}, [r0], r3 @// Load row8 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.8 {q1}, [r0], r3 @// Load row9 ;src1
+ vaddl.u8 q11, d1, d5
+ vld1.8 {q3}, [r1], r4 @// Load row9 ;src2
+ vqrshrun.s16 d28, q10, #1
+ vld1.8 {q4}, [r0], r3 @// Load row10 ;src1
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {q5}, [r0], r3 @// Load row11 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q6}, [r1], r4 @// Load row10 ;src2
+ vaddl.u8 q13, d3, d7
+ vld1.8 {q7}, [r1], r4 @// Load row11 ;src2
+ vaddl.u8 q8, d8, d12
+ vaddl.u8 q9, d9, d13
+ vaddl.u8 q10, d10, d14
+ vqrshrun.s16 d30, q12, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row8
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {q15}, [r2], r5 @//Store dest row9
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {q0}, [r0], r3 @// Load row12 ;src1
+ vaddl.u8 q11, d11, d15
+ vld1.8 {q1}, [r0], r3 @// Load row13 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {q2}, [r1], r4 @// Load row12 ;src2
+ vqrshrun.s16 d30, q10, #1
+ vld1.8 {q3}, [r1], r4 @// Load row13 ;src2
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row10
+ vaddl.u8 q10, d0, d4
+ vst1.8 {q15}, [r2], r5 @//Store dest row11
+ vaddl.u8 q11, d1, d5
+ vld1.8 {q4}, [r0], r3 @// Load row14 ;src1
+ vaddl.u8 q13, d3, d7
+ vld1.8 {q5}, [r0], r3 @// Load row15 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q6}, [r1], r4 @// Load row14 ;src2
+ vaddl.u8 q8, d8, d12
+ vld1.8 {q7}, [r1], r4 @// Load row15 ;src2
+ vaddl.u8 q9, d9, d13
+ vqrshrun.s16 d28, q10, #1
+ vqrshrun.s16 d29, q11, #1
+ vaddl.u8 q10, d10, d14
+ vst1.8 {q14}, [r2], r5 @//Store dest row12
+ vqrshrun.s16 d30, q12, #1
+ vqrshrun.s16 d31, q13, #1
+ vaddl.u8 q11, d11, d15
+ vst1.8 {q15}, [r2], r5 @//Store dest row13
+ vqrshrun.s16 d28, q8, #1
+ vqrshrun.s16 d29, q9, #1
+ vqrshrun.s16 d30, q10, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row14
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {q15}, [r2], r5 @//Store dest row15
+ b end_func
+
+
+
+loop_8: @wd=8;
+ vld1.8 {d0}, [r0], r3 @// Load row0 ;src1
+ vld1.8 {d4}, [r1], r4 @// Load row0 ;src2
+ vld1.8 {d1}, [r0], r3 @// Load row1 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.8 {d5}, [r1], r4 @// Load row1 ;src2
+ vld1.8 {d2}, [r0], r3 @// Load row2 ;src1
+ vqrshrun.s16 d28, q10, #1
+ vld1.8 {d6}, [r1], r4 @// Load row2 ;src2
+ vaddl.u8 q11, d1, d5
+ vld1.8 {d3}, [r0], r3 @// Load row3 ;src1
+ vaddl.u8 q12, d2, d6
+ vst1.8 {d28}, [r2], r5 @//Store dest row0
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {d7}, [r1], r4 @// Load row3 ;src2
+ vqrshrun.s16 d30, q12, #1
+ vst1.8 {d29}, [r2], r5 @//Store dest row1
+ vaddl.u8 q13, d3, d7
+ vst1.8 {d30}, [r2], r5 @//Store dest row2
+ vqrshrun.s16 d31, q13, #1
+ subs r12, r6, #4
+ vst1.8 {d31}, [r2], r5 @//Store dest row3
+ beq end_func @ end function if ht=4
+
+ vld1.8 {d12}, [r1], r4 @// Load row4 ;src2
+ vld1.8 {d8}, [r0], r3 @// Load row4 ;src1
+ vld1.8 {d9}, [r0], r3 @// Load row5 ;src1
+ vaddl.u8 q8, d8, d12
+ vld1.8 {d13}, [r1], r4 @// Load row5 ;src2
+ vld1.8 {d10}, [r0], r3 @// Load row6;src1
+ vaddl.u8 q9, d9, d13
+ vld1.8 {d14}, [r1], r4 @// Load row6 ;src2
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {d11}, [r0], r3 @// Load row7 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vst1.8 {d28}, [r2], r5 @//Store dest row4
+ vaddl.u8 q10, d10, d14
+ vst1.8 {d29}, [r2], r5 @//Store dest row5
+ vqrshrun.s16 d30, q10, #1
+ vld1.8 {d15}, [r1], r4 @// Load row7 ;src2
+ vaddl.u8 q11, d11, d15
+ vst1.8 {d30}, [r2], r5 @//Store dest row6
+ vqrshrun.s16 d31, q11, #1
+ subs r12, r6, #8
+ vst1.8 {d31}, [r2], r5 @//Store dest row7
+ beq end_func @ end function if ht=8
+
+ vld1.8 {d0}, [r0], r3 @// Load row8 ;src1
+ vld1.8 {d4}, [r1], r4 @// Load row8 ;src2
+ vld1.8 {d1}, [r0], r3 @// Load row9 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.8 {d5}, [r1], r4 @// Load row9 ;src2
+ vld1.8 {d2}, [r0], r3 @// Load row10 ;src1
+ vaddl.u8 q11, d1, d5
+ vld1.8 {d6}, [r1], r4 @// Load row10 ;src2
+ vqrshrun.s16 d28, q10, #1
+ vld1.8 {d3}, [r0], r3 @// Load row11 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {d7}, [r1], r4 @// Load row11 ;src2
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {d8}, [r0], r3 @// Load row12 ;src1
+ vaddl.u8 q13, d3, d7
+ vst1.8 {d28}, [r2], r5 @//Store dest row8
+ vqrshrun.s16 d30, q12, #1
+ vld1.8 {d12}, [r1], r4 @// Load row12 ;src2
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {d29}, [r2], r5 @//Store dest row9
+ vaddl.u8 q8, d8, d12
+ vld1.8 {d9}, [r0], r3 @// Load row13 ;src1
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {d13}, [r1], r4 @// Load row13 ;src2
+ vld1.8 {d10}, [r0], r3 @// Load row14;src1
+ vaddl.u8 q9, d9, d13
+ vld1.8 {d11}, [r0], r3 @// Load row15 ;src1
+ vld1.8 {d14}, [r1], r4 @// Load row14 ;src2
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {d15}, [r1], r4 @// Load row15 ;src2
+ vaddl.u8 q10, d10, d14
+ vst1.8 {d30}, [r2], r5 @//Store dest row10
+ vaddl.u8 q11, d11, d15
+ vst1.8 {d31}, [r2], r5 @//Store dest row11
+ vqrshrun.s16 d30, q10, #1
+ vst1.8 {d28}, [r2], r5 @//Store dest row12
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {d29}, [r2], r5 @//Store dest row13
+ vst1.8 {d30}, [r2], r5 @//Store dest row14
+ vst1.8 {d31}, [r2], r5 @//Store dest row15
+
+ b end_func
+
+
+
+loop_4:
+ vld1.32 d0[0], [r0], r3 @// Load row0 ;src1
+ vld1.32 d4[0], [r1], r4 @// Load row0 ;src2
+ vld1.32 d1[0], [r0], r3 @// Load row1 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.32 d5[0], [r1], r4 @// Load row1 ;src2
+ vld1.32 d2[0], [r0], r3 @// Load row2 ;src1
+ vqrshrun.s16 d28, q10, #1
+ vld1.32 d6[0], [r1], r4 @// Load row2 ;src2
+ vaddl.u8 q11, d1, d5
+ vld1.32 d3[0], [r0], r3 @// Load row3 ;src1
+ vaddl.u8 q12, d2, d6
+ vst1.32 d28[0], [r2], r5 @//Store dest row0
+ vqrshrun.s16 d29, q11, #1
+ vld1.32 d7[0], [r1], r4 @// Load row3 ;src2
+ vqrshrun.s16 d30, q12, #1
+ vst1.32 d29[0], [r2], r5 @//Store dest row1
+ vaddl.u8 q13, d3, d7
+ vst1.32 d30[0], [r2], r5 @//Store dest row2
+ vqrshrun.s16 d31, q13, #1
+ subs r12, r6, #4
+ vst1.32 d31[0], [r2], r5 @//Store dest row3
+ beq end_func @ end function if ht=4
+
+ vld1.32 d12[0], [r1], r4 @// Load row4 ;src2
+ vld1.32 d8[0], [r0], r3 @// Load row4 ;src1
+ vld1.32 d9[0], [r0], r3 @// Load row5 ;src1
+ vaddl.u8 q8, d8, d12
+ vld1.32 d13[0], [r1], r4 @// Load row5 ;src2
+ vld1.32 d10[0], [r0], r3 @// Load row6;src1
+ vaddl.u8 q9, d9, d13
+ vld1.32 d14[0], [r1], r4 @// Load row6 ;src2
+ vqrshrun.s16 d28, q8, #1
+ vld1.32 d11[0], [r0], r3 @// Load row7 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vst1.32 d28[0], [r2], r5 @//Store dest row4
+ vaddl.u8 q10, d10, d14
+ vst1.32 d29[0], [r2], r5 @//Store dest row5
+ vqrshrun.s16 d30, q10, #1
+ vld1.32 d15[0], [r1], r4 @// Load row7 ;src2
+ vaddl.u8 q11, d11, d15
+ vst1.32 d30[0], [r2], r5 @//Store dest row6
+ vqrshrun.s16 d31, q11, #1
+ vst1.32 d31[0], [r2], r5 @//Store dest row7
+
+end_func:
+
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_copy_a9q.s b/common/arm/ih264_inter_pred_luma_copy_a9q.s
new file mode 100755
index 0000000..8ba2fbf
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_copy_a9q.s
@@ -0,0 +1,253 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Interprediction luma function for copy
+@*
+@* @par Description:
+@* Copies the array of width 'wd' and height 'ht' from the location pointed
+@* by 'src' to the location pointed by 'dst'
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_inter_pred_luma_copy (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd )
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r7 => ht
+@ r12 => wd
+
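+@ Equivalent operation in C (illustrative sketch only; the assembly below unrolls
+@ this by 4 rows and picks a 4/8/16 wide inner loop based on wd):
+@
+@ for(row = 0; row < ht; row++)
+@ {
+@     for(col = 0; col < wd; col++)
+@         pu1_dst[col] = pu1_src[col];
+@     pu1_src += src_strd;
+@     pu1_dst += dst_strd;
+@ }
+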
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_copy_a9q
+
+ih264_inter_pred_luma_copy_a9q:
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r12, [sp, #108] @Loads wd
+ ldr r7, [sp, #104] @Loads ht
+ cmp r7, #0 @checks ht == 0
+ ble end_loops
+ tst r12, #15 @checks if wd is a multiple of 16
+ beq core_loop_wd_16
+ tst r12, #7 @checks if wd is a multiple of 8
+ beq core_loop_wd_8
+ sub r11, r12, #4
+
+outer_loop_wd_4:
+ subs r4, r12, #0 @checks wd == 0
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+ vld1.32 {d0[0]}, [r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r5, r0, r2 @pu1_src_tmp += src_strd
+ add r6, r1, r3 @pu1_dst_tmp += dst_strd
+ vst1.32 {d0[0]}, [r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r0, r0, #4 @pu1_src += 4
+ vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ subs r4, r4, #4 @(wd -4)
+ vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r1, r1, #4 @pu1_dst += 4
+ vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs r7, r7, #4 @ht - 4
+ sub r0, r5, r11 @pu1_src = pu1_src_tmp
+ sub r1, r6, r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_4
+
+end_loops:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+
+core_loop_wd_8:
+ sub r11, r12, #8
+
+outer_loop_wd_8:
+ subs r4, r12, #0 @checks wd
+ ble end_inner_loop_wd_8
+
+inner_loop_wd_8:
+ add r5, r0, r2 @pu1_src_tmp += src_strd
+ vld1.8 {d0}, [r0]! @vld1_u8(pu1_src_tmp)
+ add r6, r1, r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {d0}, [r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {d1}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d1}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ subs r4, r4, #8 @wd - 8(Loop condition)
+ vld1.8 {d2}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d2}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {d3}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d3}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_8
+
+end_inner_loop_wd_8:
+ subs r7, r7, #4 @ht -= 4
+ sub r0, r5, r11 @pu1_src = pu1_src_tmp
+ sub r1, r6, r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_8
+
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+core_loop_wd_16:
+ sub r11, r12, #16
+
+outer_loop_wd_16:
+ subs r4, r12, #0 @checks wd
+ ble end_inner_loop_wd_16
+
+inner_loop_wd_16:
+ add r5, r0, r2 @pu1_src_tmp += src_strd
+ vld1.8 {q0}, [r0]! @vld1_u8(pu1_src_tmp)
+ add r6, r1, r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {q0}, [r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {q1}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q1}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ subs r4, r4, #16 @wd - 16(Loop condition)
+ vld1.8 {q2}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q2}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {q3}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q3}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_16
+
+end_inner_loop_wd_16:
+ subs r7, r7, #4 @ht -= 4
+ sub r0, r5, r11 @pu1_src = pu1_src_tmp
+ sub r1, r6, r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_16
+
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+@ /*
+@ ********************************************************************************
+@ *
+@ * @brief This function copies a 4x4 block to destination
+@ *
+@ * @par Description:
+@ * Copies a 4x4 block to destination, where both src and dst are interleaved
+@ *
+@ * @param[in] pi2_src
+@ * Source
+@ *
+@ * @param[in] pu1_out
+@ * Output pointer
+@ *
+@ * @param[in] pred_strd
+@ * Prediction buffer stride
+@ *
+@ * @param[in] out_strd
+@ * Output buffer stride
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ * Currently wd and ht are not used, i.e. a 4x4 block is always copied
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_interleave_copy(WORD16 *pi2_src,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ WORD32 wd,
+@ WORD32 ht)
+@ Register Usage
+@ r0 : pi2_src
+@ r1 : pu1_out
+@ r2 : pred_strd
+@ r3 : out_strd
+@ Neon registers d0-d7, d16-d30 are used
+@ No need for pushing arm and neon registers
+
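+@ Illustrative C sketch of the operation (assumed semantics, derived from the
+@ 0x00ff mask below: the low byte of every halfword of the output rows is
+@ overwritten with the corresponding byte of the source rows, the remaining
+@ bytes are left untouched; strides are treated as byte strides, as in the asm):
+@
+@ UWORD8 *pu1_src = (UWORD8 *)pi2_src;
+@ for(row = 0; row < 4; row++)
+@ {
+@     for(col = 0; col < 4; col++)
+@         pu1_out[2 * col] = pu1_src[2 * col];
+@     pu1_src += pred_strd;
+@     pu1_out += out_strd;
+@ }
+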
+ .global ih264_interleave_copy_a9
+ih264_interleave_copy_a9:
+
+ vld1.u8 d2, [r0], r2 @load src plane 1 => d2 & plane 2 => d3
+ vld1.u8 d3, [r0], r2
+ vld1.u8 d4, [r0], r2
+ vld1.u8 d5, [r0], r2
+
+ mov r0, r1
+
+ vld1.u8 d18, [r1], r3 @load out (8 bit size) - 8 coeffs
+ vld1.u8 d19, [r1], r3
+ vmov.u16 q15, #0x00ff
+ vld1.u8 d20, [r1], r3
+ vld1.u8 d21, [r1], r3
+
+ vbit.u8 q9, q1, q15
+ vbit.u8 q10, q2, q15
+
+ vst1.u8 d18, [r0], r3 @store out
+ vst1.u8 d19, [r0], r3
+ vst1.u8 d20, [r0], r3
+ vst1.u8 d21, [r0], r3
+
+ bx lr
+
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
new file mode 100755
index 0000000..43321a8
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
@@ -0,0 +1,441 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements a two stage cascaded six tap filter. It
+@* applies the six tap filter in the vertical direction on the
+@* predictor values, followed by applying the same filter in the
+@* horizontal direction on the output of the first stage. The six tap
+@* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
+@* interpolation process"
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/2,1/2). The function interpolates
+@* the predictors first in the vertical direction and then in the
+@* horizontal direction to output the (1/2,1/2).
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations: UNUSED in this function.
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/;
+
+@void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r8 => ht
+@ r9 => wd
+
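+@ Illustrative C sketch of the cascade (not the exact reference code from
+@ ih264_inter_pred_filters.c; pi2_tmp is an assumed name for a scratch row of
+@ 16-bit sums indexed so that negative offsets are valid, CLIP_U8() denotes
+@ clipping to 0..255, and indexing is relative to the unadjusted pu1_src):
+@
+@ for(row = 0; row < ht; row++)
+@ {
+@     /* first stage: vertical 6-tap filter kept at full precision */
+@     for(col = -2; col < wd + 3; col++)
+@         pi2_tmp[col] =      pu1_src[col - 2 * src_strd] - 5 * pu1_src[col - src_strd]
+@                      + 20 * pu1_src[col]                + 20 * pu1_src[col + src_strd]
+@                      -  5 * pu1_src[col + 2 * src_strd] +      pu1_src[col + 3 * src_strd];
+@
+@     /* second stage: horizontal 6-tap filter on the intermediate sums */
+@     for(col = 0; col < wd; col++)
+@     {
+@         WORD32 sum =      pi2_tmp[col - 2] - 5 * pi2_tmp[col - 1]
+@                    + 20 * pi2_tmp[col]     + 20 * pi2_tmp[col + 1]
+@                    -  5 * pi2_tmp[col + 2] +      pi2_tmp[col + 3];
+@         pu1_dst[col] = CLIP_U8((sum + 512) >> 10);
+@     }
+@     pu1_src += src_strd;
+@     pu1_dst += dst_strd;
+@ }
+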
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q
+
+ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r8, [sp, #104] @ loads ht
+ sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd
+ sub r0, r0, #2 @pu1_src-2
+ ldr r9, [sp, #108] @ loads wd
+
+ vmov.s16 d0, #20 @ Filter coeff 20
+ vmov.s16 d1, #5 @ Filter coeff 5
+ subs r12, r9, #4 @if wd=4 branch to loop_4
+ beq loop_4
+ subs r12, r9, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+ mov r10, #8
+ sub r7, r3, r10
+ @when wd=16
+
+loop_16:
+ vld1.u32 {d2, d3, d4}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {d5, d6, d7}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {d8, d9, d10}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {d11, d12, d13}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {d14, d15, d16}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {d17, d18, d19}, [r0], r2 @ Vector load from src[5_0]
+
+ @ VERTICAL FILTERING FOR ROW 0
+ vaddl.u8 q10, d8, d11 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q12, d2, d17 @ temp2 = src[0_0] + src[5_0]
+ vaddl.u8 q11, d5, d14 @ temp = src[1_0] + src[4_0]
+ vaddl.u8 q13, d3, d18 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q10, d6, d15 @ temp = src[1_0] + src[4_0]
+ vaddl.u8 q11, d9, d12 @ temp3 = src[2_0] + src[3_0]
+ vaddl.u8 q14, d4, d19 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q13, q11, d0[0] @ temp4 += temp3 * 20
+ vmls.s16 q13, q10, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q11, d10, d13 @ temp3 = src[2_0] + src[3_0]
+ vaddl.u8 q10, d7, d16 @ temp = src[1_0] + src[4_0]
+ vmla.u16 q14, q11, d0[0] @ temp4 += temp3 * 20
+ vmls.s16 q14, q10, d1[0] @ temp -= temp2 * 5
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+
+ @Q12,Q13,Q14 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 0
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q1, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vmlal.s16 q1, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vext.16 q11, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q1, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vext.16 q11, q12, q13, #4 @//extract a[4] (column1)
+ vext.16 q10, q13, q14, #5 @//extract a[5] (column2)
+ vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d22, q1, #10
+ vqrshrun.s32 d23, q15, #10
+ vqshrun.s16 d22, q11, #0
+ vst1.u8 {d22}, [r1], r10 @//Store dest row0, column 1; (1/2,1/2)
+ vext.16 q11, q13, q14, #2 @//extract a[2] (column2)
+ vaddl.s16 q1, d20, d26 @// a0 + a5 (column2)
+ vaddl.s16 q15, d21, d27 @// a0 + a5 (column2)
+ vmlal.s16 q1, d22, d0[0] @// a0 + a5 + 20a2 (column2)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column2)
+ vext.16 q10, q13, q14, #3 @//extract a[3] (column2)
+ vext.16 q11, q13, q14, #1 @//extract a[1] (column2)
+ vmlal.s16 q1, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vext.16 q10, q13, q14, #4 @//extract a[4] (column2)
+ vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q1, d20, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vmlsl.s16 q15, d21, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vqrshrun.s32 d20, q1, #10
+ vqrshrun.s32 d21, q15, #10
+ vld1.u32 {d2, d3, d4}, [r0], r2 @ Vector load from src[6_0]
+ vqshrun.s16 d22, q10, #0
+ vst1.u8 {d22}, [r1], r7 @//Store dest row0 ,column 2; (1/2,1/2)
+
+ @ VERTICAL FILTERING FOR ROW 1
+ vaddl.u8 q10, d11, d14 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q12, d5, d2 @ temp2 = src[0_0] + src[5_0]
+ vaddl.u8 q11, d8, d17 @ temp = src[1_0] + src[4_0]
+ vaddl.u8 q13, d6, d3 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vaddl.u8 q10, d9, d18 @ temp = src[1_0] + src[4_0]
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q11, d12, d15 @ temp3 = src[2_0] + src[3_0]
+ vaddl.u8 q14, d7, d4 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q13, q11, d0[0] @ temp4 += temp3 * 20
+ vaddl.u8 q11, d13, d16 @ temp3 = src[2_0] + src[3_0]
+ vmls.s16 q13, q10, d1[0] @ temp -= temp2 * 5
+ vmla.u16 q14, q11, d0[0] @ temp4 += temp3 * 20
+ vaddl.u8 q10, d10, d19 @ temp = src[1_0] + src[4_0]
+ vmls.s16 q14, q10, d1[0] @ temp -= temp2 * 5
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+
+ @Q12,Q13,Q14 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 1
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q3, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vmlal.s16 q3, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vext.16 q11, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q3, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vext.16 q11, q12, q13, #4 @//extract a[4] (column1)
+ vext.16 q10, q13, q14, #5 @//extract a[5] (column2)
+ vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d22, q3, #10
+ vqrshrun.s32 d23, q15, #10
+ vqshrun.s16 d22, q11, #0
+ vst1.u8 {d22}, [r1], r10 @//Store dest row1, column 1; (1/2,1/2)
+ vext.16 q11, q13, q14, #2 @//extract a[2] (column2)
+ vaddl.s16 q3, d20, d26 @// a0 + a5 (column2)
+ vaddl.s16 q15, d21, d27 @// a0 + a5 (column2)
+ vmlal.s16 q3, d22, d0[0] @// a0 + a5 + 20a2 (column2)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column2)
+ vext.16 q10, q13, q14, #3 @//extract a[3] (column2)
+ vext.16 q11, q13, q14, #1 @//extract a[1] (column2)
+ vmlal.s16 q3, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vext.16 q10, q13, q14, #4 @//extract a[4] (column2)
+ vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q3, d20, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vmlsl.s16 q15, d21, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vqrshrun.s32 d20, q3, #10
+ vqrshrun.s32 d21, q15, #10
+ vqshrun.s16 d22, q10, #0
+ vst1.u8 {d22}, [r1], r7 @//Store dest row1 ,column 2; (1/2,1/2)
+
+ subs r8, r8, #2 @ 2 rows processed, decrement by 2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_16 @ looping if height = 8 or 16
+
+loop_8:
+ vld1.u32 {d2, d3}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {d4, d5}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {d6, d7}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {d8, d9}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {d10, d11}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {d12, d13}, [r0], r2 @ Vector load from src[5_0]
+
+ @ VERTICAL FILTERING FOR ROW 0
+ vaddl.u8 q10, d6, d8 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d4, d10 @ temp2 = src[1_0] + src[4_0]
+ vaddl.u8 q12, d2, d12 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d3, d13 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q14, d7, d9 @ temp1 = src[2_0] + src[3_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d5, d11 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 0
+
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vext.16 q1, q12, q13, #4 @//extract a[4] (column1)
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d2, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vld1.u32 {d14, d15}, [r0], r2 @ Vector load from src[6_0]
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d3, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+
+ vaddl.u8 q12, d4, d14 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d5, d15 @ temp = src[0_0] + src[5_0]
+ vqrshrun.s32 d18, q14, #10
+ vaddl.u8 q14, d9, d11 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q10, d8, d10 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d6, d12 @ temp2 = src[1_0] + src[4_0]
+ vqrshrun.s32 d19, q15, #10
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d7, d13 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+ vqshrun.s16 d2, q9, #0
+ @ VERTICAL FILTERING FOR ROW 1
+
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 1
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vst1.u8 {d2}, [r1], r3 @//Store dest row0, column 1; (1/2,1/2)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vext.16 q2, q12, q13, #4 @//extract a[4] (column1)
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d4, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d5, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d18, q14, #10
+ vqrshrun.s32 d19, q15, #10
+ vqshrun.s16 d3, q9, #0
+ vst1.u8 {d3}, [r1], r3 @//Store dest row1, column 1; (1/2,1/2)
+
+ subs r8, r8, #2 @ 2 rows processed, decrement by 2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_8 @looping if height == 8 or 16
+
+loop_4:
+ vld1.u32 {d2, d3}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {d4, d5}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {d6, d7}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {d8, d9}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {d10, d11}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {d12, d13}, [r0], r2 @ Vector load from src[5_0]
+
+ @ VERTICAL FILTERING FOR ROW 0
+ vaddl.u8 q10, d6, d8 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d4, d10 @ temp2 = src[1_0] + src[4_0]
+ vaddl.u8 q12, d2, d12 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d3, d13 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q14, d7, d9 @ temp1 = src[2_0] + src[3_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d5, d11 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 0
+
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+
+ vext.16 q1, q12, q13, #4 @//extract a[4] (column1)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d2, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vld1.u32 {d14, d15}, [r0], r2 @ Vector load from src[6_0]
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d3, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vaddl.u8 q12, d4, d14 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d5, d15 @ temp = src[0_0] + src[5_0]
+ vqrshrun.s32 d18, q14, #10
+ vaddl.u8 q14, d9, d11 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d6, d12 @ temp2 = src[1_0] + src[4_0]
+ vaddl.u8 q10, d8, d10 @ temp1 = src[2_0] + src[3_0]
+ vqrshrun.s32 d19, q15, #10
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d7, d13 @ temp2 = src[1_0] + src[4_0]
+ vqshrun.s16 d2, q9, #0
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+
+ @ VERTICAL FILTERING FOR ROW 1
+
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 1
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vst1.u32 {d2[0]}, [r1], r3 @//Store dest row0, column 1; (1/2,1/2)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vext.16 q2, q12, q13, #4 @//extract a[4] (column1)
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d4, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d5, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d18, q14, #10
+ vqrshrun.s32 d19, q15, #10
+ vqshrun.s16 d4, q9, #0
+ vst1.u32 {d4[0]}, [r1], r3 @//Store dest row1, column 1; (1/2,1/2)
+
+ subs r8, r8, #2 @ 2 rows processed, decrement by 2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_4 @looping if height == 8 or 16
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
new file mode 100755
index 0000000..65a6de7
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
@@ -0,0 +1,1044 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements a two stage cascaded six tap filter. It
+@* applies the six tap filter in the horizontal direction on the
+@* predictor values, followed by applying the same filter in the
+@* vertical direction on the output of the first stage. It then averages
+@* the output of the 1st stage and the output of the 2nd stage to obtain
+@* the quarter pel values. The six tap filtering operation is described
+@* in sec 8.4.2.2.1 titled "Luma sample interpolation process".
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/2,1/4) or (1/2,3/4). The function interpolates
+@* the predictors first in the horizontal direction and then in the
+@* vertical direction to output the (1/2,1/2). It then averages
+@* the output of the 2nd stage and (1/2,1/2) value to obtain (1/2,1/4)
+@* or (1/2,3/4) depending on the offset.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/;
+
+@void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ht
+@ r5 => wd
+@ r7 => dydx
+@ r9 => *pu1_tmp
+
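+@ Illustrative C sketch of the final stage (assumed names: pi2_tmp / tmp_strd stand
+@ for the 16-bit horizontal half-pel sums produced by the first stage and kept in
+@ pu1_tmp, y_off is derived from dydx, and CLIP_U8() denotes clipping to 0..255):
+@
+@ WORD32 y_off = (dydx >> 3) & 1;      /* 0 -> (1/2,1/4), 1 -> (1/2,3/4) */
+@ for(row = 0; row < ht; row++)
+@ {
+@     for(col = 0; col < wd; col++)
+@     {
+@         WORD32 sum =      pi2_tmp[col - 2 * tmp_strd] - 5 * pi2_tmp[col - tmp_strd]
+@                    + 20 * pi2_tmp[col]                + 20 * pi2_tmp[col + tmp_strd]
+@                    -  5 * pi2_tmp[col + 2 * tmp_strd] +      pi2_tmp[col + 3 * tmp_strd];
+@         UWORD8 half_xy = CLIP_U8((sum + 512) >> 10);                            /* (1/2,1/2)   */
+@         UWORD8 half_x  = CLIP_U8((pi2_tmp[col + y_off * tmp_strd] + 16) >> 5);  /* (1/2,y_off) */
+@         pu1_dst[col] = (half_xy + half_x + 1) >> 1;                             /* rounded avg */
+@     }
+@     pi2_tmp += tmp_strd;
+@     pu1_dst += dst_strd;
+@ }
+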
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q
+
+ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @ store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104] @ loads ht
+ sub r0, r0, r2, lsl #1 @ pu1_src-2*src_strd
+ sub r0, r0, #2 @ pu1_src-2
+ ldr r5, [sp, #108] @ loads wd
+ ldr r7, [sp, #116] @ loads dydx
+ lsr r7, r7, #3 @ dydx >> 3 to obtain the deciding bit (selects the half-pel row used for averaging)
+ ldr r9, [sp, #112] @ pu1_tmp
+ add r7, r7, #2
+ mov r6, #48
+ mla r7, r7, r6, r9
+
+ subs r12, r5, #4 @if wd=4 branch to loop_4
+ beq loop_4_start
+
+ subs r12, r5, #8 @if wd=8 branch to loop_8
+ beq loop_8_start
+
+ @when wd=16
+ vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11
+ vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12
+ add r8, r0, #8
+ add r14, r1, #8
+ add r10, r9, #8
+ mov r12, r4
+ add r11, r7, #8
+
+loop_16_lowhalf_start:
+ vld1.32 {q0}, [r0], r2 @ row -2 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q3, d0, d5
+
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q3, q4, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q4, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row -1 load for horizontal filter
+ vmls.u16 q3, q4, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q4, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q5, d2, d3
+
+ vst1.32 {q3}, [r9], r6 @ store temp buffer 0
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q4, q5, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q5, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 0 load for horizontal filter
+ vmls.u16 q4, q5, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q5, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q6, d2, d3
+
+ vst1.32 {q4}, [r9], r6 @ store temp buffer 1
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q5, q6, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q6, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 1 load for horizontal filter
+ vmls.u16 q5, q6, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q6, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q7, d2, d3
+
+ vst1.32 {q5}, [r9], r6 @ store temp buffer 2
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q6, q7, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q7, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 2 load for horizontal filter
+ vmls.u16 q6, q7, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q7, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d2, d3
+
+ vst1.32 {q6}, [r9], r6 @ store temp buffer 3
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q7, q8, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q8, d1, d4
+
+ vmls.u16 q7, q8, q12
+loop_16_lowhalf:
+
+ vld1.32 {q0}, [r0], r2 @ row 3 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d0, d5
+
+ vst1.32 {q7}, [r9], r6 @ store temp buffer 4
+ vaddl.u8 q9, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q8, q9, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q14, q4, q7
+ vaddl.u8 q9, d1, d4
+ vadd.s16 q15, q5, q6
+ vmls.u16 q8, q9, q12
+ vld1.32 {q0}, [r0], r2 @ row 4 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q10, d0, d5
+
+ vst1.32 {q8}, [r9], r6 @ store temp buffer r5
+
+ vaddl.s16 q9, d6, d16
+
+ vld1.32 {q13}, [r7], r6 @ load from temp buffer 0
+
+ vaddl.s16 q3, d7, d17
+
+ vqrshrun.s16 d26, q13, #5
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q10, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q14, q5, q8
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q6, q7
+ vmls.u16 q10, q1, q12
+ vqmovn.u16 d18, q9
+ vld1.32 {q0}, [r0], r2 @ row 5 load for horizontal filter
+
+ vrhadd.u8 d26, d18, d26
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+
+ vst1.32 {q10}, [r9], r6 @ store temp buffer r6
+
+ vaddl.s16 q9, d8, d20
+
+ vaddl.s16 q3, d9, d21
+
+ vld1.32 {q4}, [r7], r6 @load from temp buffer 1
+
+
+ vst1.32 d26, [r1], r3 @ store row 0
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+
+ vqrshrun.s16 d28, q4, #5
+
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d0, d5
+ vaddl.u8 q1, d2, d3
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d4, d0, d1, #4
+ vqrshrun.s32 d19, q3, #10
+ vmla.u16 q4, q1, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q13, q6, q10
+ vaddl.u8 q1, d1, d4
+ vqmovn.u16 d18, q9
+ vadd.s16 q15, q7, q8
+ vmls.u16 q4, q1, q12
+ vld1.32 {q0}, [r0], r2 @ row 6 load for horizontal filter
+
+ vrhadd.u8 d28, d28, d18
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+
+ vst1.32 d28, [r1], r3 @ store row 1
+
+ vaddl.u8 q14, d0, d5
+
+ vst1.32 {q4}, [r9], r6 @ store temp buffer r7
+
+ vaddl.s16 q9, d10, d8
+ vaddl.s16 q3, d11, d9
+
+ vld1.32 {q5}, [r7], r6 @ load from temp buffer 2
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d26, d24
+ vmlal.s16 q3, d31, d22
+
+ vqrshrun.s16 d26, q5, #5
+
+ vmlsl.s16 q3, d27, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q14, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q5, q7, q4
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q8, q10
+ vmls.u16 q14, q1, q12
+ vqmovn.u16 d27, q9
+
+ vaddl.s16 q9, d12, d28
+ vaddl.s16 q3, d13, d29
+
+ vrhadd.u8 d26, d26, d27
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d10, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d11, d24
+
+ vst1.32 d26, [r1], r3 @ store row 2
+
+ vst1.32 {q14}, [r9]
+
+
+ vqrshrun.s32 d18, q9, #10
+ vmov q5, q10
+ vld1.32 {q15}, [r7], r6 @ load from temp buffer 3
+
+ vqrshrun.s32 d19, q3, #10
+ subs r4, r4, #4
+
+ vqrshrun.s16 d30, q15, #5
+
+ vqmovn.u16 d18, q9
+ vmov q6, q4
+ vmov q3, q7
+ vrhadd.u8 d30, d18, d30
+ vmov q4, q8
+ vmov q7, q14
+ vst1.32 d30, [r1], r3 @ store row 3
+
+ bgt loop_16_lowhalf @ loop until all rows of the lower half are processed
+
+
+loop_16_highhalf_start:
+ vld1.32 {q0}, [r8], r2
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q3, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q3, q4, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q4, d1, d4
+ vld1.32 {q0}, [r8], r2
+ vmls.u16 q3, q4, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q4, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q5, d2, d3
+
+ vst1.32 {q3}, [r10], r6
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q4, q5, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q5, d1, d4
+ vld1.32 {q0}, [r8], r2
+ vmls.u16 q4, q5, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q5, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q6, d2, d3
+
+ vst1.32 {q4}, [r10], r6
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q5, q6, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q6, d1, d4
+ vld1.32 {q0}, [r8], r2
+ vmls.u16 q5, q6, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q6, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q7, d2, d3
+
+ vst1.32 {q5}, [r10], r6
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q6, q7, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q7, d1, d4
+ vld1.32 {q0}, [r8], r2
+ vmls.u16 q6, q7, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q7, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d2, d3
+
+ vst1.32 {q6}, [r10], r6
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q7, q8, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q8, d1, d4
+
+ vmls.u16 q7, q8, q12
+
+loop_16_highhalf:
+
+ vld1.32 {q0}, [r8], r2
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d0, d5
+
+ vst1.32 {q7}, [r10], r6
+
+ vaddl.u8 q9, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q8, q9, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q14, q4, q7
+ vaddl.u8 q9, d1, d4
+ vadd.s16 q15, q5, q6
+ vmls.u16 q8, q9, q12
+ vld1.32 {q0}, [r8], r2
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q10, d0, d5
+
+ vst1.32 {q8}, [r10], r6
+
+ vaddl.s16 q9, d6, d16
+
+ vld1.32 {q13}, [r11], r6
+
+ vaddl.s16 q3, d7, d17
+
+ vqrshrun.s16 d26, q13, #5
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q10, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q14, q5, q8
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q6, q7
+ vmls.u16 q10, q1, q12
+ vqmovn.u16 d18, q9
+ vld1.32 {q0}, [r8], r2
+
+ vrhadd.u8 d26, d18, d26
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+
+ vst1.32 {q10}, [r10], r6
+
+ vaddl.s16 q9, d8, d20
+ vaddl.s16 q3, d9, d21
+
+ vld1.32 {q4}, [r11], r6
+
+
+ vst1.32 d26, [r14], r3 @store row 0
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+
+ vqrshrun.s16 d28, q4, #5
+
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d0, d5
+ vaddl.u8 q1, d2, d3
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d4, d0, d1, #4
+ vqrshrun.s32 d19, q3, #10
+ vmla.u16 q4, q1, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q13, q6, q10
+ vaddl.u8 q1, d1, d4
+ vqmovn.u16 d18, q9
+ vadd.s16 q15, q7, q8
+ vmls.u16 q4, q1, q12
+ vld1.32 {q0}, [r8], r2
+
+ vrhadd.u8 d28, d28, d18
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+
+ vst1.32 d28, [r14], r3 @store row 1
+
+ vaddl.u8 q14, d0, d5
+
+ vst1.32 {q4}, [r10], r6
+
+ vaddl.s16 q9, d10, d8
+ vaddl.s16 q3, d11, d9
+
+ vld1.32 {q5}, [r11], r6
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d26, d24
+ vmlal.s16 q3, d31, d22
+
+ vqrshrun.s16 d26, q5, #5
+
+ vmlsl.s16 q3, d27, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q14, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q5, q7, q4
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q8, q10
+ vmls.u16 q14, q1, q12
+ vqmovn.u16 d27, q9
+
+
+ vaddl.s16 q9, d12, d28
+ vaddl.s16 q3, d13, d29
+
+ vrhadd.u8 d26, d26, d27
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d10, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d11, d24
+
+ vst1.32 d26, [r14], r3 @ store row 2
+
+ vst1.32 {q14}, [r10]
+
+ vqrshrun.s32 d18, q9, #10
+ vmov q5, q10
+ vld1.32 {q15}, [r11], r6
+
+ vqrshrun.s32 d19, q3, #10
+ subs r12, r12, #4
+
+ vqrshrun.s16 d30, q15, #5
+
+ vqmovn.u16 d18, q9
+ vmov q6, q4
+ vmov q3, q7
+ vrhadd.u8 d30, d18, d30
+ vmov q4, q8
+ vmov q7, q14
+ vst1.32 d30, [r14], r3 @ store row 3
+
+ bgt loop_16_highhalf @ looping if height = 8 or 16
+ b end_func
+
+loop_8_start:
+
+ vmov.u16 q11, #20 @ Filter coeff 20 into Q11
+ vmov.u16 q12, #5 @ Filter coeff 5 into Q12
+ vld1.32 {q0}, [r0], r2 @ row -2 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q3, d0, d5
+
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q3, q4, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q4, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row -1 load for horizontal filter
+ vmls.u16 q3, q4, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q4, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q5, d2, d3
+
+ vst1.32 {q3}, [r9], r6 @ store temp buffer 0
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q4, q5, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q5, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 0 load for horizontal filter
+ vmls.u16 q4, q5, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q5, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q6, d2, d3
+
+ vst1.32 {q4}, [r9], r6 @ store temp buffer 1
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q5, q6, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q6, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 1 load for horizontal filter
+ vmls.u16 q5, q6, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q6, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q7, d2, d3
+
+ vst1.32 {q5}, [r9], r6 @ store temp buffer 2
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q6, q7, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q7, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 2 load for horizontal filter
+ vmls.u16 q6, q7, q12
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q7, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d2, d3
+
+ vst1.32 {q6}, [r9], r6 @ store temp buffer 3
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q7, q8, q11
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q8, d1, d4
+
+ vmls.u16 q7, q8, q12
+loop_8:
+
+ vld1.32 {q0}, [r0], r2 @ row 3 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d0, d5
+
+ vst1.32 {q7}, [r9], r6 @ store temp buffer 4
+
+ vaddl.u8 q9, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q8, q9, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q14, q4, q7
+ vaddl.u8 q9, d1, d4
+ vadd.s16 q15, q5, q6
+ vmls.u16 q8, q9, q12
+ vld1.32 {q0}, [r0], r2 @ row 4 load for horizontal filter
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q10, d0, d5
+
+ vst1.32 {q8}, [r9], r6 @ store temp buffer r5
+
+ vaddl.s16 q9, d6, d16
+
+ vld1.32 {q13}, [r7], r6 @ load from temp buffer 0
+
+ vaddl.s16 q3, d7, d17
+
+ vqrshrun.s16 d26, q13, #5
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q10, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q14, q5, q8
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q6, q7
+ vmls.u16 q10, q1, q12
+ vqmovn.u16 d18, q9
+ vld1.32 {q0}, [r0], r2 @ row 5 load for horizontal filter
+
+ vrhadd.u8 d26, d18, d26
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+
+ vst1.32 {q10}, [r9], r6 @ store temp buffer r6
+
+ vaddl.s16 q9, d8, d20
+
+ vaddl.s16 q3, d9, d21
+
+ vld1.32 {q4}, [r7], r6 @load from temp buffer 1
+
+
+ vst1.32 d26, [r1], r3 @ store row 0
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d28, d24
+
+ vqrshrun.s16 d28, q4, #5
+
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d29, d24
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d0, d5
+ vaddl.u8 q1, d2, d3
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d4, d0, d1, #4
+ vqrshrun.s32 d19, q3, #10
+ vmla.u16 q4, q1, q11
+ vext.8 d1, d0, d1, #1
+ vadd.s16 q13, q6, q10
+ vaddl.u8 q1, d1, d4
+ vqmovn.u16 d18, q9
+ vadd.s16 q15, q7, q8
+ vmls.u16 q4, q1, q12
+ vld1.32 {q0}, [r0], r2 @ row 6 load for horizontal filter
+
+ vrhadd.u8 d28, d28, d18
+
+ vext.8 d5, d0, d1, #5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+
+ vst1.32 d28, [r1], r3 @ store row 1
+
+ vaddl.u8 q14, d0, d5
+
+ vst1.32 {q4}, [r9], r6 @ store temp buffer r7
+
+ vaddl.s16 q9, d10, d8
+ vaddl.s16 q3, d11, d9
+
+ vld1.32 {q5}, [r7], r6 @ load from temp buffer 2
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d26, d24
+ vmlal.s16 q3, d31, d22
+
+ vqrshrun.s16 d26, q5, #5
+
+ vmlsl.s16 q3, d27, d24
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 q14, q1, q11
+ vqrshrun.s32 d18, q9, #10
+ vext.8 d1, d0, d1, #1
+ vqrshrun.s32 d19, q3, #10
+ vadd.s16 q5, q7, q4
+ vaddl.u8 q1, d1, d4
+ vadd.s16 q15, q8, q10
+ vmls.u16 q14, q1, q12
+ vqmovn.u16 d27, q9
+
+ vaddl.s16 q9, d12, d28
+ vaddl.s16 q3, d13, d29
+
+ vrhadd.u8 d26, d26, d27
+
+ vmlal.s16 q9, d30, d22
+ vmlsl.s16 q9, d10, d24
+ vmlal.s16 q3, d31, d22
+ vmlsl.s16 q3, d11, d24
+
+ vst1.32 d26, [r1], r3 @ store row 2
+
+ vst1.32 {q14}, [r9]
+
+
+ vqrshrun.s32 d18, q9, #10
+ vmov q5, q10
+ vld1.32 {q15}, [r7], r6 @ load from temp buffer 3
+
+ vqrshrun.s32 d19, q3, #10
+ subs r4, r4, #4
+
+ vqrshrun.s16 d30, q15, #5
+
+ vqmovn.u16 d18, q9
+ vmov q6, q4
+ vmov q3, q7
+ vrhadd.u8 d30, d18, d30
+ vmov q4, q8
+ vmov q7, q14
+ vst1.32 d30, [r1], r3 @ store row 3
+
+ bgt loop_8 @ loop while rows remain (ht = 8 or 16)
+ b end_func
+
+loop_4_start:
+ vmov.u16 d22, #20 @ Filter coeff 20 into D22
+ vmov.u16 d23, #5 @ Filter coeff 5 into D23
+
+ vld1.32 {q0}, [r0], r2 @row -2 load
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q3, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q4, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d6, d8, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q4, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row -1 load
+ vmls.u16 d6, d8, d23
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q4, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q5, d2, d3
+
+ vst1.32 d6, [r9], r6 @ store temp buffer 0
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d8, d10, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q5, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 0 load
+ vmls.u16 d8, d10, d23
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q5, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q6, d2, d3
+
+ vst1.32 d8, [r9], r6 @ store temp buffer 1
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d10, d12, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q6, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 1 load
+ vmls.u16 d10, d12, d23
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q6, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q7, d2, d3
+
+ vst1.32 d10, [r9], r6 @ store temp buffer 2
+
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d12, d14, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q7, d1, d4
+ vld1.32 {q0}, [r0], r2 @ row 2 load
+ vmls.u16 d12, d14, d23
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q7, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q8, d2, d3
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d14, d16, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q8, d1, d4
+
+ vst1.32 d12, [r9], r6 @ store temp buffer 3
+
+ vmls.u16 d14, d16, d23
+
+loop_4:
+
+ vld1.32 {q0}, [r0], r2 @ row 3 load
+ vext.8 d5, d0, d1, #5
+ vaddl.u8 q8, d0, d5
+ vext.8 d2, d0, d1, #2
+ vext.8 d3, d0, d1, #3
+ vaddl.u8 q9, d2, d3
+ vst1.32 d14, [r9], r6 @ store temp buffer 4
+ vext.8 d4, d0, d1, #4
+ vmla.u16 d16, d18, d22
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q9, d1, d4
+ vadd.s16 d2, d10, d12
+ vmls.u16 d16, d18, d23
+ vadd.s16 d3, d8, d14
+ vld1.32 {q9}, [r0], r2 @ row 4 load
+ vext.8 d25, d18, d19, #5
+ vaddl.u8 q13, d18, d25
+ vext.8 d20, d18, d19, #2
+
+ vst1.32 d16, [r9], r6 @ store temp buffer 5
+
+ vaddl.s16 q0, d6, d16
+ vmlal.s16 q0, d2, d22
+ vext.8 d21, d18, d19, #3
+ vaddl.u8 q14, d20, d21
+ vext.8 d24, d18, d19, #4
+ vmlsl.s16 q0, d3, d23
+ vmla.u16 d26, d28, d22
+ vext.8 d19, d18, d19, #1
+ vaddl.u8 q14, d19, d24
+ vadd.s16 d2, d12, d14
+ vmls.u16 d26, d28, d23
+ vqrshrun.s32 d0, q0, #0xa
+ vadd.s16 d3, d10, d16
+ vld1.32 {q9}, [r0], r2 @ row 5 load
+ vext.8 d25, d18, d19, #5
+ vqmovn.u16 d11, q0
+ vaddl.u8 q14, d18, d25
+
+ vst1.32 d26, [r9], r6 @ store temp buffer 6
+
+ @Q3 available here
+ vld1.32 d6, [r7], r6 @ load from temp buffer 0
+ vld1.32 d7, [r7], r6 @ load from temp buffer 1
+ vqrshrun.s16 d9, q3, #5
+
+ vext.8 d20, d18, d19, #2
+
+ vaddl.s16 q0, d8, d26
+ vmlal.s16 q0, d2, d22
+ vext.8 d21, d18, d19, #3
+ vaddl.u8 q3, d20, d21
+ vext.8 d24, d18, d19, #4
+ vmlsl.s16 q0, d3, d23
+ vmla.u16 d28, d6, d22
+ vext.8 d19, d18, d19, #1
+ vaddl.u8 q3, d19, d24
+ vadd.s16 d2, d14, d16
+ vmls.u16 d28, d6, d23
+ vqrshrun.s32 d0, q0, #0xa
+ vadd.s16 d3, d12, d26
+ vld1.32 {q9}, [r0], r2 @ row 6 load
+ vext.8 d25, d18, d19, #5
+ vqmovn.u16 d13, q0
+
+ vtrn.32 d11, d13
+ vaddl.s16 q0, d10, d28
+ vrhadd.u8 d9, d9, d11
+
+ vst1.32 d28, [r9], r6 @ store temp buffer 7
+
+ vmlal.s16 q0, d2, d22
+ vaddl.u8 q15, d18, d25
+
+ vst1.32 d9[0], [r1], r3 @ store row 0
+
+ vext.8 d20, d18, d19, #2
+
+ vst1.32 d9[1], [r1], r3 @ store row 1
+
+ vext.8 d21, d18, d19, #3
+ vmlsl.s16 q0, d3, d23
+ vaddl.u8 q4, d20, d21
+ vext.8 d24, d18, d19, #4
+ vmla.u16 d30, d8, d22
+ vext.8 d19, d18, d19, #1
+ vaddl.u8 q4, d19, d24
+ vqrshrun.s32 d0, q0, #0xa
+ vadd.s16 d2, d16, d26
+ vmls.u16 d30, d8, d23
+ vqmovn.u16 d4, q0
+
+ vadd.s16 d3, d14, d28
+
+
+ vaddl.s16 q0, d12, d30
+
+ vst1.32 d30, [r9]
+
+ vmlal.s16 q0, d2, d22
+
+ vld1.32 d8, [r7], r6 @ load from temp buffer 2
+ vld1.32 d9, [r7], r6 @ load from temp buffer 3
+ vmlsl.s16 q0, d3, d23
+ subs r4, r4, #4
+ vqrshrun.s16 d10, q4, #5
+
+ vmov d12, d28
+
+ vqrshrun.s32 d0, q0, #0xa
+ vmov d6, d14
+ vmov d8, d16
+
+ vqmovn.u16 d5, q0
+
+ vtrn.32 d4, d5
+ vrhadd.u8 d4, d4, d10
+ vmov d10, d26
+ vmov d14, d30
+
+ vst1.32 d4[0], [r1], r3 @ store row 2
+ vst1.32 d4[1], [r1], r3 @ store row 3
+
+ bgt loop_4
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
new file mode 100755
index 0000000..c39ae01
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
@@ -0,0 +1,266 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction horizontal quarter pel interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Quarter pel inter prediction luma filter for horizontal input
+@*
+@* @par Description:
+@* Applies a 6 tap horizontal filter. The output is clipped to 8 bits as
+@* described in sec 8.4.2.2.1 titled "Luma sample interpolation process".
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@ @param[in] pu1_tmp: temporary buffer: UNUSED in this function
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations.
+@* @returns
+@*
+@ @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_horz (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+@ r7 => dydx
+
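+@ Illustrative per-pixel C sketch of the computation described above (a
+@ simplified outline, not the reference code in ih264_inter_pred_filters.c;
+@ names are made up and <stdint.h> is assumed). The NEON code below produces
+@ the same values 4/8/16 pixels at a time without any temporary buffer.
+@
+@ static int clip_u8(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }
+@
+@ void horz_qpel_sketch(const uint8_t *src, uint8_t *dst, int src_strd,
+@                       int dst_strd, int ht, int wd, uint32_t dydx)
+@ {
+@     int x_bit = (dydx & 3) >> 1;      /* 0 -> (1/4,0), 1 -> (3/4,0) */
+@     for (int y = 0; y < ht; y++)
+@         for (int x = 0; x < wd; x++)  /* caller guarantees margins around the block */
+@         {
+@             const uint8_t *p = src + y * src_strd + x;
+@             int sum  = p[-2] - 5 * p[-1] + 20 * p[0]
+@                      + 20 * p[1] - 5 * p[2] + p[3];
+@             int half = clip_u8((sum + 16) >> 5);   /* (1/2,0) half-pel */
+@             /* average with the nearest full-pel sample selected by the x offset */
+@             dst[y * dst_strd + x] = (half + p[x_bit] + 1) >> 1;
+@         }
+@ }
+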
+.text
+.p2align 2
+
+
+ .global ih264_inter_pred_luma_horz_qpel_a9q
+
+ih264_inter_pred_luma_horz_qpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r5, [sp, #104] @Loads ht
+ ldr r6, [sp, #108] @Loads wd
+ ldr r7, [sp, #116] @Loads dydx
+ and r7, r7, #3 @Finds x-offset
+ add r7, r0, r7, lsr #1 @pu1_src + (x_offset>>1)
+ sub r0, r0, #2 @pu1_src-2
+ vmov.i8 d0, #5 @filter coeff
+ subs r12, r6, #8 @if wd=8 branch to loop_8
+ vmov.i8 d1, #20 @filter coeff
+
+ beq loop_8
+
+ subs r12, r6, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_16: @when wd=16
+ @// Processing row0 and row1
+ vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1
+ vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0)
+ vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0)
+ vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1)
+ vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vext.8 d27, d6, d7, #2 @//extract a[2] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1)
+ vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0)
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1)
+ vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vext.8 d27, d6, d7, #1 @//extract a[1] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
+ vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+ vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
+ vld1.32 {d12, d13}, [r7], r2 @Load value for interpolation (column1,row0)
+ vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row2)
+ vrhadd.u8 q10, q6, q10 @Interpolation step for qpel calculation
+ vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vst1.8 {d20, d21}, [r1], r3 @//Store dest row0
+ vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row2)
+ vqrshrun.s16 d19, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
+ vld1.32 {d12, d13}, [r7], r2 @Load value for interpolation (column1,row1)
+ vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation
+ vst1.8 {d18, d19}, [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+
+ beq end_func
+ b loop_16
+
+loop_8:
+@// Processing row0 and row1
+
+ vld1.8 {d5, d6}, [r0], r2 @// Load row1
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vld1.8 {d2, d3}, [r0], r2 @// Load row0
+ vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1)
+ vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1)
+ vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0)
+ vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vld1.32 d12, [r7], r2 @Load value for interpolation (column1,row0)
+ vld1.32 d13, [r7], r2 @Load value for interpolation (column1,row1)
+ vqrshrun.s16 d19, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation
+ vst1.8 {d18}, [r1], r3 @//Store dest row0
+ vst1.8 {d19}, [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+
+ beq end_func @ Branch if height==4
+ b loop_8 @looping if height == 8 or 16
+
+loop_4:
+ vld1.8 {d5, d6}, [r0], r2 @// Load row1
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vld1.8 {d2, d3}, [r0], r2 @// Load row0
+ vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1)
+ vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1)
+ vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1)
+ vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0)
+ vld1.32 d12, [r7], r2 @Load value for interpolation (column1,row0)
+ vld1.32 d13, [r7], r2 @Load value for interpolation (column1,row1)
+ vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0)
+ vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vqrshrun.s16 d19, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation
+ vst1.32 d18[0], [r1], r3 @//Store dest row0
+ vst1.32 d19[0], [r1], r3 @//Store dest row1
+
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+ beq end_func
+
+ b loop_4
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
new file mode 100755
index 0000000..565cc80
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
@@ -0,0 +1,505 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements a two stage cascaded six tap filter. It
+@* applies the six tap filter in the vertical direction on the
+@* predictor values, followed by applying the same filter in the
+@* horizontal direction on the output of the first stage. It then averages
+@* the output of the 1st stage and the final stage to obtain the quarter
+@* pel values.The six tap filtering operation is described in sec 8.4.2.2.1
+@* titled "Luma sample interpolation process".
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/4,1/2) or (3/4,1/2). The function interpolates
+@* the predictors first in the vertical direction and then in the
+@* horizontal direction to output the (1/2,1/2) value. It then averages
+@* the rounded output of the 1st (vertical) stage and the (1/2,1/2) value
+@* to obtain (1/4,1/2) or (3/4,1/2) depending on the offset.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ht
+@ r5 => wd
+@ r6 => dydx
+@ r9 => *pu1_tmp
+
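+@ A simplified per-pixel C sketch of the computation described above, for
+@ reference only (not the reference code in ih264_inter_pred_filters.c; names
+@ are made up, <stdint.h> assumed). The NEON code below keeps the stage-1
+@ vertical results as 16-bit intermediates in pu1_tmp.
+@
+@ static int clip_u8(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }
+@
+@ void horz_qpel_vert_hpel_sketch(const uint8_t *src, uint8_t *dst,
+@                                 int src_strd, int dst_strd,
+@                                 int ht, int wd, uint32_t dydx)
+@ {
+@     int x_bit = (dydx & 3) >> 1;      /* 0 -> (1/4,1/2), 1 -> (3/4,1/2) */
+@     for (int y = 0; y < ht; y++)
+@         for (int x = 0; x < wd; x++)  /* caller guarantees margins around the block */
+@         {
+@             int tmp[6];               /* stage 1: vertical 6-tap on 6 columns */
+@             for (int c = 0; c < 6; c++)
+@             {
+@                 const uint8_t *p = src + y * src_strd + (x + c - 2);
+@                 tmp[c] = p[-2 * src_strd] - 5 * p[-src_strd] + 20 * p[0]
+@                        + 20 * p[src_strd] - 5 * p[2 * src_strd]
+@                        + p[3 * src_strd];
+@             }
+@             int half_y  = clip_u8((tmp[2 + x_bit] + 16) >> 5); /* (int,1/2) */
+@             int v       = tmp[0] - 5 * tmp[1] + 20 * tmp[2]    /* stage 2 */
+@                         + 20 * tmp[3] - 5 * tmp[4] + tmp[5];
+@             int half_xy = clip_u8((v + 512) >> 10);            /* (1/2,1/2) */
+@             dst[y * dst_strd + x] = (half_y + half_xy + 1) >> 1;
+@         }
+@ }
+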
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q
+
+ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104] @ loads ht
+ sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd
+ sub r0, r0, #2 @pu1_src-2
+ ldr r5, [sp, #108] @ loads wd
+ ldr r6, [sp, #116] @ loads dydx
+ and r6, r6, #2 @ dydx & 0x2 = (x_offset>>1)*2: byte offset into the 16-bit temp buffer
+ ldr r9, [sp, #112] @pu1_tmp
+ add r7, r9, #4
+ add r6, r7, r6 @ pi16_pred1_temp += (x_offset>>1)
+
+ vmov.u16 q13, #0x14 @ Filter coeff 20 into Q13
+ vmov.u16 q12, #0x5 @ Filter coeff 5 into Q12
+ mov r7, #0x20
+ mov r8, #0x30
+ subs r12, r5, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+ subs r12, r5, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+ @when wd=16
+ vmov.u16 q14, #0x14 @ Filter coeff 20 into Q14
+ vmov.u16 q15, #0x5 @ Filter coeff 5 into Q15
+ add r14, r2, #0
+ sub r2, r2, #16
+
+
+loop_16:
+
+ vld1.u32 {q0}, [r0]! @ Vector load from src[0_0]
+ vld1.u32 d12, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0]! @ Vector load from src[1_0]
+ vld1.u32 d13, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0]! @ Vector load from src[2_0]
+ vld1.u32 d14, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0]! @ Vector load from src[3_0]
+ vld1.u32 d15, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0]! @ Vector load from src[4_0]
+ vld1.u32 d16, [r0], r2 @ Vector load from src[4_0]
+
+ vld1.u32 {q5}, [r0]! @ Vector load from src[5_0]
+ vld1.u32 d17, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q10, d4, d6
+ vaddl.u8 q9, d0, d10
+ vaddl.u8 q11, d2, d8
+ vmla.u16 q9, q10, q14
+ vaddl.u8 q12, d5, d7
+ vaddl.u8 q10, d1, d11
+ vaddl.u8 q13, d3, d9
+ vmla.u16 q10, q12, q14
+ vaddl.u8 q12, d14, d15
+ vmls.u16 q9, q11, q15
+ vaddl.u8 q11, d12, d17
+ vmls.u16 q10, q13, q15
+ vaddl.u8 q13, d13, d16
+ vmla.u16 q11, q12, q14
+ vmls.u16 q11, q13, q15
+ vst1.32 {q9}, [r9]!
+ vst1.32 {q10}, [r9]!
+ vext.16 q12, q9, q10, #2
+ vext.16 q13, q9, q10, #3
+ vst1.32 {q11}, [r9]
+ vext.16 q11, q9, q10, #5
+ vadd.s16 q0, q12, q13
+ vext.16 q12, q9, q10, #1
+ vext.16 q13, q9, q10, #4
+ vadd.s16 q12, q12, q13
+
+ vaddl.s16 q13, d18, d22
+ vmlal.s16 q13, d0, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d19, d23
+ vmlal.s16 q11, d1, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10
+ vqrshrun.s32 d19, q11, #10
+ vld1.32 {q11}, [r9]!
+ vqmovn.u16 d18, q9
+
+ vext.16 q12, q10, q11, #2
+ vext.16 q13, q10, q11, #3
+ vext.16 q0, q10, q11, #5
+ vst1.32 d18, [r1]
+ vadd.s16 q9, q12, q13
+ vext.16 q12, q10, q11, #1
+ vext.16 q13, q10, q11, #4
+ vadd.s16 q12, q12, q13
+
+ vaddl.s16 q13, d0, d20
+ vmlal.s16 q13, d18, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d1, d21
+ vmlal.s16 q11, d19, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10
+ vqrshrun.s32 d19, q11, #10
+
+ vaddl.u8 q12, d7, d9
+ vld1.32 {q10}, [r6]!
+ vld1.32 {q11}, [r6], r7
+
+ vqmovn.u16 d19, q9
+
+ vld1.32 d18, [r1]
+ vqrshrun.s16 d20, q10, #5
+ vqrshrun.s16 d21, q11, #5
+ vaddl.u8 q11, d4, d10
+ vld1.u32 {q0}, [r0]! @ Vector load from src[6_0]
+ vrhadd.u8 q9, q9, q10
+ vld1.u32 d12, [r0], r2 @ Vector load from src[6_0]
+ vaddl.u8 q10, d6, d8
+ vaddl.u8 q13, d5, d11
+ vst1.32 {q9}, [r1], r3 @ store row 0
+
+@ROW_2
+
+ vaddl.u8 q9, d2, d0
+
+ vmla.u16 q9, q10, q14
+
+ vaddl.u8 q10, d3, d1
+
+ vmla.u16 q10, q12, q14
+ vaddl.u8 q12, d15, d16
+ vmls.u16 q9, q11, q15
+ vaddl.u8 q11, d13, d12
+ vmls.u16 q10, q13, q15
+ vaddl.u8 q13, d14, d17
+ vmla.u16 q11, q12, q14
+ vmls.u16 q11, q13, q15
+ vst1.32 {q9}, [r9]!
+ vst1.32 {q10}, [r9]!
+ vext.16 q12, q9, q10, #2
+ vext.16 q13, q9, q10, #3
+ vst1.32 {q11}, [r9]
+ vext.16 q11, q9, q10, #5
+ vadd.s16 q1, q12, q13
+ vext.16 q12, q9, q10, #1
+ vext.16 q13, q9, q10, #4
+ vadd.s16 q12, q12, q13
+
+ vaddl.s16 q13, d18, d22
+ vmlal.s16 q13, d2, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d19, d23
+ vmlal.s16 q11, d3, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10
+ vqrshrun.s32 d19, q11, #10
+ vld1.32 {q11}, [r9]!
+ vqmovn.u16 d18, q9
+
+ vext.16 q12, q10, q11, #2
+ vext.16 q13, q10, q11, #3
+ vext.16 q1, q10, q11, #5
+ vst1.32 d18, [r1]
+ vadd.s16 q9, q12, q13
+ vext.16 q12, q10, q11, #1
+ vext.16 q13, q10, q11, #4
+ vadd.s16 q12, q12, q13
+
+ vaddl.s16 q13, d2, d20
+ vmlal.s16 q13, d18, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d3, d21
+ vmlal.s16 q11, d19, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10
+ vqrshrun.s32 d19, q11, #10
+ vaddl.u8 q12, d9, d11
+ vld1.32 {q10}, [r6]!
+ vld1.32 {q11}, [r6], r7
+ vqmovn.u16 d19, q9
+ vld1.32 d18, [r1]
+ vqrshrun.s16 d20, q10, #5
+ vqrshrun.s16 d21, q11, #5
+
+ vrhadd.u8 q9, q9, q10
+
+ vst1.32 {q9}, [r1], r3 @ store row 1
+
+ subs r4, r4, #2
+ subne r0, r0 , r14, lsl #2
+ subne r0, r0, r14
+
+ beq end_func @ Branch if height==4
+ b loop_16 @ Loop if height==8
+
+loop_8:
+ vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0]
+
+ vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0]
+ vaddl.u8 q7, d4, d6
+ vaddl.u8 q6, d0, d10
+ vaddl.u8 q8, d2, d8
+ vmla.u16 q6, q7, q13
+ vaddl.u8 q9, d5, d7
+ vaddl.u8 q7, d1, d11
+ vaddl.u8 q11, d3, d9
+ vmla.u16 q7, q9, q13
+ vmls.u16 q6, q8, q12
+ vld1.32 {q0}, [r0], r2 @ Vector load from src[6_0]
+ vaddl.u8 q8, d6, d8
+ vmls.u16 q7, q11, q12
+ vaddl.u8 q14, d2, d0
+ vst1.32 {q6}, [r9]! @ store row 0 to temp buffer: col 0
+ vext.16 q11, q6, q7, #5
+ vaddl.u8 q9, d4, d10
+ vmla.u16 q14, q8, q13
+ vaddl.s16 q15, d12, d22
+ vst1.32 {q7}, [r9], r7 @ store row 0 to temp buffer: col 1
+ vaddl.s16 q11, d13, d23
+ vext.16 q8, q6, q7, #2
+ vmls.u16 q14, q9, q12
+ vext.16 q9, q6, q7, #3
+ vext.16 q10, q6, q7, #4
+ vext.16 q7, q6, q7, #1
+ vadd.s16 q8, q8, q9
+ vadd.s16 q9, q7, q10
+ vaddl.u8 q10, d7, d9
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vaddl.u8 q7, d3, d1
+ vst1.32 {q14}, [r9]! @ store row 1 to temp buffer: col 0
+ vmla.u16 q7, q10, q13
+ vqrshrun.s32 d12, q15, #10
+ vaddl.u8 q8, d5, d11
+ vqrshrun.s32 d13, q11, #10
+ vmls.u16 q7, q8, q12
+@ vld1.32 {q1},[r0],r2 ; Vector load from src[7_0]
+ vqmovn.u16 d25, q6
+ vaddl.u8 q8, d8, d10
+
+
+ vext.16 q11, q14, q7, #5
+ vaddl.u8 q10, d4, d2
+ vaddl.s16 q15, d28, d22
+ vmla.u16 q10, q8, q13
+ vst1.32 {q7}, [r9], r7 @ store row 1 to temp buffer: col 1
+ vaddl.s16 q11, d29, d23
+ vext.16 q8, q14, q7, #2
+ vext.16 q9, q14, q7, #3
+ vext.16 q6, q14, q7, #4
+ vext.16 q7, q14, q7, #1
+ vadd.s16 q8, q8, q9
+ vadd.s16 q9, q6, q7
+ vld1.32 {q7}, [r6], r8 @ load row 0 from temp buffer
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vqrshrun.s16 d14, q7, #0x5
+ vld1.32 {q14}, [r6], r8 @ load row 1 from temp buffer
+ vaddl.u8 q9, d6, d0
+ vqrshrun.s32 d16, q15, #10
+ vqrshrun.s16 d15, q14, #0x5
+ vqrshrun.s32 d17, q11, #10
+ vmov d12, d25
+ vmov d25, d24
+
+ vqmovn.u16 d13, q8
+ vrhadd.u8 q6, q6, q7
+
+ vst1.32 d12, [r1], r3 @ store row 0
+ vst1.32 d13, [r1], r3 @ store row 1
+
+ subs r4, r4, #2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+
+ beq end_func @ Branch if height==4
+ b loop_8 @ Loop if height==8
+
+loop_4:
+ vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q7, d4, d6 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q6, d0, d10 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q6, q7, q13 @ temp += temp1 * 20
+ vaddl.u8 q9, d5, d7 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q7, d1, d11 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q11, d3, d9 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q7, q9, q13 @ temp += temp1 * 20
+ vmls.u16 q6, q8, q12 @ temp -= temp2 * 5
+ vld1.32 {q0}, [r0], r2 @ Vector load from src[6_0]
+ vaddl.u8 q8, d6, d8
+ vmls.u16 q7, q11, q12 @ temp -= temp2 * 5
+ @Q6 and Q7 have filtered values
+ vaddl.u8 q14, d2, d0
+ vst1.32 {q6}, [r9]! @ store row 0 to temp buffer: col 0
+ vext.16 q11, q6, q7, #5
+ vaddl.u8 q9, d4, d10
+ vmla.u16 q14, q8, q13
+ vaddl.s16 q15, d12, d22
+ vst1.32 {q7}, [r9], r7 @ store row 0 to temp buffer: col 1
+ vaddl.s16 q11, d13, d23
+ vext.16 q8, q6, q7, #2
+ vmls.u16 q14, q9, q12
+ vext.16 q9, q6, q7, #3
+ vext.16 q10, q6, q7, #4
+ vext.16 q7, q6, q7, #1
+ vadd.s16 q8, q8, q9
+ vadd.s16 q9, q7, q10
+ vaddl.u8 q10, d7, d9
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vaddl.u8 q7, d3, d1
+ vst1.32 {q14}, [r9]! @ store row 1 to temp buffer: col 0
+ vmla.u16 q7, q10, q13
+ vqrshrun.s32 d12, q15, #10
+ vaddl.u8 q8, d5, d11
+ vqrshrun.s32 d13, q11, #10
+ vmls.u16 q7, q8, q12
+ vqmovn.u16 d25, q6
+ vaddl.u8 q8, d8, d10
+
+ vext.16 q11, q14, q7, #5
+ vaddl.u8 q10, d4, d2
+ vaddl.s16 q15, d28, d22
+ vmla.u16 q10, q8, q13
+ vst1.32 {q7}, [r9], r7 @ store row 1 to temp buffer: col 1
+ vaddl.s16 q11, d29, d23
+ vext.16 q8, q14, q7, #2
+ vext.16 q9, q14, q7, #3
+ vext.16 q6, q14, q7, #4
+ vext.16 q7, q14, q7, #1
+ vadd.s16 q8, q8, q9
+ vadd.s16 q9, q6, q7
+ vld1.32 d14, [r6], r8 @load row 0 from temp buffer
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vqrshrun.s16 d14, q7, #0x5
+ vld1.32 d28, [r6], r8 @load row 1 from temp buffer
+ vaddl.u8 q9, d6, d0
+ vqrshrun.s32 d16, q15, #10
+ vqrshrun.s16 d15, q14, #0x5
+ vqrshrun.s32 d17, q11, #10
+ vmov d12, d25
+ vmov d25, d24
+
+ vqmovn.u16 d13, q8
+ vrhadd.u8 q6, q6, q7
+ vst1.32 d12[0], [r1], r3 @ store row 0
+ vst1.32 d13[0], [r1], r3 @store row 1
+
+ subs r4, r4, #2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+
+ beq end_func @ Branch if height==4
+ b loop_4 @ Loop if height==8
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
new file mode 100755
index 0000000..3c8b60a
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
@@ -0,0 +1,355 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements two six tap filters. It
+@* applies the six tap filter in the horizontal direction on the
+@* predictor values, then applies the same filter in the
+@* vertical direction on the predictor values. It then averages these
+@* two outputs to obtain quarter pel values in horizontal and vertical direction.
+@* The six tap filtering operation is described in sec 8.4.2.2.1 titled
+@* "Luma sample interpolation process"
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/4,1/4) or (3/4,1/4) or (1/4,3/4) or (3/4,3/4).
+@* The function interpolates the predictors first in the horizontal direction
+@* and then in the vertical direction, and then averages these two
+@* values.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ht
+@ r5 => wd
+@ r6 => dydx
+
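+@ Illustrative per-pixel C sketch of the computation described above (a
+@ simplified outline, not the reference code in ih264_inter_pred_filters.c;
+@ names are made up, <stdint.h> assumed): both half-pel values are computed
+@ independently from the source and then averaged.
+@
+@ static int clip_u8(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }
+@
+@ void horz_qpel_vert_qpel_sketch(const uint8_t *src, uint8_t *dst,
+@                                 int src_strd, int dst_strd,
+@                                 int ht, int wd, uint32_t dydx)
+@ {
+@     int x_bit = (dydx & 3) >> 1;         /* column of the vertical half-pel */
+@     int y_bit = ((dydx >> 2) & 3) >> 1;  /* row of the horizontal half-pel  */
+@     for (int y = 0; y < ht; y++)
+@         for (int x = 0; x < wd; x++)     /* caller guarantees margins around the block */
+@         {
+@             const uint8_t *pv = src + y * src_strd + x + x_bit;
+@             int sv = pv[-2 * src_strd] - 5 * pv[-src_strd] + 20 * pv[0]
+@                    + 20 * pv[src_strd] - 5 * pv[2 * src_strd]
+@                    + pv[3 * src_strd];
+@             const uint8_t *ph = src + (y + y_bit) * src_strd + x;
+@             int sh = ph[-2] - 5 * ph[-1] + 20 * ph[0]
+@                    + 20 * ph[1] - 5 * ph[2] + ph[3];
+@             int half_v = clip_u8((sv + 16) >> 5);  /* vertical half-pel   */
+@             int half_h = clip_u8((sh + 16) >> 5);  /* horizontal half-pel */
+@             dst[y * dst_strd + x] = (half_v + half_h + 1) >> 1;
+@         }
+@ }
+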
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q
+
+ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104] @ loads ht
+ ldr r5, [sp, #108] @ loads wd
+ ldr r6, [sp, #116] @dydx
+ and r7, r6, #3
+ add r7, r0, r7, lsr #1 @pu1_pred_vert = pu1_src + (x_offset>>1)
+
+ and r6, r6, #12 @Finds y-offset
+ lsr r6, r6, #3 @dydx>>3
+ mul r6, r2, r6
+ add r6, r0, r6 @pu1_pred_horz = pu1_src + (y_offset>>1)*src_strd
+ sub r7, r7, r2, lsl #1 @pu1_pred_vert-2*src_strd
+ sub r6, r6, #2 @pu1_pred_horz-2
+ vmov.u8 d30, #20 @ Filter coeff 20
+ vmov.u8 d31, #5 @ Filter coeff 5
+
+ subs r12, r5, #4 @if wd=4 branch to loop_4
+ beq loop_4
+ subs r12, r5, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+loop_16:
+ vld1.32 {q0}, [r7], r2 @ Vector load from src[0_0]
+ vld1.32 {q1}, [r7], r2 @ Vector load from src[1_0]
+ vld1.32 {q2}, [r7], r2 @ Vector load from src[2_0]
+ vld1.32 {q3}, [r7], r2 @ Vector load from src[3_0]
+ vld1.32 {q4}, [r7], r2 @ Vector load from src[4_0]
+ add r11, r6, #8
+ vld1.32 {q5}, [r7], r2 @ Vector load from src[5_0]
+ vld1.32 {q9}, [r6], r2 @ horz row0, col 0
+ vaddl.u8 q12, d0, d10
+ vmlal.u8 q12, d4, d30
+ vmlal.u8 q12, d6, d30
+ vmlsl.u8 q12, d2, d31
+ vmlsl.u8 q12, d8, d31
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vext.8 d19, d18, d19, #1
+ vqrshrun.s16 d26, q12, #5
+ vaddl.u8 q14, d18, d23
+ vmlal.u8 q14, d20, d30
+ vmlal.u8 q14, d21, d30
+ vmlsl.u8 q14, d19, d31
+ vmlsl.u8 q14, d22, d31
+ vld1.32 {q9}, [r11], r2 @ horz row 0, col 1
+ vaddl.u8 q12, d1, d11
+ vmlal.u8 q12, d5, d30
+ vmlal.u8 q12, d7, d30
+ vmlsl.u8 q12, d3, d31
+ vmlsl.u8 q12, d9, d31
+ vqrshrun.s16 d28, q14, #5
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vext.8 d19, d18, d19, #1
+ vqrshrun.s16 d27, q12, #5
+ vld1.32 {q6}, [r7], r2 @ src[6_0]
+
+ vaddl.u8 q12, d18, d23
+ vmlal.u8 q12, d20, d30
+ vmlal.u8 q12, d21, d30
+ vmlsl.u8 q12, d19, d31
+ vmlsl.u8 q12, d22, d31
+
+ vaddl.u8 q8, d2, d12
+ vmlal.u8 q8, d6, d30
+ vmlal.u8 q8, d8, d30
+ vmlsl.u8 q8, d4, d31
+ vmlsl.u8 q8, d10, d31
+ vqrshrun.s16 d29, q12, #5
+ vld1.32 {q9}, [r6], r2 @ horz row 1, col 0
+
+ vaddl.u8 q12, d3, d13
+ vmlal.u8 q12, d7, d30
+ vmlal.u8 q12, d9, d30
+ vmlsl.u8 q12, d5, d31
+ vmlsl.u8 q12, d11, d31
+ vrhadd.u8 q14, q14, q13
+ vqrshrun.s16 d26, q8, #5
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vst1.32 {q14}, [r1], r3 @ store row 0
+ vext.8 d19, d18, d19, #1
+ vqrshrun.s16 d27, q12, #5
+
+ vaddl.u8 q14, d18, d23
+ vmlal.u8 q14, d20, d30
+ vmlal.u8 q14, d21, d30
+ vmlsl.u8 q14, d19, d31
+ vmlsl.u8 q14, d22, d31
+
+ vld1.32 {q9}, [r11], r2 @ horz row 1, col 1
+
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vext.8 d19, d18, d19, #1
+
+ vqrshrun.s16 d28, q14, #5
+ vaddl.u8 q12, d18, d23
+ vmlal.u8 q12, d20, d30
+ vmlal.u8 q12, d21, d30
+ vmlsl.u8 q12, d19, d31
+ vmlsl.u8 q12, d22, d31
+
+ vqrshrun.s16 d29, q12, #5
+ vrhadd.u8 q14, q14, q13
+ vst1.32 {q14}, [r1], r3 @ store row 1
+
+ subs r4, r4, #2 @ 2 rows processed, decrement by 2
+ subne r7, r7 , r2, lsl #2
+ subne r7, r7, r2
+ beq end_func @ Branch if height==4
+
+ b loop_16 @ looping if height = 8 or 16
+
+
+loop_8:
+ vld1.32 d0, [r7], r2 @ Vector load from src[0_0]
+ vld1.32 d1, [r7], r2 @ Vector load from src[1_0]
+ vld1.32 d2, [r7], r2 @ Vector load from src[2_0]
+ vld1.32 d3, [r7], r2 @ Vector load from src[3_0]
+ vld1.32 d4, [r7], r2 @ Vector load from src[4_0]
+ vld1.32 d5, [r7], r2 @ Vector load from src[5_0]
+ vaddl.u8 q5, d0, d5
+ vmlal.u8 q5, d2, d30
+ vmlal.u8 q5, d3, d30
+ vmlsl.u8 q5, d1, d31
+ vmlsl.u8 q5, d4, d31
+ vld1.32 {q6}, [r6], r2 @horz row 0
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d26, q5, #5
+ vld1.32 d6, [r7], r2 @ src[6_0]
+ vaddl.u8 q5, d12, d17
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vld1.32 {q6}, [r6], r2 @ horz row 1
+ vaddl.u8 q9, d1, d6
+ vmlal.u8 q9, d3, d30
+ vmlal.u8 q9, d4, d30
+ vmlsl.u8 q9, d2, d31
+ vmlsl.u8 q9, d5, d31
+ vqrshrun.s16 d28, q5, #5
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d27, q9, #5
+ vaddl.u8 q5, d12, d17
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vqrshrun.s16 d29, q5, #5
+ vrhadd.u8 q13, q13, q14
+ vst1.32 d26, [r1], r3
+ vst1.32 d27, [r1], r3
+
+ subs r4, r4, #2 @ 2 rows processed, decrement by 2
+ subne r7, r7 , r2, lsl #2
+ subne r7, r7, r2
+ beq end_func @ Branch if height==4
+ b loop_8 @looping if height == 8 or 16
+
+loop_4:
+ vld1.32 d0[0], [r7], r2 @ Vector load from src[0_0]
+ vld1.32 d1[0], [r7], r2 @ Vector load from src[1_0]
+ vld1.32 d2[0], [r7], r2 @ Vector load from src[2_0]
+ vld1.32 d3[0], [r7], r2 @ Vector load from src[3_0]
+ vld1.32 d4[0], [r7], r2 @ Vector load from src[4_0]
+ vld1.32 d5[0], [r7], r2 @ Vector load from src[5_0]
+ vaddl.u8 q5, d0, d5
+ vmlal.u8 q5, d2, d30
+ vmlal.u8 q5, d3, d30
+ vmlsl.u8 q5, d1, d31
+ vmlsl.u8 q5, d4, d31
+ vld1.32 {q6}, [r6], r2 @load for horz filter row 0
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d26, q5, #5
+ vld1.32 d6[0], [r7], r2 @ Vector load from src[6_0]
+ vaddl.u8 q5, d12, d17
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vld1.32 {q6}, [r6], r2 @horz row 1
+ vaddl.u8 q9, d1, d6
+ vmlal.u8 q9, d3, d30
+ vmlal.u8 q9, d4, d30
+ vmlsl.u8 q9, d2, d31
+ vmlsl.u8 q9, d5, d31
+ vqrshrun.s16 d28, q5, #5
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d27, q9, #5
+ vaddl.u8 q5, d12, d17
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vqrshrun.s16 d29, q5, #5
+ vrhadd.u8 q13, q13, q14
+ vst1.32 d26[0], [r1], r3
+ vst1.32 d27[0], [r1], r3
+
+ subs r4, r4, #2 @ 2 rows processed, decrement by 2
+ subne r7, r7 , r2, lsl #2
+ subne r7, r7, r2
+ beq end_func @ Branch if height==4
+ b loop_4 @ Loop if height==8
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
new file mode 100755
index 0000000..d45055e
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
@@ -0,0 +1,330 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_vert_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction vertical quarter pel interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_vert_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Quarter pel inter prediction luma filter for vertical input
+@*
+@* @par Description:
+@* Applies a 6 tap vertical filter. The output is clipped to 8 bits as
+@* described in sec 8.4.2.2.1 titled "Luma sample interpolation process".
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations.
+@* @returns
+@*
+@ @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_vert (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+@ r7 => dydx
+
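+@ Outline of the per-pixel arithmetic implemented below (a sketch derived from the
+@ NEON code; the names A..F and half_pel are illustrative only):
+@   temp     = (A + F) + 20*(C + D) - 5*(B + E), for six vertically adjacent samples A..F
+@   half_pel = CLIP_U8((temp + 16) >> 5)                  @ vqrshrun #5
+@   dst      = (half_pel + nearest_full_pel + 1) >> 1     @ vrhadd.u8, row picked via dydx
+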
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_vert_qpel_a9q
+
+ih264_inter_pred_luma_vert_qpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r5, [sp, #104] @Loads ht
+
+ ldr r6, [sp, #108] @Loads wd
+ ldr r7, [sp, #116] @Loads dydx
+ and r7, r7, #12 @Finds y-offset
+ lsr r7, r7, #3 @dydx>>3
+ mul r7, r2, r7
+ add r7, r0, r7 @pu1_src + (y_offset>>1)*src_strd
+ vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11
+ sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd
+ subs r12, r6, #8 @if wd=8 branch to loop_8
+ vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12
+ beq loop_8
+
+ subs r12, r6, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_16: @when wd=16
+
+ vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0]
+ vaddl.u8 q6, d4, d6 @ temp1 = src[2_0] + src[3_0]
+ vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0]
+ vaddl.u8 q7, d0, d10 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q7, q6, q11 @ temp += temp1 * 20
+ vaddl.u8 q10, d1, d11 @ temp4 = src[0_8] + src[5_8]
+ vaddl.u8 q9, d5, d7 @ temp3 = src[2_8] + src[3_8]
+ vmla.u16 q10, q9, q11 @ temp4 += temp3 * 20
+ vld1.u32 {q0}, [r0], r2
+ vaddl.u8 q13, d3, d9 @ temp5 = src[1_8] + src[4_8]
+ vaddl.u8 q6, d6, d8
+ vmls.u16 q7, q8, q12 @ temp -= temp2 * 5
+ vaddl.u8 q8, d2, d0
+ vaddl.u8 q9, d4, d10
+ vmla.u16 q8, q6, q11
+ vmls.u16 q10, q13, q12 @ temp4 -= temp5 * 5
+ vaddl.u8 q13, d5, d11
+ vaddl.u8 q6, d7, d9
+ vqrshrun.s16 d30, q7, #5 @ dst[0_0] = CLIP_U8((temp +16) >> 5)
+ vaddl.u8 q7, d3, d1
+ vld1.u32 {q1}, [r0], r2
+ vmla.u16 q7, q6, q11
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d31, q10, #5 @ dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+ vld1.u32 {q10}, [r7], r2 @ Load for interpolation row 0
+ vrhadd.u8 q15, q10, q15 @ Interpolation to obtain qpel value
+ vaddl.u8 q9, d4, d2
+ vaddl.u8 q6, d8, d10
+
+ vst1.u32 {q15}, [r1], r3 @ Vector store to dst[0_0]
+ vmla.u16 q9, q6, q11
+ vaddl.u8 q10, d6, d0
+ vmls.u16 q7, q13, q12
+ vqrshrun.s16 d30, q8, #5
+ vaddl.u8 q6, d9, d11
+ vaddl.u8 q8, d5, d3
+ vaddl.u8 q13, d7, d1
+ vmla.u16 q8, q6, q11
+ vmls.u16 q9, q10, q12
+ vld1.u32 {q2}, [r0], r2
+
+ vqrshrun.s16 d31, q7, #5
+ vld1.u32 {q7}, [r7], r2 @ Load for interpolation row 1
+ vaddl.u8 q6, d10, d0
+ vrhadd.u8 q15, q7, q15 @ Interpolation to obtain qpel value
+ vaddl.u8 q7, d6, d4
+ vaddl.u8 q10, d8, d2
+ vmla.u16 q7, q6, q11
+ vmls.u16 q8, q13, q12
+ vst1.u32 {q15}, [r1], r3 @store row 1
+ vqrshrun.s16 d30, q9, #5
+ vaddl.u8 q9, d7, d5
+ vaddl.u8 q6, d11, d1
+ vmla.u16 q9, q6, q11
+ vaddl.u8 q13, d9, d3
+ vmls.u16 q7, q10, q12
+ vqrshrun.s16 d31, q8, #5
+ vld1.u32 {q8}, [r7], r2 @ Load for interpolation row 2
+ vmls.u16 q9, q13, q12
+ vrhadd.u8 q15, q8, q15 @ Interpolation to obtain qpel value
+ vaddl.u8 q6, d0, d2 @ temp1 = src[2_0] + src[3_0]
+ vst1.u32 {q15}, [r1], r3 @store row 2
+ vaddl.u8 q8, d10, d4 @ temp2 = src[1_0] + src[4_0]
+ vaddl.u8 q10, d9, d7 @ temp4 = src[0_8] + src[5_8]
+ vqrshrun.s16 d30, q7, #5
+ vaddl.u8 q13, d5, d11 @ temp5 = src[1_8] + src[4_8]
+ vaddl.u8 q7, d8, d6 @ temp = src[0_0] + src[5_0]
+ vqrshrun.s16 d31, q9, #5
+ vld1.u32 {q9}, [r7], r2 @ Load for interpolation row 3
+ vmla.u16 q7, q6, q11 @ temp += temp1 * 20
+ vrhadd.u8 q15, q9, q15 @ Interpolation to obtain qpel value
+ vaddl.u8 q9, d1, d3 @ temp3 = src[2_8] + src[3_8]
+ vst1.u32 {q15}, [r1], r3 @store row 3
+ subs r5, r5, #4 @ 4 rows processed, decrement by 4
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_16 @ looping if height = 8 or 16
+
+
+loop_8:
+
+ @// Processing row0 and row1
+ vld1.u32 d0, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 d1, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 d2, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 d3, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 d4, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 d5, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q4, q3, q11 @ temp += temp1 * 20
+ vld1.u32 d6, [r0], r2
+ vaddl.u8 q7, d3, d4
+ vaddl.u8 q8, d1, d6
+ vaddl.u8 q9, d2, d5
+ vmls.u16 q4, q5, q12 @ temp -= temp2 * 5
+ vmla.u16 q8, q7, q11
+ vld1.u32 d7, [r0], r2
+ vaddl.u8 q10, d4, d5
+ vaddl.u8 q6, d2, d7
+ vaddl.u8 q5, d3, d6
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+ vmla.u16 q6, q10, q11
+ vld1.32 d8, [r7], r2 @Load value for interpolation (row0)
+ vld1.32 d9, [r7], r2 @Load value for interpolation (row1)
+ vld1.u32 d0, [r0], r2
+ vaddl.u8 q7, d5, d6
+ vqrshrun.s16 d27, q8, #5
+ vrhadd.u8 q13, q4, q13 @ Interpolation step for qpel calculation
+ vaddl.u8 q10, d3, d0
+ vmls.u16 q6, q5, q12
+ vst1.u32 d26, [r1], r3 @ Vector store to dst[0_0]
+ vaddl.u8 q9, d4, d7
+ vmla.u16 q10, q7, q11
+ vst1.u32 d27, [r1], r3 @ Vector store to dst[1_0]
+ vqrshrun.s16 d28, q6, #5
+ vmls.u16 q10, q9, q12
+ vld1.32 d12, [r7], r2 @Load value for interpolation (row2)
+ vld1.32 d13, [r7], r2 @Load value for interpolation (row3)
+ vqrshrun.s16 d29, q10, #5
+ subs r9, r5, #4
+ vrhadd.u8 q14, q6, q14
+ vst1.u32 d28, [r1], r3 @store row 2
+ vst1.u32 d29, [r1], r3 @store row 3
+
+ subs r5, r5, #4 @ 4 rows processed, decrement by 4
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+ b loop_8 @looping if height == 8 or 16
+
+loop_4:
+@// Processing row0 and row1
+
+ vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 d2[0], [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 d3[0], [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 d4[0], [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 d5[0], [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q4, q3, q11 @ temp += temp1 * 20
+ vld1.u32 d6, [r0], r2
+ vaddl.u8 q7, d3, d4
+ vaddl.u8 q8, d1, d6
+ vaddl.u8 q9, d2, d5
+ vmls.u16 q4, q5, q12 @ temp -= temp2 * 5
+ vld1.u32 d7[0], [r0], r2
+ vmla.u16 q8, q7, q11
+ vaddl.u8 q10, d4, d5
+ vaddl.u8 q6, d2, d7
+ vaddl.u8 q5, d3, d6
+ vmls.u16 q8, q9, q12
+ vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+ vld1.u32 d8[0], [r7], r2 @Load value for interpolation - row 0
+ vld1.u32 d9[0], [r7], r2 @Load value for interpolation - row 1
+ vmla.u16 q6, q10, q11
+ vld1.u32 d0[0], [r0], r2
+ vaddl.u8 q7, d5, d6
+ vqrshrun.s16 d27, q8, #5
+ vaddl.u8 q10, d3, d0
+ vrhadd.u8 q13, q13, q4 @Interpolation step for qpel calculation
+ vmls.u16 q6, q5, q12
+ vst1.u32 d26[0], [r1], r3 @ Vector store to dst[0_0]
+ vaddl.u8 q9, d4, d7
+ vmla.u16 q10, q7, q11
+ vst1.u32 d27[0], [r1], r3 @ store row 1
+ vqrshrun.s16 d28, q6, #5
+ vld1.u32 d12[0], [r7], r2 @Load value for interpolation - row 2
+ vld1.u32 d13[0], [r7], r2 @Load value for interpolation - row 3
+
+ vmls.u16 q10, q9, q12
+ vqrshrun.s16 d29, q10, #5
+ vrhadd.u8 q14, q6, q14 @Interpolation step for qpel calculation
+ vst1.u32 d28[0], [r1], r3 @store row 2
+ vst1.u32 d29[0], [r1], r3 @store row 3
+
+ subs r5, r5, #8
+ subeq r0, r0, r2, lsl #2
+ subeq r0, r0, r2
+ beq loop_4 @ Loop if height==8
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_intra_pred_chroma_a9q.s b/common/arm/ih264_intra_pred_chroma_a9q.s
new file mode 100755
index 0000000..d03fc55
--- /dev/null
+++ b/common/arm/ih264_intra_pred_chroma_a9q.s
@@ -0,0 +1,551 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_chroma_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra chroma prediction.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_intra_pred_chroma_8x8_mode_horz_a9q()
+@* - ih264_intra_pred_chroma_8x8_mode_vert_a9q()
+@* - ih264_intra_pred_chroma_8x8_mode_dc_a9q()
+@* - ih264_intra_pred_chroma_8x8_mode_plane_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+.text
+.p2align 2
+
+ .extern ih264_gai1_intrapred_chroma_plane_coeffs1
+.hidden ih264_gai1_intrapred_chroma_plane_coeffs1
+ .extern ih264_gai1_intrapred_chroma_plane_coeffs2
+.hidden ih264_gai1_intrapred_chroma_plane_coeffs2
+scratch_chroma_intrapred_addr1:
+ .long ih264_gai1_intrapred_chroma_plane_coeffs1 - scrlblc1 - 8
+
+scratch_intrapred_chroma_plane_addr1:
+ .long ih264_gai1_intrapred_chroma_plane_coeffs2 - scrlblc2 - 8
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_dc
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+
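+@ DC rule implemented below (a sketch derived from the code; per 4x4 UV sub-block):
+@   both edges seen by the sub-block : dc = (sum_left4 + sum_top4 + 4) >> 3   @ vqrshrun #3
+@   only one edge seen               : dc = (sum4 + 2) >> 2                   @ vqrshrun #2
+@   neither edge available           : the whole block is filled with 128
+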
+ .global ih264_intra_pred_chroma_8x8_mode_dc_a9q
+
+ih264_intra_pred_chroma_8x8_mode_dc_a9q:
+
+ stmfd sp!, {r4, r14} @store register values to stack
+ ldr r4, [sp, #8] @r4 => ui_neighboravailability
+ vpush {d8-d15}
+
+ ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE
+ beq top_available
+ ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+ beq left_available
+
+ vld1.u8 {q0}, [r0] @BOTH LEFT AND TOP AVAILABLE
+ add r0, r0, #18
+ vld1.u8 {q1}, [r0]
+ vaddl.u8 q2, d1, d2
+ vaddl.u8 q3, d0, d3
+ vmovl.u8 q1, d3
+ vmovl.u8 q0, d0
+
+ vadd.u16 d12, d4, d5
+ vadd.u16 d13, d2, d3
+ vadd.u16 d15, d6, d7
+ vadd.u16 d14, d0, d1
+
+ vpadd.u32 d12, d12, d15
+ vpadd.u32 d14, d13, d14
+ vqrshrun.s16 d12, q6, #3
+ vqrshrun.s16 d14, q7, #2
+ vdup.u16 d8, d12[0]
+ vdup.u16 d9, d14[0]
+ vdup.u16 d10, d14[1]
+ vdup.u16 d11, d12[1]
+ b str_pred
+
+top_available: @ONLY TOP AVAILABLE
+ ands r2, r4, #0x04 @CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+ beq none_available
+
+ add r0, r0, #18
+ vld1.u8 {q0}, [r0]
+ vmovl.u8 q1, d0
+ vmovl.u8 q2, d1
+ vadd.u16 d0, d2, d3
+ vadd.u16 d1, d4, d5
+ vpaddl.u32 q0, q0
+ vqrshrun.s16 d0, q0, #2
+ vdup.u16 d8, d0[0]
+ vdup.u16 d9, d0[2]
+ vmov q5, q4
+ b str_pred
+
+left_available: @ONLY LEFT AVAILABLE
+ vld1.u8 {q0}, [r0]
+ vmovl.u8 q1, d0
+ vmovl.u8 q2, d1
+ vadd.u16 d0, d2, d3
+ vadd.u16 d1, d4, d5
+ vpaddl.u32 q0, q0
+ vqrshrun.s16 d0, q0, #2
+ vdup.u16 q5, d0[0]
+ vdup.u16 q4, d0[2]
+ b str_pred
+
+none_available: @NONE AVAILABLE
+ vmov.u8 q4, #128
+ vmov.u8 q5, #128
+
+str_pred:
+ vst1.8 {q4}, [r1], r3
+ vst1.8 {q4}, [r1], r3
+ vst1.8 {q4}, [r1], r3
+ vst1.8 {q4}, [r1], r3
+ vst1.8 {q5}, [r1], r3
+ vst1.8 {q5}, [r1], r3
+ vst1.8 {q5}, [r1], r3
+ vst1.8 {q5}, [r1], r3
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4, pc} @Restoring registers from stack
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_horz
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:Horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_chroma_8x8_mode_horz_a9q
+
+ih264_intra_pred_chroma_8x8_mode_horz_a9q:
+
+ stmfd sp!, {r14} @store register values to stack
+
+ vld1.u8 {q0}, [r0]
+ mov r2, #6
+
+ vdup.u16 q1, d1[3]
+ vdup.u16 q2, d1[2]
+ vst1.8 {q1}, [r1], r3
+
+loop_8x8_horz:
+ vext.8 q0, q0, q0, #12
+ vst1.8 {q2}, [r1], r3
+ vdup.u16 q1, d1[3]
+ subs r2, #2
+ vdup.u16 q2, d1[2]
+ vst1.8 {q1}, [r1], r3
+ bne loop_8x8_horz
+
+ vext.8 q0, q0, q0, #12
+ vst1.8 {q2}, [r1], r3
+
+ ldmfd sp!, {pc} @restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_vert
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:vertical
+@*
+@* @par Description:
+@* Perform Intra prediction for chroma_8x8 mode:vertical, described in sec 8.3.4.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_chroma_8x8_mode_vert_a9q
+
+ih264_intra_pred_chroma_8x8_mode_vert_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #18
+ vld1.8 {q0}, [r0]
+
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_plane
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:PLANE
+@*
+@* @par Description:
+@* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
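+@ Plane mode arithmetic implemented below (a sketch derived from the code; names illustrative):
+@   H = sum_{i=1..4} i * (top[3+i] - top[3-i]) per U/V component, V likewise from the left column
+@   b = (34*H + 32) >> 6,  c = (34*V + 32) >> 6              @ vmull by #34, vrshrn #6
+@   a = 16 * (left[7] + top[7]) per component
+@   pred[x,y] = CLIP_U8((a + b*(x-3) + c*(y-3) + 16) >> 5)   @ vqrshrun #5
+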
+ .global ih264_intra_pred_chroma_8x8_mode_plane_a9q
+ih264_intra_pred_chroma_8x8_mode_plane_a9q:
+
+ stmfd sp!, {r4-r10, r12, lr}
+ vpush {d8-d15}
+
+
+ vld1.32 d0, [r0]
+ add r10, r0, #10
+ vld1.32 d1, [r10]
+ add r10, r10, #6
+ vrev64.16 d5, d0
+ vld1.32 d2, [r10]!
+ add r10, r10, #2
+ vrev64.16 d7, d2
+ vld1.32 d3, [r10]
+ sub r5, r3, #8
+ ldr r12, scratch_chroma_intrapred_addr1
+scrlblc1:
+ add r12, r12, pc
+ vsubl.u8 q5, d5, d1
+ vld1.64 {q4}, [r12] @ Load plane multiplication factors into Q4
+ vsubl.u8 q6, d3, d7
+ vmul.s16 q7, q5, q4
+ vmul.s16 q8, q6, q4
+ vuzp.16 q7, q8
+
+ vpadd.s16 d14, d14
+ vpadd.s16 d15, d15
+ vpadd.s16 d16, d16
+ vpadd.s16 d17, d17
+ vpadd.s16 d14, d14
+ vpadd.s16 d15, d15
+ vpadd.s16 d16, d16
+ vpadd.s16 d17, d17
+
+ mov r6, #34
+ vdup.16 q9, r6
+
+ vmull.s16 q11, d14, d18
+ vmull.s16 q12, d15, d18
+ vmull.s16 q13, d16, d18
+ vmull.s16 q14, d17, d18
+
+ vrshrn.s32 d10, q11, #6
+ vrshrn.s32 d12, q12, #6
+ vrshrn.s32 d13, q13, #6
+ vrshrn.s32 d14, q14, #6
+
+
+ ldrb r6, [r0], #1
+ add r10, r0, #31
+ ldrb r8, [r0], #1
+ ldrb r7, [r10], #1
+ ldrb r9, [r10], #1
+
+ add r6, r6, r7
+ add r8, r8, r9
+ lsl r6, r6, #4
+ lsl r8, r8, #4
+
+ vdup.16 q0, r6
+ vdup.16 q1, r8
+ vdup.16 q2, d12[0]
+ vdup.16 q3, d10[0]
+
+ vdup.16 q12, d14[0]
+ vdup.16 q13, d13[0]
+ vzip.16 q2, q12
+ vzip.16 q3, q13
+ vzip.16 q0, q1
+
+ ldr r12, scratch_intrapred_chroma_plane_addr1
+scrlblc2:
+ add r12, r12, pc
+ vld1.64 {q4}, [r12]
+ vmov.16 q5, q4
+ vmov q11, q4
+ vzip.16 q4, q5
+
+ vmul.s16 q6, q2, q4
+ vmul.s16 q8, q2, q5
+ vadd.s16 q6, q0, q6
+ vadd.s16 q8, q0, q8
+
+
+ vdup.16 q10, d22[0]
+ vmul.s16 q2, q3, q10
+ vdup.16 q15, d22[1]
+ vmul.s16 q9, q3, q10
+ vmul.s16 q7, q3, q15
+ vmul.s16 q4, q3, q15
+ vadd.s16 q12, q6, q2
+ vadd.s16 q0, q8, q9
+ vadd.s16 q1, q6, q7
+ vqrshrun.s16 d28, q12, #5
+ vadd.s16 q13, q8, q4
+ vqrshrun.s16 d29, q0, #5
+ vdup.16 q10, d22[2]
+ vst1.8 {q14}, [r1], r3
+ vqrshrun.s16 d28, q1, #5
+ vqrshrun.s16 d29, q13, #5
+ vmul.s16 q2, q3, q10
+ vmul.s16 q9, q3, q10
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q12, q6, q2
+ vadd.s16 q0, q8, q9
+ vdup.16 q15, d22[3]
+ vqrshrun.s16 d28, q12, #5
+ vqrshrun.s16 d29, q0, #5
+ vmul.s16 q7, q3, q15
+ vmul.s16 q4, q3, q15
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q1, q6, q7
+ vadd.s16 q13, q8, q4
+ vdup.16 q10, d23[0]
+ vqrshrun.s16 d28, q1, #5
+ vqrshrun.s16 d29, q13, #5
+ vmul.s16 q2, q3, q10
+ vmul.s16 q9, q3, q10
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q12, q6, q2
+ vadd.s16 q0, q8, q9
+ vdup.16 q15, d23[1]
+ vqrshrun.s16 d28, q12, #5
+ vqrshrun.s16 d29, q0, #5
+ vmul.s16 q7, q3, q15
+ vmul.s16 q4, q3, q15
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q1, q6, q7
+ vadd.s16 q13, q8, q4
+ vdup.16 q10, d23[2]
+ vqrshrun.s16 d28, q1, #5
+ vqrshrun.s16 d29, q13, #5
+ vmul.s16 q2, q3, q10
+ vmul.s16 q9, q3, q10
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q12, q6, q2
+ vadd.s16 q0, q8, q9
+ vdup.16 q15, d23[3]
+ vqrshrun.s16 d28, q12, #5
+ vqrshrun.s16 d29, q0, #5
+ vmul.s16 q7, q3, q15
+ vmul.s16 q4, q3, q15
+ vst1.8 {q14}, [r1], r3
+ vadd.s16 q1, q6, q7
+ vadd.s16 q13, q8, q4
+ vqrshrun.s16 d28, q1, #5
+ vqrshrun.s16 d29, q13, #5
+ vst1.8 {q14}, [r1], r3
+
+
+
+end_func_plane:
+
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r10, r12, pc}
+
+
+
+
diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
new file mode 100755
index 0000000..e38e203
--- /dev/null
+++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
@@ -0,0 +1,520 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_luma_16x16_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra 16x16 Luma prediction.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_intra_pred_luma_16x16_mode_vert_a9q()
+@* - ih264_intra_pred_luma_16x16_mode_horz_a9q()
+@* - ih264_intra_pred_luma_16x16_mode_dc_a9q()
+@* - ih264_intra_pred_luma_16x16_mode_plane_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+
+.text
+.p2align 2
+
+
+ .extern ih264_gai1_intrapred_luma_plane_coeffs
+.hidden ih264_gai1_intrapred_luma_plane_coeffs
+scratch_intrapred_addr1:
+ .long ih264_gai1_intrapred_luma_plane_coeffs - scrlbl1 - 8
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_vert_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:vertical
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:Vertical ,described in sec 8.3.3.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_16x16_mode_vert_a9q
+
+ih264_intra_pred_luma_16x16_mode_vert_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #17
+ vld1.8 {q0}, [r0]
+
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_horz_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:horizontal ,described in sec 8.3.3.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_16x16_mode_horz_a9q
+
+ih264_intra_pred_luma_16x16_mode_horz_a9q:
+
+ stmfd sp!, {r14} @store register values to stack
+
+ vld1.u8 {q0}, [r0]
+ mov r2, #14
+
+ vdup.u8 q1, d1[7]
+ vdup.u8 q2, d1[6]
+ vst1.8 {q1}, [r1], r3
+
+loop_16x16_horz:
+ vext.8 q0, q0, q0, #14
+ vst1.8 {q2}, [r1], r3
+ vdup.u8 q1, d1[7]
+ subs r2, #2
+ vdup.u8 q2, d1[6]
+ vst1.8 {q1}, [r1], r3
+ bne loop_16x16_horz
+
+ vext.8 q0, q0, q0, #14
+ vst1.8 {q2}, [r1], r3
+
+ ldmfd sp!, {pc} @Restoring registers from stack
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_dc_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:DC ,described in sec 8.3.3.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
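+@ DC value computed below (a sketch derived from the code):
+@   left and top available  : dc = (sum_left16 + sum_top16 + 16) >> 5   @ vqrshrun #5
+@   only one side available : dc = (sum16 + 8) >> 4                     @ vqrshrun #4
+@   none available          : dc = 128
+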
+ .global ih264_intra_pred_luma_16x16_mode_dc_a9q
+
+ih264_intra_pred_luma_16x16_mode_dc_a9q:
+
+ stmfd sp!, {r4, r14} @store register values to stack
+ ldr r4, [sp, #8] @r4 => ui_neighboravailability
+
+ ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE
+ beq top_available
+ ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+ beq left_available
+
+ vld1.u8 {q0}, [r0] @BOTH LEFT AND TOP AVAILABLE
+ add r0, r0, #17
+ vpaddl.u8 q0, q0
+ vld1.u8 {q1}, [r0]
+ vpaddl.u8 q1, q1
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #5
+ vdup.u8 q0, d0[0]
+ b str_pred
+
+top_available: @ONLY TOP AVAILABLE
+ ands r2, r4, #0x04 @CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+ beq none_available
+
+ add r0, r0, #17
+ vld1.u8 {q0}, [r0]
+ vpaddl.u8 q0, q0
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #4
+ vdup.u8 q0, d0[0]
+ b str_pred
+
+left_available: @ONLY LEFT AVAILABLE
+ vld1.u8 {q0}, [r0]
+ vpaddl.u8 q0, q0
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #4
+ vdup.u8 q0, d0[0]
+ b str_pred
+
+none_available: @NONE AVAILABLE
+ vmov.u8 q0, #128
+
+str_pred:
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+ vst1.8 {q0}, [r1], r3
+
+ ldmfd sp!, {r4, pc} @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_plane_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:PLANE
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:PLANE ,described in sec 8.3.3.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
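+@ Plane mode arithmetic implemented below (a sketch derived from the code; names illustrative):
+@   H = sum_{i=1..8} i * (top[7+i] - top[7-i]), V likewise from the left column
+@   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6,  a = 16 * (left[15] + top[15])
+@   pred[x,y] = CLIP_U8((a + b*(x-7) + c*(y-7) + 16) >> 5)   @ vqrshrun #5
+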
+ .global ih264_intra_pred_luma_16x16_mode_plane_a9q
+ih264_intra_pred_luma_16x16_mode_plane_a9q:
+
+ stmfd sp!, {r4-r10, r12, lr}
+
+ mov r2, r1
+ add r1, r0, #17
+ add r0, r0, #15
+
+ mov r8, #9
+ sub r1, r1, #1
+ mov r10, r1 @top_left
+ mov r4, #-1
+ vld1.32 d2, [r1], r8
+ ldr r7, scratch_intrapred_addr1
+scrlbl1:
+ add r7, r7, pc
+
+ vld1.32 d0, [r1]
+ vrev64.8 d2, d2
+ vld1.32 {q3}, [r7]
+ vsubl.u8 q0, d0, d2
+ vmovl.u8 q8, d6
+ vmul.s16 q0, q0, q8
+ vmovl.u8 q9, d7
+
+ add r7, r0, r4, lsl #3
+ sub r0, r7, r4, lsl #1
+ rsb lr, r4, #0x0
+
+ vpadd.s16 d0, d0, d1
+
+ ldrb r8, [r7], r4
+ ldrb r9, [r0], lr
+
+ vpaddl.s16 d0, d0
+ sub r12, r8, r9
+
+ ldrb r8, [r7], r4
+
+ vpaddl.s32 d0, d0
+ ldrb r9, [r0], lr
+ sub r8, r8, r9
+ vshl.s32 d2, d0, #2
+ add r12, r12, r8, lsl #1
+
+ vadd.s32 d0, d0, d2
+ ldrb r8, [r7], r4
+ ldrb r9, [r0], lr
+ vrshr.s32 d0, d0, #6 @ i_b = D0[0]
+ sub r8, r8, r9
+ ldrb r5, [r7], r4
+ add r8, r8, r8, lsl #1
+
+ vdup.16 q2, d0[0]
+ add r12, r12, r8
+ ldrb r9, [r0], lr
+ vmul.s16 q0, q2, q8
+ sub r5, r5, r9
+ vmul.s16 q1, q2, q9
+ add r12, r12, r5, lsl #2
+
+ ldrb r8, [r7], r4
+ ldrb r9, [r0], lr
+ sub r8, r8, r9
+ ldrb r5, [r7], r4
+ add r8, r8, r8, lsl #2
+ ldrb r6, [r0], lr
+ add r12, r12, r8
+ ldrb r8, [r7], r4
+ ldrb r9, [r0], lr
+
+ sub r5, r5, r6
+ sub r8, r8, r9
+ add r5, r5, r5, lsl #1
+ rsb r8, r8, r8, lsl #3
+ add r12, r12, r5, lsl #1
+ ldrb r5, [r7], r4
+ ldrb r6, [r10] @top_left
+ add r12, r12, r8
+ sub r9, r5, r6
+ ldrb r6, [r1, #7]
+ add r12, r12, r9, lsl #3 @ i_c = r12
+ add r8, r5, r6
+
+ add r12, r12, r12, lsl #2
+ lsl r8, r8, #4 @ i_a = r8
+
+ add r12, r12, #0x20
+ lsr r12, r12, #6
+
+ vshl.s16 q14, q2, #3
+ vdup.16 q3, r12
+
+ vdup.16 q15, r8
+ vshl.s16 q13, q3, #3
+ vsub.s16 q15, q15, q14
+ vsub.s16 q15, q15, q13
+ vadd.s16 q14, q15, q3
+
+ mov r0, #14
+ vadd.s16 q13, q14, q0
+ vadd.s16 q14, q14, q1
+ vqrshrun.s16 d20, q13, #5
+ vqrshrun.s16 d21, q14, #5
+
+loop_16x16_plane:
+
+ vadd.s16 q13, q13, q3
+ vadd.s16 q14, q14, q3
+ vqrshrun.s16 d22, q13, #5
+ vst1.32 {q10}, [r2], r3
+ vqrshrun.s16 d23, q14, #5
+
+ vadd.s16 q13, q13, q3
+ subs r0, #2
+ vadd.s16 q14, q14, q3
+ vqrshrun.s16 d20, q13, #5
+ vst1.32 {q11}, [r2], r3
+ vqrshrun.s16 d21, q14, #5
+ bne loop_16x16_plane
+
+ vadd.s16 q13, q13, q3
+ vadd.s16 q14, q14, q3
+ vqrshrun.s16 d22, q13, #5
+ vst1.32 {q10}, [r2], r3
+ vqrshrun.s16 d23, q14, #5
+ vst1.32 {q11}, [r2], r3
+
+ ldmfd sp!, {r4-r10, r12, pc}
+
+
+
diff --git a/common/arm/ih264_intra_pred_luma_4x4_a9q.s b/common/arm/ih264_intra_pred_luma_4x4_a9q.s
new file mode 100755
index 0000000..cb386ea
--- /dev/null
+++ b/common/arm/ih264_intra_pred_luma_4x4_a9q.s
@@ -0,0 +1,842 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_luma_4x4_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra 4x4 Luma prediction.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* -ih264_intra_pred_luma_4x4_mode_vert_a9q
+@* -ih264_intra_pred_luma_4x4_mode_horz_a9q
+@* -ih264_intra_pred_luma_4x4_mode_dc_a9q
+@* -ih264_intra_pred_luma_4x4_mode_diag_dl_a9q
+@* -ih264_intra_pred_luma_4x4_mode_diag_dr_a9q
+@* -ih264_intra_pred_luma_4x4_mode_vert_r_a9q
+@* -ih264_intra_pred_luma_4x4_mode_horz_d_a9q
+@* -ih264_intra_pred_luma_4x4_mode_vert_l_a9q
+@* -ih264_intra_pred_luma_4x4_mode_horz_u_a9q
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+
+.text
+.p2align 2
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_vert
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:vertical
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_luma_4x4_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_4x4_mode_vert_a9q
+
+ih264_intra_pred_luma_4x4_mode_vert_a9q:
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #5
+
+ vld1.32 d0[0], [r0]
+
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+
+
+
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_horz
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+
+ .global ih264_intra_pred_luma_4x4_mode_horz_a9q
+
+ih264_intra_pred_luma_4x4_mode_horz_a9q:
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ add r0, r0, #3
+ mov r2 , #-1
+
+ ldrb r5, [r0], r2
+ vdup.u8 d0, r5
+ ldrb r6, [r0], r2
+ vst1.32 d0[0], [r1], r3
+ vdup.u8 d1, r6
+ ldrb r7, [r0], r2
+ vst1.32 d1[0], [r1], r3
+ vdup.u8 d2, r7
+ ldrb r8, [r0], r2
+ vst1.32 d2[0], [r1], r3
+ vdup.u8 d3, r8
+ vst1.32 d3[0], [r1], r3
+
+
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_dc
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+
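+@ DC value computed below (a sketch derived from the code):
+@   left and top available  : dc = (sum_left4 + sum_top4 + 4) >> 3
+@   only one side available : dc = (sum4 + 2) >> 2
+@   none available          : dc = 128
+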
+ .global ih264_intra_pred_luma_4x4_mode_dc_a9q
+
+ih264_intra_pred_luma_4x4_mode_dc_a9q:
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ ldr r4, [sp, #40] @ r4 => ui_neighboravailability
+
+ ands r5, r4, #0x01
+ beq top_available @LEFT NOT AVAILABLE
+
+ add r10, r0, #3
+ mov r2, #-1
+ ldrb r5, [r10], r2
+ ldrb r6, [r10], r2
+ ldrb r7, [r10], r2
+ add r5, r5, r6
+ ldrb r8, [r10], r2
+ add r5, r5, r7
+ ands r11, r4, #0x04 @ CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+ add r5, r5, r8
+ beq left_available
+ add r10, r0, #5
+ @ BOTH LEFT AND TOP AVAILABLE
+ ldrb r6, [r10], #1
+ ldrb r7, [r10], #1
+ add r5, r5, r6
+ ldrb r8, [r10], #1
+ add r5, r5, r7
+ ldrb r9, [r10], #1
+ add r5, r5, r8
+ add r5, r5, r9
+ add r5, r5, #4
+ lsr r5, r5, #3
+ vdup.u8 d0, r5
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ b end_func
+
+top_available: @ ONLY TOP AVAILABLE
+ ands r11, r4, #0x04 @ CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+ beq none_available
+
+ add r10, r0, #5
+ ldrb r6, [r10], #1
+ ldrb r7, [r10], #1
+ ldrb r8, [r10], #1
+ add r5, r6, r7
+ ldrb r9, [r10], #1
+ add r5, r5, r8
+ add r5, r5, r9
+ add r5, r5, #2
+ lsr r5, r5, #2
+ vdup.u8 d0, r5
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ b end_func
+
+left_available: @ONLY LEFT AVAILABLE
+ add r5, r5, #2
+ lsr r5, r5, #2
+ vdup.u8 d0, r5
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ b end_func
+
+none_available: @NONE AVAILABLE
+ mov r5, #128
+ vdup.u8 d0, r5
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ vst1.32 d0[0], [r1], r3
+ b end_func
+
+
+end_func:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_diag_dl
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
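+@ Core filter used below (a sketch derived from the code); the remaining 4x4 angular
+@ modes reuse the same 3-tap smoothing:
+@   pred = (p[i] + 2*p[i+1] + p[i+2] + 2) >> 2   @ vaddl pairs + vqrshrun #2
+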
+ .global ih264_intra_pred_luma_4x4_mode_diag_dl_a9q
+
+ih264_intra_pred_luma_4x4_mode_diag_dl_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #5
+ sub r5, r3, #2
+ add r6, r0, #7
+ vld1.8 {d0}, [r0]
+ vext.8 d1, d0, d0, #1
+ vext.8 d2, d0, d0, #2
+ vld1.8 {d2[6]}, [r6]
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d3, q12, #2
+ vst1.32 {d3[0]}, [r1], r3
+ vext.8 d4, d3, d3, #1
+ vst1.32 {d4[0]}, [r1], r3
+ vst1.16 {d3[1]}, [r1]!
+ vst1.16 {d3[2]}, [r1], r5
+ vst1.16 {d4[1]}, [r1]!
+ vst1.16 {d4[2]}, [r1]
+
+end_func_diag_dl:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_diag_dr
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_diag_dr_a9q
+
+ih264_intra_pred_luma_4x4_mode_diag_dr_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+
+ vld1.u8 {d0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {d1}, [r0]
+ vext.8 d2, d1, d1, #1
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d3, q12, #2
+
+ vext.8 d4, d3, d3, #1
+ sub r5, r3, #2
+ vst1.16 {d4[1]}, [r1]!
+ vst1.16 {d4[2]}, [r1], r5
+ vst1.16 {d3[1]}, [r1]!
+ vst1.16 {d3[2]}, [r1], r5
+ vst1.32 {d4[0]}, [r1], r3
+ vst1.32 {d3[0]}, [r1], r3
+
+end_func_diag_dr:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_vert_r
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
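+@ Vertical-Right builds two filtered sets of the reference samples (a sketch derived from the code):
+@   2-tap: (a + b + 1) >> 1         @ vqrshrun #1
+@   3-tap: (a + 2*b + c + 2) >> 2   @ vqrshrun #2
+@ Output rows alternate between the two sets, each pair of rows shifted by one sample.
+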
+ .global ih264_intra_pred_luma_4x4_mode_vert_r_a9q
+
+ih264_intra_pred_luma_4x4_mode_vert_r_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+
+ vld1.u8 {d0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {d1}, [r0]
+ vext.8 d2, d1, d1, #1
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d3, q12, #2
+ sub r5, r3, #2
+ vext.8 d5, d3, d3, #3
+ vst1.32 {d4[1]}, [r1], r3
+ vst1.32 {d5[0]}, [r1], r3
+ sub r8, r3, #3
+ vst1.u8 {d3[2]}, [r1]!
+ vst1.16 {d4[2]}, [r1]!
+ vst1.u8 {d4[6]}, [r1], r8
+ vst1.u8 {d3[1]}, [r1]!
+ vst1.16 {d5[0]}, [r1]!
+ vst1.u8 {d5[2]}, [r1]
+
+
+end_func_vert_r:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_horz_d
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Down
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_horz_d_a9q
+
+ih264_intra_pred_luma_4x4_mode_horz_d_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ vld1.u8 {d0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {d1}, [r0]
+ vext.8 d2, d1, d0, #1
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q12, #2
+ sub r5, r3, #2
+ vmov.8 d6, d5
+ vtrn.8 d4, d5 @
+ vst1.u16 {d5[1]}, [r1]!
+ vst1.16 {d6[2]}, [r1], r5
+ vst1.u16 {d4[1]}, [r1]!
+ vst1.16 {d5[1]}, [r1], r5
+ vst1.u16 {d5[0]}, [r1]!
+ vst1.16 {d4[1]}, [r1], r5
+ vst1.u16 {d4[0]}, [r1]!
+ vst1.16 {d5[0]}, [r1], r5
+
+end_func_horz_d:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_vert_l
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_vert_l_a9q
+
+ih264_intra_pred_luma_4x4_mode_vert_l_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ add r0, r0, #4
+ vld1.u8 {d0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {d1}, [r0]
+ vext.8 d2, d1, d0, #1
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q12, #2
+ vext.8 d6, d4, d4, #1
+ vext.8 d7, d5, d5, #1
+ vst1.32 {d6[0]}, [r1], r3
+ vext.8 d16, d4, d4, #2
+ vext.8 d17, d5, d5, #2
+ vst1.32 {d7[0]}, [r1], r3
+ vst1.32 {d16[0]}, [r1], r3
+ vst1.32 {d17[0]}, [r1], r3
+
+
+
+end_func_vert_l:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_horz_u
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Up
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
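+@ Horizontal-Up filters the left column with the same 2-tap/3-tap pair used above;
+@ positions past the last reference sample are padded with the bottom-left value
+@ (a sketch derived from the code; see the vdup of r9 below).
+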
+ .global ih264_intra_pred_luma_4x4_mode_horz_u_a9q
+
+ih264_intra_pred_luma_4x4_mode_horz_u_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ mov r10, r0
+ vld1.u8 {d0}, [r0]
+ ldrb r9, [r0], #1
+ vext.8 d1, d0, d0, #1
+ vld1.u8 {d0[7]}, [r10]
+ vext.8 d2, d1, d1, #1
+ vaddl.u8 q10, d0, d1
+ vaddl.u8 q11, d1, d2
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q12, #2
+ vmov d6, d4
+ vext.8 d6, d5, d4, #1
+ vst1.8 {d4[2]}, [r1]!
+ vst1.8 {d6[0]}, [r1]!
+ vtrn.8 d6, d5 @
+ sub r5, r3, #2
+ vtrn.8 d4, d6 @
+ vdup.8 d7, r9
+ vst1.16 {d6[0]}, [r1], r5
+ vst1.16 {d6[0]}, [r1]!
+ vst1.16 {d5[3]}, [r1], r5
+ vst1.16 {d5[3]}, [r1]!
+ vst1.16 {d7[3]}, [r1], r5
+ vst1.32 {d7[0]}, [r1], r3
+
+end_func_horz_u:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_intra_pred_luma_8x8_a9q.s b/common/arm/ih264_intra_pred_luma_8x8_a9q.s
new file mode 100755
index 0000000..6da1c95
--- /dev/null
+++ b/common/arm/ih264_intra_pred_luma_8x8_a9q.s
@@ -0,0 +1,1037 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_luma_8x8_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra 8x8 Luma prediction.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* -ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q
+@* -ih264_intra_pred_luma_8x8_mode_vert_a9q
+@* -ih264_intra_pred_luma_8x8_mode_horz_a9q
+@* -ih264_intra_pred_luma_8x8_mode_dc_a9q
+@* -ih264_intra_pred_luma_8x8_mode_diag_dl_a9q
+@* -ih264_intra_pred_luma_8x8_mode_diag_dr_a9q
+@* -ih264_intra_pred_luma_8x8_mode_vert_r_a9q
+@* -ih264_intra_pred_luma_8x8_mode_horz_d_a9q
+@* -ih264_intra_pred_luma_8x8_mode_vert_l_a9q
+@* -ih264_intra_pred_luma_8x8_mode_horz_u_a9q
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+
+
+.text
+.p2align 2
+
+ .extern ih264_gai1_intrapred_luma_8x8_horz_u
+.hidden ih264_gai1_intrapred_luma_8x8_horz_u
+scratch_intrapred_addr_8x8:
+ .long ih264_gai1_intrapred_luma_8x8_horz_u - scrlb8x8l2 - 8
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_ref_filtering
+@*
+@* @brief
+@* Reference sample filtering process for Intra_8x8 sample prediction
+@*
+@* @par Description:
+@* Perform Reference sample filtering process for Intra_8x8 sample prediction ,described in sec 8.3.2.2.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride [Not used]
+@*
+@* @param[in] dst_strd
+@* integer destination stride[Not used]
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels[Not used]
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_ref_filtering(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+
+
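+@ Reference smoothing applied below (a sketch derived from the code):
+@   interior samples   : p'[i]   = (p[i-1] + 2*p[i] + p[i+1] + 2) >> 2
+@   first/last samples : p'[end] = (3*p[end] + p[neighbour] + 2) >> 2
+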
+ .global ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q
+
+ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vpush {d8-d15}
+
+
+ vld1.u8 {q0}, [r0]! @
+ vld1.u8 {q1}, [r0]
+ add r0, r0, #8 @
+ vext.8 q2, q0, q1, #1
+ vext.8 q3, q1, q1, #1
+ vext.8 q4, q2, q3, #1
+ vext.8 q5, q3, q3, #1
+ vld1.8 {d10[7]}, [r0] @ LOADING SRC[24] AGAIN TO THE END FOR p'[ 15, -1 ] = ( p[ 14, -1 ] + 3 * p[ 15, -1 ] + 2 ) >> 2
+ vaddl.u8 q10, d0, d4
+ vaddl.u8 q7, d0, d0 @ SPECIAL CASE FOR p'[ -1 ,7 ] = ( p[ -1, 6 ] + 3 * p[ -1, 7 ] + 2 ) >> 2
+ vadd.u16 q7, q10, q7
+ vaddl.u8 q11, d1, d5
+ vqrshrun.s16 d14, q7, #2
+ vaddl.u8 q12, d4, d8
+ vaddl.u8 q13, d5, d9
+ vst1.8 {d14[0]}, [r1]!
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+ vaddl.u8 q9, d2, d6
+ vaddl.u8 q8, d6, d10
+ vqrshrun.s16 d4, q12, #2
+ vqrshrun.s16 d5, q13, #2
+ vadd.u16 q6, q8, q9
+ vst1.8 {q2}, [r1]!
+ vqrshrun.s16 d6, q6, #2
+ vst1.8 {d6}, [r1]
+
+
+end_func_ref_filt:
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_vert
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:vertical
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:vertical, as described in sec 8.3.2.2.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_luma_8x8_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
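+@
+@ A minimal C sketch of this mode (assuming pu1_src + 9 points at the 8 top
+@ neighbours, as the "add r0, r0, #9" below suggests); the name is hypothetical:
+@
+@     static void vert_8x8_sketch(const UWORD8 *pu1_src, UWORD8 *pu1_dst, WORD32 dst_strd)
+@     {
+@         WORD32 r, c;
+@         for(r = 0; r < 8; r++)
+@             for(c = 0; c < 8; c++)
+@                 pu1_dst[r * dst_strd + c] = pu1_src[9 + c];
+@     }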
+
+
+ .global ih264_intra_pred_luma_8x8_mode_vert_a9q
+
+ih264_intra_pred_luma_8x8_mode_vert_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #9
+ vld1.8 d0, [r0]
+
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+ vst1.8 d0, [r1], r3
+
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_horz
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:horizontal, as described in sec 8.3.2.2.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
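+@
+@ A minimal C sketch of this mode (assuming pu1_src[0..7] holds the left column
+@ bottom to top, which is what the d0[7], d0[6], ... duplication below suggests);
+@ the name is hypothetical:
+@
+@     static void horz_8x8_sketch(const UWORD8 *pu1_src, UWORD8 *pu1_dst, WORD32 dst_strd)
+@     {
+@         WORD32 r, c;
+@         for(r = 0; r < 8; r++)
+@             for(c = 0; c < 8; c++)
+@                 pu1_dst[r * dst_strd + c] = pu1_src[7 - r];
+@     }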
+
+
+ .global ih264_intra_pred_luma_8x8_mode_horz_a9q
+
+ih264_intra_pred_luma_8x8_mode_horz_a9q:
+
+ stmfd sp!, {r14} @store register values to stack
+
+ vld1.u8 {d0}, [r0]
+ mov r2, #6
+
+ vdup.u8 d1, d0[7]
+ vdup.u8 d2, d0[6]
+ vst1.8 {d1}, [r1], r3
+
+loop_8x8_horz:
+ vext.8 d0, d0, d0, #6
+ vst1.8 {d2}, [r1], r3
+ vdup.u8 d1, d0[7]
+ subs r2, #2
+ vdup.u8 d2, d0[6]
+ vst1.8 {d1}, [r1], r3
+ bne loop_8x8_horz
+
+ vext.8 d0, d0, d0, #6
+ vst1.8 {d2}, [r1], r3
+
+ ldmfd sp!, {pc} @restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_dc
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:DC, as described in sec 8.3.2.2.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
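+@
+@ A minimal C sketch of the DC rule implemented below; the bit tests in the code
+@ suggest bit 0 of ui_neighboravailability means "left available" and bit 2 means
+@ "top available" (an assumption), with left samples at pu1_src[0..7] and top
+@ samples at pu1_src[9..16]; left_avail/top_avail stand for those two bits:
+@
+@     WORD32 i, sum = 0, dc = 128;                 /* 128 when nothing is available */
+@     if(left_avail && top_avail)
+@     {
+@         for(i = 0; i < 8; i++)
+@             sum += pu1_src[i] + pu1_src[9 + i];
+@         dc = (sum + 8) >> 4;
+@     }
+@     else if(left_avail || top_avail)
+@     {
+@         for(i = 0; i < 8; i++)
+@             sum += left_avail ? pu1_src[i] : pu1_src[9 + i];
+@         dc = (sum + 4) >> 3;
+@     }
+@     /* every pixel of the 8x8 block is then set to dc */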
+
+
+ .global ih264_intra_pred_luma_8x8_mode_dc_a9q
+
+ih264_intra_pred_luma_8x8_mode_dc_a9q:
+
+ stmfd sp!, {r4, r14} @store register values to stack
+ ldr r4, [sp, #8] @r4 => ui_neighboravailability
+
+ ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE
+ beq top_available
+ ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+ beq left_available
+
+ vld1.u8 {d0}, [r0] @BOTH LEFT AND TOP AVAILABLE
+ add r0, r0, #9
+ vld1.u8 {d1}, [r0]
+ vpaddl.u8 q0, q0
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #4
+ vdup.u8 d0, d0[0]
+ b str_pred
+
+top_available: @ONLY TOP AVAILABLE
+    ands        r2, r4, #0x04               @CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+ beq none_available
+
+ add r0, r0, #9
+ vld1.u8 {d0}, [r0]
+ vpaddl.u8 d0, d0
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #3
+ vdup.u8 d0, d0[0]
+ b str_pred
+
+left_available: @ONLY LEFT AVAILABLE
+ vld1.u8 {d0}, [r0]
+ vpaddl.u8 d0, d0
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vqrshrun.s16 d0, q0, #3
+ vdup.u8 d0, d0[0]
+ b str_pred
+
+none_available: @NONE AVAILABLE
+ vmov.u8 q0, #128
+
+str_pred:
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+ vst1.8 {d0}, [r1], r3
+
+ ldmfd sp!, {r4, pc} @Restoring registers from stack
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_diag_dl
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left, as described in sec 8.3.2.2.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
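+@
+@ In the comments below, FILT121(a, b, c) = (a + 2*b + c + 2) >> 2 (the vqrshrun
+@ by #2 of the a+2b+c sums) and, in the later modes, FILT11(a, b) = (a + b + 1) >> 1.
+@ A minimal C sketch of this mode (diag_dl_sketch and t are hypothetical names;
+@ t = pu1_src + 9 is assumed to be the 16 top/top-right samples, with the last
+@ sample repeated for the edge case, matching the d5[6] reload below):
+@
+@     static void diag_dl_sketch(const UWORD8 *pu1_src, UWORD8 *pu1_dst, WORD32 dst_strd)
+@     {
+@         const UWORD8 *t = pu1_src + 9;
+@         UWORD8 filt121[16];
+@         WORD32 i, r, c;
+@         for(i = 0; i < 16; i++)
+@         {
+@             WORD32 p1 = (i + 1 < 16) ? t[i + 1] : t[15];
+@             WORD32 p2 = (i + 2 < 16) ? t[i + 2] : t[15];
+@             filt121[i] = (t[i] + 2 * p1 + p2 + 2) >> 2;
+@         }
+@         for(r = 0; r < 8; r++)
+@             for(c = 0; c < 8; c++)
+@                 pu1_dst[r * dst_strd + c] = filt121[r + c];
+@     }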
+
+ .global ih264_intra_pred_luma_8x8_mode_diag_dl_a9q
+
+ih264_intra_pred_luma_8x8_mode_diag_dl_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ add r0, r0, #9
+ sub r5, r3, #4
+ add r6, r0, #15
+ vld1.8 {q0}, [r0]
+ vext.8 q2, q0, q0, #2
+ vext.8 q1, q0, q0, #1
+ vld1.8 {d5[6]}, [r6]
+ @ q1 = q0 shifted to left once
+ @ q2 = q1 shifted to left once
+ vaddl.u8 q10, d0, d2 @Adding for FILT121
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+
+ vqrshrun.s16 d4, q12, #2
+ vqrshrun.s16 d5, q13, #2
+ @Q2 has all FILT121 values
+ vst1.8 {d4}, [r1], r3
+ vext.8 q9, q2, q2, #1
+ vext.8 q8, q9, q9, #1
+ vst1.8 {d18}, [r1], r3
+ vext.8 q15, q8, q8, #1
+ vst1.8 {d16}, [r1], r3
+ vst1.8 {d30}, [r1], r3
+ vst1.32 {d4[1]}, [r1]!
+ vst1.32 {d5[0]}, [r1], r5
+ vst1.32 {d18[1]}, [r1]!
+ vst1.32 {d19[0]}, [r1], r5
+ vst1.32 {d16[1]}, [r1]!
+ vst1.32 {d17[0]}, [r1], r5
+ vst1.32 {d30[1]}, [r1]!
+ vst1.32 {d31[0]}, [r1], r5
+
+
+end_func_diag_dl:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_diag_dr
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right, as described in sec 8.3.2.2.5
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
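+@
+@ A minimal C sketch: the same FILT121 smoothing runs across the combined
+@ left + top-left + top reference, and each output row then reads it through a
+@ window shifted one sample toward the top-left corner (which is what the
+@ vext #15 shifts below implement). The flat ref[] indexing is an assumption
+@ about the pu1_src layout (left, top-left, top in one array):
+@
+@     UWORD8 filt121[16];
+@     WORD32 i, r, c;
+@     for(i = 0; i < 16; i++)
+@         filt121[i] = (ref[i] + 2 * ref[i + 1] + ref[i + 2] + 2) >> 2;
+@     for(r = 0; r < 8; r++)
+@         for(c = 0; c < 8; c++)
+@             pu1_dst[r * dst_strd + c] = filt121[7 - r + c];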
+
+
+ .global ih264_intra_pred_luma_8x8_mode_diag_dr_a9q
+
+ih264_intra_pred_luma_8x8_mode_diag_dr_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+
+ vld1.u8 {q0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {q1}, [r0]
+ vext.8 q2, q1, q1, #1
+ @ q1 = q0 shifted to left once
+ @ q2 = q1 shifted to left once
+ vaddl.u8 q10, d0, d2 @Adding for FILT121
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+ vqrshrun.s16 d4, q12, #2
+ vqrshrun.s16 d5, q13, #2
+ @Q2 has all FILT121 values
+ sub r5, r3, #4
+ vext.8 q9, q2, q2, #15
+ vst1.8 {d19}, [r1], r3
+ vext.8 q8, q9, q9, #15
+ vst1.8 {d17}, [r1], r3
+ vext.8 q15, q8, q8, #15
+ vst1.8 {d31}, [r1], r3
+ vst1.32 {d4[1]}, [r1]!
+ vst1.32 {d5[0]}, [r1], r5
+ vst1.32 {d18[1]}, [r1]!
+ vst1.32 {d19[0]}, [r1], r5
+ vst1.32 {d16[1]}, [r1]!
+ vst1.32 {d17[0]}, [r1], r5
+ vst1.32 {d30[1]}, [r1]!
+ vst1.32 {d31[0]}, [r1], r5
+ vst1.8 {d4}, [r1], r3
+
+end_func_diag_dr:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_vert_r
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Right, as described in sec 8.3.2.2.6
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_8x8_mode_vert_r_a9q
+
+ih264_intra_pred_luma_8x8_mode_vert_r_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+ vld1.u8 {q0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {q1}, [r0]
+ vext.8 q2, q1, q1, #1
+ @ q1 = q0 shifted to left once
+ @ q2 = q1 shifted to left once
+ vaddl.u8 q10, d0, d2
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q11, #1
+ vqrshrun.s16 d6, q12, #2
+ vqrshrun.s16 d7, q13, #2
+ @Q2 has all FILT11 values
+ @Q3 has all FILT121 values
+ sub r5, r3, #6
+ sub r6, r3, #4
+ vst1.8 {d5}, [r1], r3 @ row 0
+ vext.8 q9, q3, q3, #15
+ vmov.8 q11, q9
+ vext.8 q8, q2, q2, #1
+ vst1.8 {d19}, [r1], r3 @row 1
+
+ vmov.8 q15, q8
+ vext.8 q10, q2, q2, #15
+ vuzp.8 q8, q9
+ @row 2
+ vext.8 q14, q8, q8, #1
+ vst1.8 {d21}, [r1]
+ vst1.8 {d6[6]}, [r1], r3
+ @row 3
+
+ vst1.16 {d29[1]}, [r1]!
+ vst1.32 {d7[0]}, [r1]!
+ vst1.16 {d7[2]}, [r1], r5
+@row 4
+ vst1.16 {d19[1]}, [r1]!
+ vst1.32 {d5[0]}, [r1]!
+ vst1.16 {d5[2]}, [r1], r5
+
+@row 5
+ vext.8 q13, q9, q9, #1
+ vst1.16 {d17[1]}, [r1]!
+ vst1.32 {d23[0]}, [r1]!
+ vst1.16 {d23[2]}, [r1], r5
+
+
+@row 6
+ vst1.16 {d27[0]}, [r1]!
+ vst1.8 {d27[2]}, [r1]!
+ vst1.8 {d5[0]}, [r1]!
+ vst1.32 {d31[0]}, [r1], r6
+@row 7
+ vst1.32 {d29[0]}, [r1]!
+ vst1.32 {d7[0]}, [r1]!
+
+
+
+end_func_vert_r:
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_horz_d
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Down
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Down, as described in sec 8.3.2.2.7
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_8x8_mode_horz_d_a9q
+
+ih264_intra_pred_luma_8x8_mode_horz_d_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vpush {d8-d15}
+
+ vld1.u8 {q0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {q1}, [r0]
+ vext.8 q2, q1, q1, #1
+ @ q1 = q0 shifted to left once
+ @ q2 = q1 shifted to left once
+ vaddl.u8 q10, d0, d2
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q11, #1
+ vqrshrun.s16 d6, q12, #2
+ vqrshrun.s16 d7, q13, #2
+ @Q2 has all FILT11 values
+ @Q3 has all FILT121 values
+ vmov.8 q4, q2
+ vmov.8 q5, q3
+ sub r6, r3, #6
+ vtrn.8 q4, q5 @
+ vmov.8 q6, q4
+ vmov.8 q7, q5
+ sub r5, r3, #4
+ vtrn.16 q6, q7
+ vext.8 q8, q3, q3, #14
+ @ROW 0
+ vst1.8 {d17}, [r1]
+ vst1.16 {d10[3]}, [r1], r3
+
+ @ROW 1
+ vst1.32 {d14[1]}, [r1]!
+ vst1.32 {d7[0]}, [r1], r5
+ @ROW 2
+ vst1.16 {d10[2]}, [r1]!
+ vst1.32 {d14[1]}, [r1]!
+ vst1.16 {d7[0]}, [r1], r6
+ @ROW 3
+ vst1.32 {d12[1]}, [r1]!
+ vst1.32 {d14[1]}, [r1], r5
+ @ROW 4
+ vst1.16 {d14[1]}, [r1]!
+ vst1.32 {d12[1]}, [r1]!
+ vst1.16 {d14[2]}, [r1], r6
+ @ROW 5
+ vst1.32 {d14[0]}, [r1]!
+ vst1.32 {d12[1]}, [r1], r5
+ @ROW 6
+ vst1.16 {d10[0]}, [r1]!
+ vst1.16 {d8[1]}, [r1]!
+ vst1.16 {d14[1]}, [r1]!
+ vst1.16 {d12[2]}, [r1], r6
+ @ROW 7
+ vst1.32 {d12[0]}, [r1]!
+ vst1.32 {d14[0]}, [r1], r5
+
+end_func_horz_d:
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_vert_l
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Left, as described in sec 8.3.2.2.8
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
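+@
+@ A minimal C sketch of the row pattern used below: even rows take the half-sample
+@ filter FILT11, odd rows the FILT121 filter, and every pair of rows reads the top
+@ reference one sample further along. t = pu1_src + 9 (top + top-right) is an
+@ assumption matching the "add r0, r0, #9" below:
+@
+@     WORD32 r, c, off;
+@     for(r = 0; r < 8; r++)
+@     {
+@         off = r >> 1;
+@         for(c = 0; c < 8; c++)
+@             pu1_dst[r * dst_strd + c] = (r & 1)
+@                 ? (t[off + c] + 2 * t[off + c + 1] + t[off + c + 2] + 2) >> 2
+@                 : (t[off + c] + t[off + c + 1] + 1) >> 1;
+@     }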
+
+
+ .global ih264_intra_pred_luma_8x8_mode_vert_l_a9q
+
+ih264_intra_pred_luma_8x8_mode_vert_l_a9q:
+
+    stmfd sp!, {r4-r12, r14}          @store register values to stack
+ vpush {d8-d15}
+ add r0, r0, #9
+ vld1.u8 {q0}, [r0]
+ add r0, r0, #1
+ vld1.u8 {q1}, [r0]
+ vext.8 q2, q1, q1, #1
+ vaddl.u8 q10, d0, d2
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q11, #1
+ vqrshrun.s16 d6, q12, #2
+ vext.8 q4, q2, q2, #1
+ vqrshrun.s16 d7, q13, #2
+ @Q2 has all FILT11 values
+ @Q3 has all FILT121 values
+
+ vext.8 q5, q3, q3, #1
+ @ROW 0,1
+ vst1.8 {d4}, [r1], r3
+ vst1.8 {d6}, [r1], r3
+
+ vext.8 q6, q4, q4, #1
+ vext.8 q7, q5, q5, #1
+ @ROW 2,3
+ vst1.8 {d8}, [r1], r3
+ vst1.8 {d10}, [r1], r3
+
+ vext.8 q8, q6, q6, #1
+ vext.8 q9, q7, q7, #1
+ @ROW 4,5
+ vst1.8 {d12}, [r1], r3
+ vst1.8 {d14}, [r1], r3
+ @ROW 6,7
+ vst1.8 {d16}, [r1], r3
+ vst1.8 {d18}, [r1], r3
+
+end_func_vert_l:
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_horz_u
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Up
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Up, as described in sec 8.3.2.2.9
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_8x8_mode_horz_u_a9q
+
+ih264_intra_pred_luma_8x8_mode_horz_u_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vpush {d8-d15}
+
+ vld1.u8 {q0}, [r0]
+ vld1.u8 {d1[7]}, [r0]
+ vext.8 q1, q0, q0, #1
+ vext.8 q2, q1, q1, #1
+ @ LOADING V TABLE
+ ldr r12, scratch_intrapred_addr_8x8
+scrlb8x8l2:
+ add r12, r12, pc
+ vaddl.u8 q10, d0, d2
+ vaddl.u8 q11, d1, d3
+ vaddl.u8 q12, d2, d4
+ vaddl.u8 q13, d3, d5
+ vadd.u16 q12, q10, q12
+ vadd.u16 q13, q11, q13
+ vld1.u8 {q5}, [r12]
+ vqrshrun.s16 d4, q10, #1
+ vqrshrun.s16 d5, q11, #1
+ vqrshrun.s16 d6, q12, #2
+ vqrshrun.s16 d7, q13, #2
+ @Q2 has all FILT11 values
+ @Q3 has all FILT121 values
+ vtbl.u8 d12, {q2, q3}, d10
+ vdup.u8 q7, d5[7] @
+ vtbl.u8 d13, {q2, q3}, d11
+ vext.8 q8, q6, q7, #2
+ vext.8 q9, q8, q7, #2
+ vst1.8 {d12}, [r1], r3
+ vext.8 q10, q9, q7, #2
+ vst1.8 {d16}, [r1], r3
+ vst1.8 {d18}, [r1], r3
+ vst1.8 {d20}, [r1], r3
+ vst1.8 {d13}, [r1], r3
+ vst1.8 {d17}, [r1], r3
+ vst1.8 {d19}, [r1], r3
+ vst1.8 {d21}, [r1], r3
+
+
+end_func_horz_u:
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+
+
+
diff --git a/common/arm/ih264_iquant_itrans_recon_a9.s b/common/arm/ih264_iquant_itrans_recon_a9.s
new file mode 100755
index 0000000..f71ca69
--- /dev/null
+++ b/common/arm/ih264_iquant_itrans_recon_a9.s
@@ -0,0 +1,871 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_iquant_itrans_recon_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for single stage inverse transform
+@ *
+@ * @author
+@ * Mohit
+@ * Harinarayanaan
+@ *
+@ * @par List of Functions:
+@ * - ih264_iquant_itrans_recon_4x4_a9()
+@ * - ih264_iquant_itrans_recon_8x8_a9()
+@ * - ih264_iquant_itrans_recon_chroma_4x4_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx
+@ WORD16 *pi2_dc_ld_addr)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
+@r8 => iq_start_idx
+@r10=> pi2_dc_ld_addr
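+@
+@ A minimal C sketch of the per-coefficient dequant below and of one pass of the
+@ 4-point inverse core transform (the assembly runs one such pass over rows and
+@ one over columns); c0..c3, o0..o3 and CLIP_U8 are hypothetical names:
+@
+@     /* dequant: scale, shift by qP/6, then round down by 4 bits (vqrshrn #4 below) */
+@     WORD32 q = (pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i]) << u4_qp_div_6;
+@     WORD16 c = (q + 8) >> 4;
+@     /* when iq_start_idx == 1 the dc term c[0] is instead taken from *pi2_dc_ld_addr */
+@
+@     /* one 4-point inverse transform pass on a row of values c0..c3 */
+@     WORD32 x0 = c0 + c2;
+@     WORD32 x1 = c0 - c2;
+@     WORD32 x2 = (c1 >> 1) - c3;
+@     WORD32 x3 = c1 + (c3 >> 1);
+@     o0 = x0 + x3;  o1 = x1 + x2;  o2 = x1 - x2;  o3 = x0 - x3;
+@     /* after both passes: recon = CLIP_U8(pred + ((o + 32) >> 6)) */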
+.text
+.p2align 2
+
+ .global ih264_iquant_itrans_recon_4x4_a9
+
+ih264_iquant_itrans_recon_4x4_a9:
+
+@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
+@If that macro value changes, this instruction must change accordingly.
+@Only one shift is done in the horizontal inverse because:
+@if u4_qp_div_6 is less than 4, the shift amount is negative (an effective right shift) and rnd_factor is non-zero;
+@if u4_qp_div_6 is greater than 4, the shift amount is positive (a left shift) and rnd_factor is 0.
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #52] @Loads u4_qp_div_6
+ ldr r4, [sp, #40] @Loads out_strd
+ vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15
+ ldr r5, [sp, #44] @Loads *pu2_iscal_mat
+
+ ldr r6, [sp, #48] @Loads *pu2_weigh_mat
+
+ ldr r8, [sp, #60] @Loads iq_start_idx
+
+ ldr r10, [sp, #64] @Load alternate dc address
+
+ vpush {d8-d15}
+@=======================DEQUANT FROM HERE===================================
+
+ vld4.s16 {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i =0..15
+ vld4.s16 {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i =0..15
+ vmul.s16 q10, q10, q13 @x[i]=(scale[i] * dequant[i]) where i = 0..7
+ vld4.s16 {d16, d17, d18, d19}, [r0] @pi2_src_tmp[i], i =0..15
+
+ vmul.s16 q11, q11, q14 @x[i]=(scale[i] * dequant[i]) where i = 8..15
+
+    subs r8, r8, #1                    @ if r8 == 1 => intra case, so result of subtraction is zero and Z flag is set
+ ldreqsh r9, [r10] @ Loads signed halfword pi2_dc_ld_addr[0], if r8==1
+
+ vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+ vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+ vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+ vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+ vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+
+ vqrshrn.s32 d0, q0, #0x4 @ D0 = c[i] = ((q[i] + 32) >> 4) where i = 0..3
+ vqrshrn.s32 d1, q1, #0x4 @ D1 = c[i] = ((q[i] + 32) >> 4) where i = 4..7
+ vqrshrn.s32 d2, q2, #0x4 @ D2 = c[i] = ((q[i] + 32) >> 4) where i = 8..11
+ vqrshrn.s32 d3, q3, #0x4 @ D3 = c[i] = ((q[i] + 32) >> 4) where i = 12..15
+
+ vmoveq.16 d0[0], r9 @ Restore dc value in case of intra, i.e. r8 == 1
+
+@========= PROCESS IDCT FROM HERE =======
+@Steps for Stage 1:
+@------------------
+ vld1.32 d30[0], [r1], r3 @I row Load pu1_pred buffer
+ vadd.s16 d4, d0, d2 @x0 = q0 + q1;
+
+ vsub.s16 d5, d0, d2 @x1 = q0 - q1;
+
+ vshr.s16 d8, d1, #1 @q0>>1
+ vshr.s16 d9, d3, #1 @q1>>1
+
+ vsub.s16 d6, d8, d3 @x2 = (q0 >> 1) - q1;
+ vadd.s16 d7, d1, d9 @x3 = q0+ (q1 >> 1);
+ vld1.32 d30[1], [r1], r3 @II row Load pu1_pred buffer
+
+ vswp d6, d7 @Reverse positions of x2 and x3
+
+ vsub.s16 q6, q2, q3 @x0-x3 and x1-x2 combined
+ vadd.s16 q5, q2, q3 @x0 + x3 and x1+x2 combined
+
+ vld1.32 d31[0], [r1], r3 @III row Load pu1_pred buf
+
+ vswp d12, d13
+@Steps for Stage 2:
+@------------------
+ vtrn.16 d10, d11
+ vtrn.16 d12, d13
+ vtrn.32 d10, d12
+ vtrn.32 d11, d13
+ vadd.s16 d14, d10, d12 @x0 = q0 + q1;
+
+ vsub.s16 d15, d10, d12 @x1 = q0 - q1;
+
+ vshr.s16 d18, d11, #1 @q0>>1
+ vshr.s16 d19, d13, #1 @q1>>1
+
+ vsub.s16 d16, d18, d13 @x2 = (q0 >> 1) - q1;
+ vadd.s16 d17, d11, d19 @x3 = q0+ (q1 >> 1);
+
+ vld1.32 d31[1], [r1], r3 @IV row Load pu1_pred buffer
+ vswp d16, d17 @Reverse positions of x2 and x3
+
+ vsub.s16 q11, q7, q8 @x0-x3 and x1-x2 combined
+ vadd.s16 q10, q7, q8 @x0 + x3 and x1+x2 combined
+
+ vswp d22, d23
+
+ vrshr.s16 q10, q10, #6 @
+ vrshr.s16 q11, q11, #6
+
+ vaddw.u8 q10, q10, d30
+ vaddw.u8 q11, q11, d31
+
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+
+ vst1.32 d0[0], [r2], r4 @I row store the value
+ vst1.32 d0[1], [r2], r4 @II row store the value
+ vst1.32 d1[0], [r2], r4 @III row store the value
+ vst1.32 d1[1], [r2] @IV row store the value
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+ @/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp
+@ WORD16 *pi2_dc_src)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
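+@
+@ Chroma differences from the luma routine above: the dc term always comes from
+@ pi2_dc_src[0] (its 2x2 dc transform is done elsewhere), and pred/recon are
+@ interleaved UV, so only alternate bytes are written back (the vbit with the
+@ 0x00ff mask below). A minimal C sketch of that write-back; CLIP_U8 and res[][]
+@ are hypothetical names for the clip helper and the transformed residual:
+@
+@     WORD32 r, c;
+@     for(r = 0; r < 4; r++)
+@         for(c = 0; c < 4; c++)
+@             pu1_out[r * out_strd + 2 * c] =
+@                 CLIP_U8(pu1_pred[r * pred_strd + 2 * c] + ((res[r][c] + 32) >> 6));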
+
+ .global ih264_iquant_itrans_recon_chroma_4x4_a9
+ih264_iquant_itrans_recon_chroma_4x4_a9:
+
+@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
+@If that macro value changes, this instruction must change accordingly.
+@Only one shift is done in the horizontal inverse because:
+@if u4_qp_div_6 is less than 4, the shift amount is negative (an effective right shift) and rnd_factor is non-zero;
+@if u4_qp_div_6 is greater than 4, the shift amount is positive (a left shift) and rnd_factor is 0.
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #52] @Loads u4_qp_div_6
+ ldr r4, [sp, #40] @Loads out_strd
+ vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15
+ ldr r5, [sp, #44] @Loads *pu2_iscal_mat
+ ldr r6, [sp, #48] @Loads *pu2_weigh_mat
+ ldr r8, [sp, #60] @loads *pi2_dc_src
+
+ vpush {d8-d15}
+@=======================DEQUANT FROM HERE===================================
+
+ vld4.s16 {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i =0..15
+ vld4.s16 {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i =0..15
+ vmul.s16 q10, q10, q13 @x[i]=(scale[i] * dequant[i]) where i = 0..7
+ vld4.s16 {d16, d17, d18, d19}, [r0] @pi2_src_tmp[i], i =0..15
+
+ vmul.s16 q11, q11, q14 @x[i]=(scale[i] * dequant[i]) where i = 8..15
+
+ vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+ vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+ vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+ vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+ vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+
+ vqrshrn.s32 d0, q0, #0x4 @ D0 = c[i] = ((q[i] + 32) >> 4) where i = 0..3
+ vqrshrn.s32 d1, q1, #0x4 @ D1 = c[i] = ((q[i] + 32) >> 4) where i = 4..7
+ vqrshrn.s32 d2, q2, #0x4 @ D2 = c[i] = ((q[i] + 32) >> 4) where i = 8..11
+ vqrshrn.s32 d3, q3, #0x4 @ D3 = c[i] = ((q[i] + 32) >> 4) where i = 12..15
+
+ ldrsh r9, [r8] @ Loads signed halfword pi2_dc_src[0]
+ vmov.16 d0[0], r9 @ Restore dc value since its chroma iq-it
+
+@========= PROCESS IDCT FROM HERE =======
+@Steps for Stage 1:
+@------------------
+ vld2.8 {d28, d29}, [r1], r3 @I row Load pu1_pred buffer
+ vadd.s16 d4, d0, d2 @x0 = q0 + q1;
+
+ vsub.s16 d5, d0, d2 @x1 = q0 - q1;
+
+ vshr.s16 d8, d1, #1 @q0>>1
+ vshr.s16 d9, d3, #1 @q1>>1
+
+ vsub.s16 d6, d8, d3 @x2 = (q0 >> 1) - q1;
+ vadd.s16 d7, d1, d9 @x3 = q0+ (q1 >> 1);
+ vld2.8 {d29, d30}, [r1], r3 @II row Load pu1_pred buffer
+
+ vswp d6, d7 @Reverse positions of x2 and x3
+
+ vsub.s16 q6, q2, q3 @x0-x3 and x1-x2 combined
+ vtrn.32 d28, d29 @ D28 -- row I and II of pu1_pred_buffer
+ vadd.s16 q5, q2, q3 @x0 + x3 and x1+x2 combined
+
+ vld2.8 {d29, d30}, [r1], r3 @III row Load pu1_pred buf
+
+ vswp d12, d13
+@Steps for Stage 2:
+@------------------
+ vtrn.16 d10, d11
+ vtrn.16 d12, d13
+ vtrn.32 d10, d12
+ vtrn.32 d11, d13
+ vadd.s16 d14, d10, d12 @x0 = q0 + q1;
+
+ vsub.s16 d15, d10, d12 @x1 = q0 - q1;
+
+ vshr.s16 d18, d11, #1 @q0>>1
+ vshr.s16 d19, d13, #1 @q1>>1
+
+ vsub.s16 d16, d18, d13 @x2 = (q0 >> 1) - q1;
+ vadd.s16 d17, d11, d19 @x3 = q0+ (q1 >> 1);
+
+ vld2.8 {d30, d31}, [r1], r3 @IV row Load pu1_pred buffer
+ vswp d16, d17 @Reverse positions of x2 and x3
+
+ vsub.s16 q11, q7, q8 @x0-x3 and x1-x2 combined
+ vtrn.32 d29, d30 @ D29 -- row III and IV of pu1_pred_buf
+ vadd.s16 q10, q7, q8 @x0 + x3 and x1+x2 combined
+
+ vswp d22, d23
+
+ vrshr.s16 q10, q10, #6 @
+ vrshr.s16 q11, q11, #6
+
+ vaddw.u8 q10, q10, d28
+ vaddw.u8 q11, q11, d29
+
+ vld1.u8 d0, [r2], r4 @Loading out buffer 16 coeffs
+ vld1.u8 d1, [r2], r4
+ vld1.u8 d2, [r2], r4
+ vld1.u8 d3, [r2], r4
+
+ sub r2, r2, r4, lsl #2
+
+    vqmovun.s16 d20, q10            @Saturate recon values to 8 bits
+ vqmovun.s16 d22, q11
+
+    vmovl.u8  q10, d20                @Move the coeffs into 16 bit
+ vmovl.u8 q11, d22 @so that we can use vbit to copy
+
+    vmov.u16  q14, #0x00ff            @Copy lsb from quantized (long) coeffs
+
+ vbit.u8 q0, q10, q14
+ vbit.u8 q1, q11, q14
+
+ vst1.u8 d0, [r2], r4
+ vst1.u8 d1, [r2], r4
+ vst1.u8 d2, [r2], r4
+ vst1.u8 d3, [r2]
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+@/*
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci8 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*64
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
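+@
+@ A minimal C sketch of the 8x8 dequant below (matching the "(q + 32) >> 6"
+@ comments); the 8-point inverse transform that follows is applied once across
+@ rows and once across columns, and the result is added to pu1_pred with a
+@ final (val + 32) >> 6 rounding and an unsigned 8-bit clamp:
+@
+@     WORD32 q = (pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i]) << u4_qp_div_6;
+@     WORD16 c = (q + 32) >> 6;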
+
+
+ .global ih264_iquant_itrans_recon_8x8_a9
+ih264_iquant_itrans_recon_8x8_a9:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #52] @Loads u4_qp_div_6
+ ldr r4, [sp, #40] @Loads out_strd
+
+ ldr r5, [sp, #44] @Loads *pu2_iscal_mat
+ ldr r6, [sp, #48] @Loads *pu2_weigh_mat
+ vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15
+ vpush {d8-d15}
+
+idct_8x8_begin:
+
+@========= DEQUANT FROM HERE ===========
+
+ vld1.32 {q13}, [r5]! @ Q13 = dequant values row 0
+ vld1.32 {q10}, [r6]! @ Q10 = scaling factors row 0
+ vld1.32 {q14}, [r5]! @ Q14 = dequant values row 1
+ vmul.s16 q10, q10, q13 @ Q10 = x[i] = (scale[i] * dequant[i]) where i = 0..7
+ vld1.32 {q11}, [r6]! @ Q11 = scaling factors row 1
+ vld1.32 {q8}, [r0]! @ Q8 = Source row 0
+ vmul.s16 q11, q11, q14 @ Q11 = x[i] = (scale[i] * dequant[i]) where i = 8..15
+ vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ vld1.32 {q9}, [r0]! @ Q8 = Source row 1
+ vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ vld1.32 {q13}, [r6]! @ Scaling factors row 2
+ vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+ vld1.32 {q14}, [r6]! @ Scaling factors row 3
+ vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+ vld1.32 {q10}, [r5]! @ Q10 = Dequant values row 2
+ vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+ vld1.32 {q8}, [r0]! @ Source Row 2
+ vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+ vld1.32 {q11}, [r5]! @ Q11 = Dequant values row 3
+ vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+ vld1.32 {q9}, [r0]! @ Source Row 3
+ vmul.s16 q10, q10, q13 @ Dequant row2*scale matrix row 2
+ vmul.s16 q11, q11, q14 @ Dequant row 3*scale matrix row 3
+ vld1.32 {q4}, [r6]! @ Scaling factors row 4
+ vqrshrn.s32 d0, q0, #0x6 @ D0 = c[i] = ((q[i] + 32) >> 6) where i = 0..3
+ vqrshrn.s32 d1, q1, #0x6 @ D1 = c[i] = ((q[i] + 32) >> 6) where i = 4..7
+ vld1.32 {q5}, [r6]! @ Scaling factors row 5
+ vqrshrn.s32 d2, q2, #0x6 @ D2 = c[i] = ((q[i] + 32) >> 6) where i = 8..11
+ vqrshrn.s32 d3, q3, #0x6 @ D3 = c[i] = ((q[i] + 32) >> 6) where i = 12..15
+ vld1.32 {q13}, [r5]! @ Q13 = Dequant values row 4
+ vmull.s16 q2, d16, d20 @ p[i] = (x[i] * trns_coeff[i]) where i=16..19
+ vmull.s16 q3, d17, d21 @ p[i] = (x[i] * trns_coeff[i]) where i=20..23
+ vld1.32 {q12}, [r5]! @ Q12 = Dequant values row 5
+ vmull.s16 q6, d18, d22 @ p[i] = (x[i] * trns_coeff[i]) where i=24..27
+ vmull.s16 q7, d19, d23 @ p[i] = (x[i] * trns_coeff[i]) where i=28..31
+
+ vld1.32 {q14}, [r0]! @ Source row 4
+ vmul.s16 q10, q4, q13 @ Dequant row4*scale matrix row 4
+ vmul.s16 q11, q5, q12 @ Dequant row5*scale matrix row 5
+ vld1.32 {q9}, [r0]! @ Source row 5
+ vshl.s32 q2, q2, q15 @
+ vshl.s32 q3, q3, q15 @
+ vld1.32 {q13}, [r6]! @ Scaling factors row 6
+ vshl.s32 q6, q6, q15 @
+ vshl.s32 q7, q7, q15 @
+ vmull.s16 q4, d28, d20 @ i = 32..35
+ vqrshrn.s32 d4, q2, #0x6 @ D4 = c[i] = ((q[i] + 32) >> 6) where i = 16..19
+ vqrshrn.s32 d5, q3, #0x6 @ D5 = c[i] = ((q[i] + 32) >> 6) where i = 20..23
+ vmull.s16 q5, d29, d21 @ i =36..39
+ vld1.32 {q10}, [r5]! @ Dequant values row 6
+ vqrshrn.s32 d6, q6, #0x6 @ D6 = c[i] = ((q[i] + 32) >> 6) where i = 24..27
+ vqrshrn.s32 d7, q7, #0x6 @ D7 = c[i] = ((q[i] + 32) >> 6) where i = 28..31
+ vld1.32 {q14}, [r6]! @ Scaling factors row 7
+ vmull.s16 q6, d18, d22 @
+ vld1.32 {q8}, [r0]! @ Source row 6
+ vmull.s16 q7, d19, d23 @
+ vld1.32 {q11}, [r5]! @ Dequant values row 7
+ vshl.s32 q4, q4, q15 @
+ vld1.32 {q9}, [r0]! @ Source row 7
+ vshl.s32 q5, q5, q15 @
+
+ vshl.s32 q6, q6, q15 @
+ vshl.s32 q7, q7, q15 @
+ vmul.s16 q10, q10, q13 @ Dequant*scaling row 6
+ vmul.s16 q11, q11, q14 @ Dequant*scaling row 7
+ vqrshrn.s32 d8, q4, #0x6 @ D8 = c[i] = ((q[i] + 32) >> 6) where i = 32..35
+ vqrshrn.s32 d9, q5, #0x6 @ D9 = c[i] = ((q[i] + 32) >> 6) where i = 36..39
+ vqrshrn.s32 d10, q6, #0x6 @ D10 = c[i] = ((q[i] + 32) >> 6) where i = 40..43
+ vqrshrn.s32 d11, q7, #0x6 @ D11 = c[i] = ((q[i] + 32) >> 6) where i = 44..47
+ vmull.s16 q6, d16, d20 @ i= 48..51
+ vmull.s16 q7, d17, d21 @ i= 52..55
+ vmull.s16 q8, d18, d22 @ i=56..59
+ vmull.s16 q9, d19, d23 @ i=60..63
+ vshl.s32 q6, q6, q15 @
+ vzip.s16 q0, q1 @Transpose
+ vshl.s32 q7, q7, q15 @
+ vshl.s32 q8, q8, q15 @
+ vzip.s16 q2, q3 @
+ vshl.s32 q9, q9, q15 @
+ vqrshrn.s32 d12, q6, #0x6 @ D12 = c[i] = ((q[i] + 32) >> 6) where i = 48..51
+ vzip.s16 q4, q5 @Transpose
+ vqrshrn.s32 d13, q7, #0x6 @ D13 = c[i] = ((q[i] + 32) >> 6) where i = 52..55
+ vqrshrn.s32 d14, q8, #0x6 @ D14 = c[i] = ((q[i] + 32) >> 6) where i = 56..59
+ vzip.s32 q0, q2 @Transpose
+ vqrshrn.s32 d15, q9, #0x6 @ D15 = c[i] = ((q[i] + 32) >> 6) where i = 60..63
+
+@========= PROCESS IDCT FROM HERE =======
+
+@Steps for Stage 2:
+@------------------
+
+@ TRANSPOSE 8x8 coeffs to actual order
+
+ vzip.s16 q6, q7 @
+
+ vzip.s32 q1, q3 @
+ vzip.s32 q4, q6 @
+ vzip.s32 q5, q7 @
+
+ vswp d1, d8 @ Q0/Q1 = Row order x0/x1
+ vswp d3, d10 @ Q2/Q3 = Row order x2/x3
+ vswp d5, d12 @ Q4/Q5 = Row order x4/x5
+ vswp d7, d14 @ Q6/Q7 = Row order x6/x7
+
+ vswp q1, q4 @
+ vshr.s16 q10, q2, #0x1 @
+ vswp q3, q6 @
+
+@Steps for Stage 1:
+@------------------
+
+ vadd.s16 q8, q0, q4 @ Q8 = y0
+ vsub.s16 q9, q0, q4 @ Q9 = y2
+
+ vsra.s16 q2, q6, #0x1 @ Q2 = y6
+ vsub.s16 q6, q10, q6 @ Q6 = y4
+
+ vaddl.s16 q12, d14, d2 @ y3 (0-3) 1+7
+ vaddl.s16 q13, d15, d3 @ y3 (4-7) 1+7
+
+ vsubl.s16 q10, d14, d2 @ y5 (0-3) 7-1
+ vsubl.s16 q11, d15, d3 @ y5 (4-7) 7-1
+
+ vadd.s16 q0, q8, q2 @ Q0 = z0
+ vsub.s16 q4, q8, q2 @ Q4 = z6
+
+ vadd.s16 q8, q9, q6 @ Q8 = z2
+ vsub.s16 q2, q9, q6 @ Q2 = z4
+
+ vsubw.s16 q12, q12, d6 @ y3 (0-3) 1+7-3
+ vsubw.s16 q13, q13, d7 @ y3 (0-7) 1+7-3
+
+ vshr.s16 q6, q3, #0x1 @
+
+ vaddw.s16 q10, q10, d10 @
+ vaddw.s16 q11, q11, d11 @
+
+ vshr.s16 q9, q5, #0x1 @
+
+ vsubw.s16 q12, q12, d12 @
+ vsubw.s16 q13, q13, d13 @
+
+ vaddw.s16 q10, q10, d18 @
+ vaddw.s16 q11, q11, d19 @
+
+ vqmovn.s32 d12, q12 @
+ vaddl.s16 q12, d10, d6 @
+ vqmovn.s32 d13, q13 @ Q6 = y3
+ vaddl.s16 q13, d11, d7 @
+ vqmovn.s32 d18, q10 @
+ vsubl.s16 q10, d10, d6 @
+ vqmovn.s32 d19, q11 @ Q9 = y5
+ vsubl.s16 q11, d11, d7 @
+
+ vshr.s16 q3, q6, #0x2 @
+
+ vsra.s16 q6, q9, #0x2 @ Q6 = z3
+
+ vaddw.s16 q12, q12, d2 @
+ vaddw.s16 q13, q13, d3 @
+
+ vshr.s16 q1, #0x1 @
+
+ vsub.s16 q5, q3, q9 @ Q5 = z5
+
+ vsubw.s16 q10, q10, d14 @
+ vsubw.s16 q11, q11, d15 @
+
+ vshr.s16 q7, #0x1 @
+
+ vaddw.s16 q12, q12, d2 @
+ vaddw.s16 q13, q13, d3 @
+
+ vsubw.s16 q10, q10, d14 @
+ vsubw.s16 q11, q11, d15 @
+
+
+ vqmovn.s32 d14, q12 @
+ vadd.s16 q1, q8, q5 @ Q1 = x1
+ vqmovn.s32 d15, q13 @ Q7 = y7
+ vsub.s16 q3, q8, q5 @ Q3 = x6
+ vqmovn.s32 d18, q10 @
+ vsub.s16 q5, q2, q6 @ Q5 = x5
+ vqmovn.s32 d19, q11 @ Q9 = y1
+ vadd.s16 q2, q2, q6 @ Q2 = x2
+
+ vshr.s16 q12, q9, #0x2 @
+ vsra.s16 q9, q7, #0x2 @ Q9 = z1
+
+ vsub.s16 q11, q7, q12 @ Q11 = z7
+
+ vadd.s16 q6, q4, q9 @ Q6 = x3
+ vsub.s16 q4, q4, q9 @ Q4 = x4
+
+ vsub.s16 q7, q0, q11 @ Q7 = x7
+ vadd.s16 q0, q0, q11 @ Q0 = x0
+
+ vswp.s16 q3, q6 @ Q3 = x3, Q6 = x6
+
+
+@Steps for Stage 2:
+@------------------
+
+@ TRANSPOSE 8x8 coeffs to actual order
+
+ vzip.s16 q0, q1 @
+ vzip.s16 q2, q3 @
+ vzip.s16 q4, q5 @
+ vzip.s16 q6, q7 @
+
+ vzip.s32 q0, q2 @
+ vzip.s32 q1, q3 @
+ vzip.s32 q4, q6 @
+ vzip.s32 q5, q7 @
+
+ vswp d1, d8 @ Q0/Q1 = Row order x0/x1
+ vswp d3, d10 @ Q2/Q3 = Row order x2/x3
+ vswp d5, d12 @ Q4/Q5 = Row order x4/x5
+ vswp d7, d14 @ Q6/Q7 = Row order x6/x7
+
+ vswp q1, q4 @
+ vshr.s16 q10, q2, #0x1 @
+ vswp q3, q6 @
+
+@Steps for Stage 3:
+@------------------
+
+@Repeat stage 1 again for vertical transform
+
+ vadd.s16 q8, q0, q4 @ Q8 = y0
+    vld1.32 d28, [r1], r3      @ load pu1_pred row 0
+ vsub.s16 q9, q0, q4 @ Q9 = y2
+
+ vsra.s16 q2, q6, #0x1 @ Q2 = y6
+ vsub.s16 q6, q10, q6 @ Q6 = y4
+
+ vaddl.s16 q12, d14, d2 @
+    vld1.32 d29, [r1], r3      @ load pu1_pred row 1
+ vaddl.s16 q13, d15, d3 @
+
+ vsubl.s16 q10, d14, d2 @
+    vld1.32 d30, [r1], r3      @ load pu1_pred row 2
+ vsubl.s16 q11, d15, d3 @
+
+ vadd.s16 q0, q8, q2 @ Q0 = z0
+    vld1.32 d31, [r1], r3      @ load pu1_pred row 3
+ vsub.s16 q4, q8, q2 @ Q4 = z6
+
+ vadd.s16 q8, q9, q6 @ Q8 = z2
+ vsub.s16 q2, q9, q6 @ Q2 = z4
+
+ vsubw.s16 q12, q12, d6 @
+ vsubw.s16 q13, q13, d7 @
+
+ vshr.s16 q6, q3, #0x1 @
+
+ vaddw.s16 q10, q10, d10 @
+ vaddw.s16 q11, q11, d11 @
+
+ vshr.s16 q9, q5, #0x1 @
+
+ vsubw.s16 q12, q12, d12 @
+ vsubw.s16 q13, q13, d13 @
+
+ vaddw.s16 q10, q10, d18 @
+ vaddw.s16 q11, q11, d19 @
+
+ vqmovn.s32 d12, q12 @
+ vaddl.s16 q12, d10, d6 @
+ vqmovn.s32 d13, q13 @ Q6 = y3
+ vaddl.s16 q13, d11, d7 @
+ vqmovn.s32 d18, q10 @
+ vsubl.s16 q10, d10, d6 @
+ vqmovn.s32 d19, q11 @ Q9 = y5
+ vsubl.s16 q11, d11, d7 @
+
+ vshr.s16 q3, q6, #0x2 @
+
+ vsra.s16 q6, q9, #0x2 @ Q6 = z3
+
+ vaddw.s16 q12, q12, d2 @
+ vaddw.s16 q13, q13, d3 @
+
+ vshr.s16 q1, #0x1 @
+
+ vsub.s16 q5, q3, q9 @ Q5 = z5
+
+ vsubw.s16 q10, q10, d14 @
+ vsubw.s16 q11, q11, d15 @
+
+ vshr.s16 q7, #0x1 @
+
+ vaddw.s16 q12, q12, d2 @
+ vaddw.s16 q13, q13, d3 @
+
+ vsubw.s16 q10, q10, d14 @
+ vsubw.s16 q11, q11, d15 @
+
+ vqmovn.s32 d14, q12 @
+ vadd.s16 q1, q8, q5 @ Q1 = x1
+ vqmovn.s32 d15, q13 @ Q7 = y7
+ vsub.s16 q3, q8, q5 @ Q3 = x6
+ vqmovn.s32 d18, q10 @
+ vsub.s16 q5, q2, q6 @ Q5 = x5
+ vqmovn.s32 d19, q11 @ Q9 = y1
+ vadd.s16 q2, q2, q6 @ Q2 = x2
+
+ vshr.s16 q12, q9, #0x2 @
+ vsra.s16 q9, q7, #0x2 @ Q9 = z1
+
+ vsub.s16 q11, q7, q12 @ Q11 = z7
+
+ vadd.s16 q6, q4, q9 @ Q6 = x3
+ vsub.s16 q4, q4, q9 @ Q4 = x4
+
+ vsub.s16 q7, q0, q11 @ Q7 = x7
+ vadd.s16 q0, q0, q11 @ Q0 = x0
+
+ vswp.s16 q3, q6 @ Q3 <-> Q6
+
+ vrshr.s16 q1, q1, #6 @
+    vld1.32 d16, [r1], r3      @ load pu1_pred row 4
+ vrshr.s16 q2, q2, #6 @
+ vrshr.s16 q4, q4, #6 @
+    vld1.32 d17, [r1], r3      @ load pu1_pred row 5
+ vrshr.s16 q5, q5, #6 @
+ vrshr.s16 q7, q7, #6 @
+    vld1.32 d18, [r1], r3      @ load pu1_pred row 6
+ vrshr.s16 q0, q0, #6 @
+ vrshr.s16 q3, q3, #6 @
+    vld1.32 d19, [r1], r3      @ load pu1_pred row 7
+ vrshr.s16 q6, q6, #6 @
+
+@ Add the prediction rows and saturate the results to 8 bits
+
+ vaddw.u8 q0, q0, d28
+ vaddw.u8 q1, q1, d29
+ vaddw.u8 q2, q2, d30
+ vaddw.u8 q3, q3, d31
+ vqmovun.s16 d0, q0
+ vaddw.u8 q4, q4, d16
+ vqmovun.s16 d1, q1
+ vaddw.u8 q5, q5, d17
+ vqmovun.s16 d2, q2
+ vaddw.u8 q6, q6, d18
+ vqmovun.s16 d3, q3
+ vaddw.u8 q7, q7, d19
+
+ vqmovun.s16 d4, q4
+    vst1.32 d0, [r2], r4        @ store recon row 0
+    vqmovun.s16 d5, q5
+    vst1.32 d1, [r2], r4        @ store recon row 1
+    vqmovun.s16 d6, q6
+    vst1.32 d2, [r2], r4        @ store recon row 2
+    vqmovun.s16 d7, q7
+    vst1.32 d3, [r2], r4        @ store recon row 3
+    vst1.32 d4, [r2], r4        @ store recon row 4
+
+    vst1.32 d5, [r2], r4        @ store recon row 5
+
+    vst1.32 d6, [r2], r4        @ store recon row 6
+
+    vst1.32 d7, [r2], r4        @ store recon row 7
+
+idct_8x8_end:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15}
+
diff --git a/common/arm/ih264_iquant_itrans_recon_dc_a9.s b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
new file mode 100755
index 0000000..8d71bdb
--- /dev/null
+++ b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
@@ -0,0 +1,399 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_iquant_itrans_recon_dc_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for single stage inverse transform
+@ *
+@ * @author
+@ * Mohit
+@ *
+@ * @par List of Functions:
+@ * - ih264_iquant_itrans_recon_4x4_dc_a9()
+@ * - ih264_iquant_itrans_recon_8x8_dc_a9()
+@ * - ih264_iquant_itrans_recon_chroma_4x4_dc_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+@ * for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is
+@ * non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx
+@ WORD16 *pi2_dc_ld_addr)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
+@r9 => iq_start_idx
+@unused => pi2_dc_ld_addr
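+@
+@ A minimal C sketch of this dc-only fast path, following the scalar arithmetic
+@ done below in r6 (CLIP_U8 is a hypothetical clip-to-8-bit helper):
+@
+@     WORD32 q0 = (((pi2_src[0] * pu2_iscal_mat[0] * pu2_weigh_mat[0]) << u4_qp_div_6) + 8) >> 4;
+@     if(iq_start_idx == 1)
+@         q0 = pi2_src[0];               /* intra: dc already handled separately */
+@     WORD32 dc = (q0 + 32) >> 6;
+@     /* recon[r][c] = CLIP_U8(pred[r][c] + dc) for the whole 4x4 block */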
+
+.text
+.p2align 2
+
+ .global ih264_iquant_itrans_recon_4x4_dc_a9
+
+ih264_iquant_itrans_recon_4x4_dc_a9:
+
+@Only one shift is done in the horizontal inverse because:
+@if u4_qp_div_6 is less than 4, the shift amount is negative (an effective right shift) and rnd_factor is non-zero;
+@if u4_qp_div_6 is greater than 4, the shift amount is positive (a left shift) and rnd_factor is 0.
+
+ stmfd sp!, {r4-r10, r14} @stack stores the values of the arguments
+ ldr r5, [sp, #36] @Loads *pu2_iscal_mat
+ ldr r6, [sp, #40] @Loads *pu2_weigh_mat
+ ldrsh r8, [r0] @load pi2_src[0], SH for signed halfword load
+ ldrh r6, [r6] @load pu2_weight_mat[0] , H for unsigned halfword load
+ ldrh r5, [r5] @load pu2_iscal_mat[0] , H for unsigned halfword load
+@=======================DEQUANT FROM HERE===================================
+ mul r6, r6, r5 @pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r7, [sp, #44] @Loads u4_qp_div_6
+ mul r6, r6, r8 @pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r4, [sp, #32] @Loads out_strd
+ ldr r9, [sp, #52] @Loads iq_start_idx
+
+ lsl r6, r6, r7 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6
+ add r6, r6, #8 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + rnd_fact
+ asr r6, r6, #4 @q0 = (pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + rnd_fact)<<(u4_qp_div_6-4)
+
+    subs      r9, r9, #1                 @ if r9 == 1 => intra case, so result of subtraction is zero and Z flag is set
+ ldreqsh r10, [r0] @ Loads signed halfword pi2_src[0], if r9==1
+ moveq r6, r10 @ Restore dc value in case of intra, i.e. r9 == 1
+
+ add r6, r6, #32 @i_macro = q0 + 32
+ asr r6, r6, #6 @i_macro >>6 = DC output of 2-stage transform
+ vdup.s16 q0, r6 @copy transform output to Q0
+
+ vld1.32 d30[0], [r1], r3 @I row Load pu1_pred buffer
+
+ vld1.32 d30[1], [r1], r3 @II row Load pu1_pred buffer
+
+ vld1.32 d31[0], [r1], r3 @III row Load pu1_pred buf
+
+ vld1.32 d31[1], [r1], r3 @IV row Load pu1_pred buffer
+ vaddw.u8 q10, q0, d30
+
+ vaddw.u8 q11, q0, d31
+
+ vqmovun.s16 d0, q10
+
+ vst1.32 d0[0], [r2], r4 @I row store the value
+ vqmovun.s16 d1, q11
+ vst1.32 d0[1], [r2], r4 @II row store the value
+ vst1.32 d1[0], [r2], r4 @III row store the value
+ vst1.32 d1[1], [r2] @IV row store the value
+
+ ldmfd sp!, {r4-r10, r15} @Reload the registers from SP
+
+
+
+
+@/*
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block
+@ * for dc input pattern only, i.e. only the (0,0) element of the input 8x8 block is
+@ * non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci8 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*64
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
+
+
+ .global ih264_iquant_itrans_recon_8x8_dc_a9
+ih264_iquant_itrans_recon_8x8_dc_a9:
+
+ stmfd sp!, {r4-r8, r14} @stack stores the values of the arguments
+ ldr r5, [sp, #28] @Loads *pu2_iscal_mat
+ ldr r6, [sp, #32] @Loads *pu2_weigh_mat
+ ldrsh r8, [r0] @load pi2_src[0], SH for signed halfword load
+ ldrh r6, [r6] @load pu2_weight_mat[0] , H for unsigned halfword load
+ ldrh r5, [r5] @load pu2_iscal_mat[0] , H for unsigned halfword load
+@=======================DEQUANT FROM HERE===================================
+ mul r6, r6, r5 @pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r7, [sp, #36] @Loads u4_qp_div_6
+ mul r6, r6, r8 @pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r4, [sp, #24] @Loads out_strd
+
+ vpush {d8-d15}
+ lsl r6, r6, r7 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6
+ add r6, r6, #32 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + rnd_fact
+ asr r6, r6, #6 @q0 = (pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + rnd_fact)<<(u4_qp_div_6-4)
+ add r6, r6, #32 @i_macro = q0 + 32
+ asr r6, r6, #6 @i_macro >>6 = DC output of 2-stage transform
+ vdup.s16 q8, r6 @copy transform output to Q0
+
+    vld1.32 d24, [r1], r3       @ load pu1_pred row 0
+
+    vld1.32 d25, [r1], r3       @ load pu1_pred row 1
+
+    vld1.32 d26, [r1], r3       @ load pu1_pred row 2
+    vaddw.u8  q0, q8, d24
+    vld1.32 d27, [r1], r3       @ load pu1_pred row 3
+    vaddw.u8  q1, q8, d25
+    vld1.32 d28, [r1], r3       @ load pu1_pred row 4
+    vaddw.u8  q2, q8, d26
+    vld1.32 d29, [r1], r3       @ load pu1_pred row 5
+    vaddw.u8  q3, q8, d27
+    vld1.32 d30, [r1], r3       @ load pu1_pred row 6
+    vaddw.u8  q4, q8, d28
+    vld1.32 d31, [r1], r3       @ load pu1_pred row 7
+
+@ Add the dc term to the prediction rows and saturate to 8 bits
+
+
+ vqmovun.s16 d0, q0
+ vaddw.u8 q5, q8, d29
+ vqmovun.s16 d1, q1
+ vaddw.u8 q6, q8, d30
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vaddw.u8 q7, q8, d31
+ vqmovun.s16 d4, q4
+ vqmovun.s16 d5, q5
+    vst1.32 d0, [r2], r4        @ store recon row 0
+    vqmovun.s16 d6, q6
+    vst1.32 d1, [r2], r4        @ store recon row 1
+    vqmovun.s16 d7, q7
+    vst1.32 d2, [r2], r4        @ store recon row 2
+    vst1.32 d3, [r2], r4        @ store recon row 3
+    vst1.32 d4, [r2], r4        @ store recon row 4
+    vst1.32 d5, [r2], r4        @ store recon row 5
+    vst1.32 d6, [r2], r4        @ store recon row 6
+    vst1.32 d7, [r2], r4        @ store recon row 7
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r8, r15}
+
+
+@ /*
+@ ********************************************************************************
+@ *
+@ * @brief This function reconstructs a 4x4 sub block from quantized resiude and
+@ * prediction buffer if only dc value is present for residue
+@ *
+@ * @par Description:
+@ * The quantized residue is first inverse quantized,
+@ * This inverse quantized content is added to the prediction buffer to recon-
+@ * struct the end output
+@ *
+@ * @param[in] pi2_src
+@ * quantized dc coeffiient
+@ *
+@ * @param[in] pu1_pred
+@ * prediction 4x4 block in interleaved format
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction buffer stride in interleaved format
+@ *
+@ * @param[in] out_strd
+@ * recon buffer Stride
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD16 *pi2_tmp,
+@ WORD16 *pi2_dc_src)
+@ Register Usage
+@ r0 : pi2_src
+@ r1 : pu1_pred
+@ r2 : pu1_out
+@ r3 : pred_strd
+@ Neon registers d0-d7, d16-d30 are used
+@ No need for pushing arm and neon registers
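+@
+@ Rough C equivalent of this DC-only chroma path (illustrative sketch; the loop
+@ bounds and the interleaved CbCr layout are inferred from the comments above,
+@ this is not the library's C reference):
+@
+@     WORD32 i_macro = (pi2_dc_src[0] + 32) >> 6;   /* rounded DC of the transform */
+@     for(WORD32 y = 0; y < 4; y++)
+@         for(WORD32 x = 0; x < 4; x++)
+@         {
+@             /* interleaved UV: only alternate bytes belong to the current plane */
+@             WORD32 val = pu1_pred[y * pred_strd + 2 * x] + i_macro;
+@             pu1_out[y * out_strd + 2 * x] = CLIP_U8(val);
+@         }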
+ .global ih264_iquant_itrans_recon_chroma_4x4_dc_a9
+ih264_iquant_itrans_recon_chroma_4x4_dc_a9:
+
+ ldr r0, [sp, #20]
+ vld1.s16 d0, [r0] @load pi2_dc_src
+
+ ldr r0, [sp] @load out_strd
+
+    vld2.s8       {d2, d3}, [r1], r3 @load pred plane 1 => d2 & pred plane 2 => d3
+ vld2.s8 {d3, d4}, [r1], r3
+ vrshr.s16 d0, d0, #6 @i_macro = ((q0 + 32) >> 6);
+ vld2.s8 {d4, d5}, [r1], r3
+ vld2.s8 {d5, d6}, [r1], r3
+
+    vdup.s16      q0, d0[0]     @duplicate the dc value across q0
+ mov r1, r2 @backup pu1_out
+
+ vtrn.32 d2, d3 @mov the 4 coeffs of current block to d2
+ vtrn.32 d4, d5
+
+ vmov.u16 q15, #0x00ff
+
+    vld1.u8       d18, [r2], r0 @load 8 bytes of the existing out row (to keep the other plane's samples)
+ vaddw.u8 q1, q0, d2 @Add pred
+ vld1.u8 d19, [r2], r0
+ vaddw.u8 q2, q0, d4
+ vld1.u8 d20, [r2], r0
+ vld1.u8 d21, [r2], r0
+
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d4, q2
+
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+
+ vbit.u8 q9, q1, q15
+ vbit.u8 q10, q2, q15
+
+ vst1.u8 d18, [r1], r0 @store out
+ vst1.u8 d19, [r1], r0
+ vst1.u8 d20, [r1], r0
+ vst1.u8 d21, [r1], r0
+
+ bx lr
+
+
+
+
+
+
+
diff --git a/common/arm/ih264_itrans_recon_a9.s b/common/arm/ih264_itrans_recon_a9.s
new file mode 100755
index 0000000..1d74da5
--- /dev/null
+++ b/common/arm/ih264_itrans_recon_a9.s
@@ -0,0 +1,216 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_itrans_recon_neon_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for single stage inverse transform
+@ *
+@ *
+@ * @par List of Functions:
+@ * - ih264_itrans_recon_4x4_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs Inverse transform type Ci4 for 4*4 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi16_levelBlock
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] puc_predBuffer
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] puc_reconPic
+@ * Output 4x4 block
+@ *
+@ * @param[in] ui16_picWidth
+@ * Input stride
+@ *
+@ * @param[in] pred_strd
+@ * Prediction stride
+@ *
+@ * @param[in] dst_strd
+@ * Output Stride
+@ *
+@ * @param[in] zero_cols
+@ * Zero columns in pi2_src
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_itrans_recon_4x4(
+@ WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_recon,
+@ WORD32 src_strd,
+@ WORD32 pred_strd,
+@ WORD32 dst_strd,
+@ UWORD32 q_lev, //quantizer level
+@ WORD32 *pi4_tmp)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_recon
+@r3 => src_strd
+@r4 => pred_strd
+@r5 => dst_strd
+@r6 => q_lev
+@r7 => *pi4_tmp
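+
+@ For reference, a plain C sketch of the inverse transform and reconstruction the
+@ NEON code below implements (illustrative only; the temporaries are assumptions,
+@ not the library's C version):
+@
+@     /* 1-D inverse core on four coefficients r[0..3] */
+@     x0 = r[0] + r[2];          x1 = r[0] - r[2];
+@     x2 = (r[1] >> 1) - r[3];   x3 = r[1] + (r[3] >> 1);
+@     o[0] = x0 + x3;  o[1] = x1 + x2;  o[2] = x1 - x2;  o[3] = x0 - x3;
+@
+@     /* applied first to the rows and then to the columns; each final value is
+@        rounded as (val + 32) >> 6, added to the prediction and clipped:      */
+@     pu1_recon[pos] = CLIP_U8(((val + 32) >> 6) + pu1_pred[pos]);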
+
+.text
+.p2align 2
+
+
+ .global ih264_itrans_recon_4x4_a9
+
+ih264_itrans_recon_4x4_a9:
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ lsl r3, r3, #1
+
+ vld1.16 d0, [r0], r3 @0th row pi2_src_tmp[0]
+ ldr r4, [sp, #40] @Loads pred_strd
+
+ vld1.16 d1, [r0], r3 @I row pi2_src_tmp[0]
+ ldr r5, [sp, #44] @Loads *dst_strd
+
+ vld1.16 d2, [r0], r3 @II row pi2_src_tmp[0]
+
+ vld1.16 d3, [r0] @III row pi2_src_tmp[0]
+ ldr r7, [sp, #52] @Loads *pi4_tmp
+
+ vpush {d8-d15}
+
+    vtrn.16       d0, d1        @Transpose to gather all the 0th elements into a single D register
+ vtrn.16 d2, d3
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3 @D0 --> pi2_src_tmp[0], D1 --> pi2_src_tmp[1]
+ @D2 --> pi2_src_tmp[2], D3 --> pi2_src_tmp[3]
+
+ vaddl.s16 q3, d0, d2 @x0 = (pi2_src_tmp[0] + pi2_src_tmp[2])
+ vsubl.s16 q4, d0, d2 @x1 = (pi2_src_tmp[0] - pi2_src_tmp[2])
+ vshr.s16 d4, d1, #1 @pi2_src_tmp[1] >> 1
+ vshr.s16 d5, d3, #1 @pi2_src_tmp[3] >> 1
+
+ vsubl.s16 q5, d4, d3 @x2 = D_SHIFT(pi2_src_tmp[1],1,shft) - pi2_src_tmp[3]
+
+ vaddl.s16 q6, d1, d5 @x3 = pi2_src_tmp[1] + D_SHIFT(pi2_src_tmp[3],1,shft)
+
+ vadd.s32 q8, q4, q5 @x1 + x2
+ vsub.s32 q9, q4, q5 @x1 - x2
+
+ vadd.s32 q7, q3, q6 @x0 + x3
+ vsub.s32 q10, q3, q6 @x0 - x3
+
+ vtrn.32 q7, q8 @Transpose the register to have the adjacent values
+
+ vtrn.32 q9, q10
+ vadd.s32 d6, d14, d15 @x0(0,1) = (pi4_tblk[0,1] + pi4_tblk[8,9])
+
+ vsub.s32 d7, d14, d15 @x1(0,1) = (pi4_tblk[0,1] - pi4_tblk[8,9])
+
+ vshr.s32 d4, d16, #1 @pi4_tblk[4,5] >> 1
+ vshr.s32 d5, d17, #1 @pi4_tblk[12,13] >> 1
+
+ vsub.s32 d8, d4, d17 @x2(0,1) = D_SHIFT(pi4_tblk[4,5],1,shft) - pi4_tblk[12,13]
+ vadd.s32 d9, d16, d5 @x3(0,1) = pi4_tblk[4,5] + D_SHIFT(pi4_tblk[12,13],1,shft)
+
+ vadd.s32 d10, d18, d19 @x0(2,3) = (pi4_tblk[2,3] + pi4_tblk[10,11])
+ vsub.s32 d11, d18, d19 @x1(2,3) = (pi4_tblk[2,3] - pi4_tblk[10,11])
+ vshr.s32 d4, d20, #1 @pi4_tblk[6,7] >> 1
+ vshr.s32 d5, d21, #1 @pi4_tblk[14,15] >> 1
+
+ vld1.32 d30[0], [r1], r4 @I row Load pu1_pred buffer
+ vsub.s32 d12, d4, d21 @x2(2,3) = D_SHIFT(pi4_tblk[6,7],1,shft) - pi4_tblk[14,15]
+
+ vmovl.u8 q15, d30 @I row Convert 8 bit pred buffer to 16 bit
+ vadd.s32 d13, d20, d5 @x3(2,3) = pi4_tblk[6,7] + D_SHIFT(pi4_tblk[14,15],1,shft)
+
+ vadd.s32 d16, d6, d9 @I row i_macro(0,1) = x0(0,1) + x3(0,1)
+
+ vld1.32 d28[0], [r1], r4 @II row Load pu1_pred buffer
+ vadd.s32 d17, d10, d13 @I row i_macro(2,3) = x0(2,3) + x3(2,3)
+
+ vqrshrn.s32 d16, q8, #6 @I row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vmovl.u8 q14, d28 @II row Convert 8 bit pred buffer to 16 bit
+ vadd.u16 d16, d16, d30 @I row i_macro += *pu1_pred_tmp
+
+ vqmovun.s16 d16, q8 @I row CLIP_U8(i_macro)
+ vadd.s32 d18, d7, d8 @II row i_macro(0,1) = x1(0,1) + x2(0,1)
+
+ vld1.32 d26[0], [r1], r4 @III row Load pu1_pred buffer
+ vadd.s32 d19, d11, d12 @II row i_macro(2,3) = x1(2,3) + x2(2,3)
+
+ vqrshrn.s32 d18, q9, #6 @II row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vmovl.u8 q13, d26 @III row Convert 8 bit pred buffer to 16 bit
+ vadd.u16 d18, d18, d28 @II row i_macro += *pu1_pred_tmp
+
+ vst1.32 d16[0], [r2], r5 @I row store the value
+ vsub.s32 d20, d7, d8 @III row i_macro(0,1) = x1(0,1) - x2(0,1)
+
+ vqmovun.s16 d18, q9 @II row CLIP_U8(i_macro)
+ vsub.s32 d21, d11, d12 @III row i_macro(2,3) = x1(2,3) - x2(2,3)
+
+ vld1.32 d24[0], [r1], r4 @IV row Load pu1_pred buffer
+ vqrshrn.s32 d20, q10, #6 @III row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vmovl.u8 q12, d24 @IV row Convert 8 bit pred buffer to 16 bit
+ vadd.u16 d20, d20, d26 @III row i_macro += *pu1_pred_tmp
+
+ vqmovun.s16 d20, q10 @III row CLIP_U8(i_macro)
+ vsub.s32 d22, d6, d9 @IV row i_macro(0,1) = x0(0,1) - x3(0,1)
+
+ vst1.32 d18[0], [r2], r5 @II row store the value
+ vsub.s32 d23, d10, d13 @IV row i_macro(2,3) = x0(2,3) - x3(2,3)
+
+ vqrshrn.s32 d22, q11, #6 @IV row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vst1.32 d20[0], [r2], r5 @III row store the value
+ vadd.u16 d22, d22, d24 @IV row i_macro += *pu1_pred_tmp
+
+ vqmovun.s16 d22, q11 @IV row CLIP_U8(i_macro)
+ vst1.32 d22[0], [r2], r5 @IV row store the value
+
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+
+
diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s
new file mode 100755
index 0000000..2808897
--- /dev/null
+++ b/common/arm/ih264_mem_fns_neon.s
@@ -0,0 +1,268 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_mem_fns_neon.s
+@ *
+@ * @brief
+@ * Contains function definitions for memory manipulation
+@ *
+@ * @author
+@ * Naveen SR
+@ *
+@ * @par List of Functions:
+@ * - ih264_memcpy_mul_8_a9q()
+@ * - ih264_memcpy_a9q()
+@ * - ih264_memset_mul_8_a9q()
+@ * - ih264_memset_a9q()
+@ * - ih264_memset_16bit_mul_8_a9q()
+@ *     -  ih264_memset_16bit_a9q()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* memcpy of a 1d array
+@*
+@* @par Description:
+@*   Copies 8-bit data from source to destination, for 8, 16 or 32 bytes
+@*
+@* @param[in] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] num_bytes
+@* number of bytes to copy
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
+@ UWORD8 *pu1_src,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => *pu1_src
+@ r2 => num_bytes
+
+.text
+.p2align 2
+
+
+ .global ih264_memcpy_mul_8_a9q
+
+ih264_memcpy_mul_8_a9q:
+
+loop_neon_memcpy_mul_8:
+ @ Memcpy 8 bytes
+ vld1.8 d0, [r1]!
+ vst1.8 d0, [r0]!
+
+ subs r2, r2, #8
+ bne loop_neon_memcpy_mul_8
+ bx lr
+
+
+
+@*******************************************************************************
+@*/
+@void ih264_memcpy(UWORD8 *pu1_dst,
+@ UWORD8 *pu1_src,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => *pu1_src
+@ r2 => num_bytes
+
+
+
+ .global ih264_memcpy_a9q
+
+ih264_memcpy_a9q:
+ subs r2, #8
+ blt memcpy
+loop_neon_memcpy:
+ @ Memcpy 8 bytes
+ vld1.8 d0, [r1]!
+ vst1.8 d0, [r0]!
+
+ subs r2, #8
+ bge loop_neon_memcpy
+ cmp r2, #-8
+ bxeq lr
+
+memcpy:
+ add r2, #8
+
+loop_memcpy:
+ ldrb r3, [r1], #1
+ strb r3, [r0], #1
+ subs r2, #1
+ bne loop_memcpy
+ bx lr
+
+
+
+
+@void ih264_memset_mul_8(UWORD8 *pu1_dst,
+@ UWORD8 value,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => value
+@ r2 => num_bytes
+
+
+
+ .global ih264_memset_mul_8_a9q
+
+ih264_memset_mul_8_a9q:
+
+@ Assumptions: numbytes is either 8, 16 or 32
+ vdup.8 d0, r1
+loop_memset_mul_8:
+ @ Memset 8 bytes
+ vst1.8 d0, [r0]!
+
+ subs r2, r2, #8
+ bne loop_memset_mul_8
+
+ bx lr
+
+
+
+
+@void ih264_memset(UWORD8 *pu1_dst,
+@ UWORD8 value,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => value
+@ r2 => num_bytes
+
+
+
+ .global ih264_memset_a9q
+
+ih264_memset_a9q:
+ subs r2, #8
+ blt memset
+ vdup.8 d0, r1
+loop_neon_memset:
+    @ Memset 8 bytes
+ vst1.8 d0, [r0]!
+
+ subs r2, #8
+ bge loop_neon_memset
+ cmp r2, #-8
+ bxeq lr
+
+memset:
+ add r2, #8
+
+loop_memset:
+ strb r1, [r0], #1
+ subs r2, #1
+ bne loop_memset
+ bx lr
+
+
+
+
+@void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
+@ UWORD16 value,
+@ UWORD8 num_words)
+@**************Variables Vs Registers*************************
+@ r0 => *pu2_dst
+@ r1 => value
+@ r2 => num_words
+
+
+
+ .global ih264_memset_16bit_mul_8_a9q
+
+ih264_memset_16bit_mul_8_a9q:
+
+@ Assumptions: num_words is either 8, 16 or 32
+
+ @ Memset 8 words
+ vdup.16 d0, r1
+loop_memset_16bit_mul_8:
+ vst1.16 d0, [r0]!
+ vst1.16 d0, [r0]!
+
+ subs r2, r2, #8
+ bne loop_memset_16bit_mul_8
+
+ bx lr
+
+
+
+
+@void ih264_memset_16bit(UWORD16 *pu2_dst,
+@ UWORD16 value,
+@ UWORD8 num_words)
+@**************Variables Vs Registers*************************
+@ r0 => *pu2_dst
+@ r1 => value
+@ r2 => num_words
+
+
+
+ .global ih264_memset_16bit_a9q
+
+ih264_memset_16bit_a9q:
+ subs r2, #8
+ blt memset_16bit
+ vdup.16 d0, r1
+loop_neon_memset_16bit:
+ @ Memset 8 words
+ vst1.16 d0, [r0]!
+ vst1.16 d0, [r0]!
+
+ subs r2, #8
+ bge loop_neon_memset_16bit
+ cmp r2, #-8
+ bxeq lr
+
+memset_16bit:
+ add r2, #8
+
+loop_memset_16bit:
+ strh r1, [r0], #2
+ subs r2, #1
+ bne loop_memset_16bit
+ bx lr
+
+
+
+
diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s
new file mode 100755
index 0000000..9bab268
--- /dev/null
+++ b/common/arm/ih264_padding_neon.s
@@ -0,0 +1,646 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_padding_neon.s
+@ *
+@ * @brief
+@ * Contains function definitions padding
+@ *
+@ * @author
+@ * Ittiam
+@ *
+@ * @par List of Functions:
+@ * - ih264_pad_top_a9q()
+@ * - ih264_pad_left_luma_a9q()
+@ * - ih264_pad_left_chroma_a9q()
+@ * - ih264_pad_right_luma_a9q()
+@ * - ih264_pad_right_chroma_a9q()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief pad at the top of a 2d array
+@*
+@* @par Description:
+@* The top row of a 2d array is replicated pad_size times at the top
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns none
+@*
+@* @remarks none
+@*
+@*******************************************************************************
+@*/
+@void ih264_pad_top(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 wd,
+@ WORD32 pad_size)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => wd
+@ r3 => pad_size
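+
+@ Equivalent C (a small illustrative sketch of what the NEON loop below does):
+@
+@     for(row = 1; row <= pad_size; row++)
+@         memcpy(pu1_src - row * src_strd, pu1_src, wd);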
+
+.text
+.p2align 2
+
+ .global ih264_pad_top_a9q
+
+ih264_pad_top_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ sub r5, r0, r1
+ rsb r6, r1, #0
+
+loop_neon_memcpy_mul_16:
+ @ Load 16 bytes
+ vld1.8 {d0, d1}, [r0]!
+ mov r4, r5
+ mov r7, r3
+ add r5, r5, #16
+
+loop_neon_pad_top:
+ vst1.8 {d0, d1}, [r4], r6
+ subs r7, r7, #1
+ bne loop_neon_pad_top
+
+ subs r2, r2, #16
+ bne loop_neon_memcpy_mul_16
+
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (luma block) at the left of a 2d array
+@*
+@* @par Description:
+@* The left column of a 2d array is replicated pad_size times at the left
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_LEFT_LUMA == C
+@void ih264_pad_left_luma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
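+
+@ Equivalent C (illustrative sketch):
+@
+@     for(row = 0; row < ht; row++)
+@     {
+@         memset(pu1_src - pad_size, pu1_src[0], pad_size);
+@         pu1_src += src_strd;
+@     }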
+
+
+ .global ih264_pad_left_luma_a9q
+
+ih264_pad_left_luma_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+
+ sub r4, r0, r3
+ sub r6, r1, #16
+ subs r5, r3, #16
+ bne loop_32
+loop_16: @ /*hard coded for width=16 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ subs r2, r2, #8
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ bne loop_16
+ b end_func
+
+loop_32: @ /*hard coded for width=32 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vdup.u8 q0, r8
+ ldrb r9, [r0], r1
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #8
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ bne loop_32
+
+
+
+end_func:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (chroma block) at the left of a 2d array
+@*
+@* @par Description:
+@* The left column of a 2d array is replicated pad_size times at the left
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array (each colour component)
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_LEFT_CHROMA == C
+@void ih264_pad_left_chroma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@{
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
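+
+@ Equivalent C (illustrative sketch; the first UV pair of each row is replicated
+@ across the interleaved left margin):
+@
+@     for(row = 0; row < ht; row++)
+@     {
+@         for(col = -pad_size; col < 0; col += 2)
+@         {
+@             pu1_src[col]     = pu1_src[0];
+@             pu1_src[col + 1] = pu1_src[1];
+@         }
+@         pu1_src += src_strd;
+@     }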
+
+
+
+ .global ih264_pad_left_chroma_a9q
+
+ih264_pad_left_chroma_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ sub r4, r0, r3
+ sub r6, r1, #16
+
+
+loop_32_l_c: @ /*hard coded for width=32 ,height =4,8,12*/
+ ldrh r8, [r0], r1
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+
+ beq end_func_l_c @/* Branching when ht=4*/
+
+ ldrh r8, [r0], r1
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+ beq end_func_l_c @/* Branching when ht=8*/
+ bne loop_32_l_c
+
+ ldrh r8, [r0], r1
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+end_func_l_c:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (luma block) at the right of a 2d array
+@*
+@* @par Description:
+@* The right column of a 2d array is replicated pad_size times at the right
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_RIGHT_LUMA == C
+@void ih264_pad_right_luma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@{
+@ WORD32 row;
+@
+@ for(row = 0; row < ht; row++)
+@ {
+@ memset(pu1_src, *(pu1_src -1), pad_size);
+@
+@ pu1_src += src_strd;
+@ }
+@}
+@
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
+
+
+
+ .global ih264_pad_right_luma_a9q
+
+ih264_pad_right_luma_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ mov r4, r0
+ sub r6, r1, #16
+ sub r0, r0, #1
+ subs r5, r3, #16
+    bne           loop_32_r
+loop_16_r: @ /*hard coded for width=16 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ subs r2, r2, #8
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ bne loop_16_r
+ b end_func_r
+
+loop_32_r: @ /*hard coded for width=32 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #8
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ bne loop_32_r
+
+
+
+end_func_r:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (chroma block) at the right of a 2d array
+@*
+@* @par Description:
+@* The right column of a 2d array is replicated pad_size times at the right
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array (each colour component)
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_RIGHT_CHROMA == C
+@void ih264_pad_right_chroma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
+
+
+
+ .global ih264_pad_right_chroma_a9q
+
+ih264_pad_right_chroma_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ mov r4, r0
+ sub r6, r1, #16
+ sub r0, r0, #2
+loop_32_r_c: @ /*hard coded for width=32 ,height =8,4*/
+ ldrh r8, [r0], r1
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4
+ ldrh r11, [r0], r1
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+ beq end_func_r_c @/* Branching when ht=4*/
+
+ ldrh r8, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r9, [r0], r1
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+ beq end_func_r_c @/* Branching when ht=8*/
+ bne loop_32_r_c
+
+ ldrh r8, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r9, [r0], r1
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+end_func_r_c:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
diff --git a/common/arm/ih264_platform_macros.h b/common/arm/ih264_platform_macros.h
new file mode 100755
index 0000000..1f67403
--- /dev/null
+++ b/common/arm/ih264_platform_macros.h
@@ -0,0 +1,152 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_platform_macros.h
+*
+* @brief
+* Platform specific Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_PLATFORM_MACROS_H_
+#define _IHEVC_PLATFORM_MACROS_H_
+
+#ifndef ARMV8
+void ih264_arm_dsb(void);
+
+#define DATA_SYNC() ih264_arm_dsb()
+static __inline WORD32 CLIP_U8(WORD32 x)
+{
+ asm("usat %0, #8, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S8(WORD32 x)
+{
+ asm("ssat %0, #8, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U10(WORD32 x)
+{
+ asm("usat %0, #10, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S10(WORD32 x)
+{
+ asm("ssat %0, #10, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U12(WORD32 x)
+{
+ asm("usat %0, #12, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S12(WORD32 x)
+{
+ asm("ssat %0, #12, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U16(WORD32 x)
+{
+ asm("usat %0, #16, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+static __inline WORD32 CLIP_S16(WORD32 x)
+{
+ asm("ssat %0, #16, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+
+static __inline UWORD32 ITT_BIG_ENDIAN(UWORD32 x)
+{
+ asm("rev %0, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+#else
+#define DATA_SYNC() ;
+
+#define CLIP_U8(x) CLIP3(0, 255, (x))
+#define CLIP_S8(x) CLIP3(-128, 127, (x))
+
+#define CLIP_U10(x) CLIP3(0, 1023, (x))
+#define CLIP_S10(x) CLIP3(-512, 511, (x))
+
+#define CLIP_U12(x) CLIP3(0, 4095, (x))
+#define CLIP_S12(x) CLIP3(-2048, 2047, (x))
+
+#define CLIP_U16(x) CLIP3(0, 65535, (x))
+#define CLIP_S16(x) CLIP3(-32768, 32767, (x))
+
+#define ITT_BIG_ENDIAN(x) ((x & 0x000000ff) << 24) | \
+ ((x & 0x0000ff00) << 8) | \
+ ((x & 0x00ff0000) >> 8) | \
+ ((UWORD32)x >> 24);
+#endif
+
+#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0)
+#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0)
+
+#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift)))
+#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift))
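+
+/* Illustrative usage note: these helpers fold both shift directions into one
+   expression, e.g. SHR_NEG(x, 3) == (x >> 3) while SHR_NEG(x, -2) == (x << 2),
+   which is convenient when a shift amount such as (qp/6 - k) can change sign. */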
+
+#define INLINE inline
+
+static INLINE UWORD32 CLZ(UWORD32 u4_word)
+{
+ if(u4_word)
+ return (__builtin_clz(u4_word));
+ else
+ return 32;
+}
+static INLINE UWORD32 CTZ(UWORD32 u4_word)
+{
+ if(0 == u4_word)
+ return 31;
+ else
+ {
+ unsigned int index;
+ index = __builtin_ctz(u4_word);
+ return (UWORD32)index;
+ }
+}
+
+
+#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);}
+
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* _IHEVC_PLATFORM_MACROS_H_ */
diff --git a/common/arm/ih264_resi_trans_a9.s b/common/arm/ih264_resi_trans_a9.s
new file mode 100755
index 0000000..08821f5
--- /dev/null
+++ b/common/arm/ih264_resi_trans_a9.s
@@ -0,0 +1,604 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@*******************************************************************************
+@* @file
+@* ih264_resi_trans_a9.s
+@*
+@* @brief
+@* Contains function definitions for residual and forward trans
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@* ih264_resi_trans_4x4_a9
+@* ih264_resi_trans_8x8_a9
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+
+
+.text
+.p2align 2
+@*****************************************************************************
+@*
+@* Function Name : ih264_resi_trans_4x4_a9
+@* Description       : This function does cf4 of H264 followed by an approximate scaling
+@*
+@* Arguments :
+@ R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :src_stride
+@ STACk :pred_stride,dst_stride
+
+@* Values Returned : NONE
+@*
+@* Register Usage :
+@* Stack Usage :
+@* Cycles : Around
+@* Interruptibility  : Interruptible
+@*
+@* Known Limitations
+@* \Assumptions :
+@*
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 30 12 2009 100633 First version
+@*
+@*****************************************************************************
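+
+@ Rough C outline of what follows (an illustrative sketch; the residue is computed
+@ first, the 1-D core is applied to rows and then to columns, and each coefficient
+@ is finally widened and multiplied by an entry of g_scal_coff_h264_4x4):
+@
+@     /* residue */            d[i][j] = pu1_src[i][j] - pu1_pred[i][j];
+@
+@     /* 1-D forward core on r[0..3] */
+@     x0 = r[0] + r[3];        x1 = r[1] + r[2];
+@     x2 = r[1] - r[2];        x3 = r[0] - r[3];
+@     o[0] = x0 + x1;          o[1] = (x3 << 1) + x2;
+@     o[2] = x0 - x1;          o[3] = x3 - (x2 << 1);
+@
+@     /* scaling */            pi4_dst[i] = (WORD32)coeff[i] * g_scal_coff_h264_4x4[i];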
+
+
+ .global ih264_resi_trans_4x4_a9
+ .extern g_scal_coff_h264_4x4
+g_scal_coff_h264_4x4_addr:
+ .long g_scal_coff_h264_4x4 - 4x4lbl - 8
+
+ih264_resi_trans_4x4_a9:
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :src_stride
+ @STACk :pred_stride,dst_stride
+
+ push {r4-r12, lr} @push all the variables first
+
+ mov r6, sp
+    add           r6, r6, #40         @point r6 to the stack arguments (10 registers pushed = 40 bytes)
+ ldmfd r6, {r4-r5} @load the strides into registers
+ @R4 pred_stride
+ @R5 dst_stride
+
+
+    @the strides are given as post increments to vld1
+    @note: the dst holds 32 bit values, so the dst stride is scaled by 4 further below
+ @--------------------function loading done------------------------
+
+ @lets find residual
+ @data is like 1a -> d0[1:31] d0[32:64]
+ @ a b c d # # # #
+ vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer
+ vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer
+ @ data is like 1a -> q4[1:63] q4[64:148]
+ @ d8[1:63] d9[1:63]
+ @ a b c d # # # #
+
+ vld1.u8 d28, [r0], r3 @load row 2 of src to d28[0]
+ vld1.u8 d29, [r1], r4 @load row2 of pred to d29[0]
+
+ vld1.u8 d26, [r0], r3 @load row 3 of src to d26[0]
+ vsubl.u8 q0, d30, d31 @curr - pred for row one
+
+ vld1.u8 d27, [r1], r4 @load row 3of pred t0 d27[0]
+ vsubl.u8 q1, d28, d29 @find row 2 of src -pred to d0
+
+ vld1.u8 d24, [r0], r3 @load row 4 of src to d24[0]
+
+ vld1.u8 d25, [r1], r4 @load row 4 of src tp d25[0]
+ vsubl.u8 q2, d26, d27 @load src-pred row 3 to d[2]
+
+    lsl           r5, r5, #2          @ multiply dst stride by 4 since we are storing 32 bit values
+ ldr r6, g_scal_coff_h264_4x4_addr
+4x4lbl:
+ add r6, r6, pc @ load the address of global array
+
+ vsubl.u8 q3, d24, d25 @load row 4 of src - pred to q6
+
+ @after this
+ @D0 -> 1a
+ @D2 -> 2a
+ @D4 -> 3a
+ @D6 -> 4a
+
+ @transpose the matrix so that we can do the horizontal transform first
+ @#1 #2 #3 #4
+ @a b c d ---- D0
+ @e f g h -----D2
+ @i j k l -----D4
+ @m n o p -----D6
+ @transpose the inner 2x2 blocks
+ vtrn.16 d0, d2
+ vld1.s16 {q10}, [r6]! @ load the scaling values 0-7;
+ vtrn.16 d4, d6
+ @a e c g
+ @b f d h
+ @i m k o
+ @j n l p
+ vtrn.32 d0, d4
+ vtrn.32 d2, d6
+ @a e i m #1 -- D0 --- x4
+ @b f j n #2 -- D2 --- x5
+ @c g k o #3 -- D4 ----x6
+ @d h l p #4 -- D6 ----x7
+
+ @we have loaded the residuals into the registers , now we need to add and subtract them
+ @let us do the horiz transform first
+
+ vsub.s16 d5, d2, d4 @x2 = x5-x6
+ vsub.s16 d7, d0, d6 @x3 = x4-x7;
+
+ vadd.s16 d3, d2, d4 @x1 = x5+x6
+ vadd.s16 d1, d0, d6 @x0 = x4+x7
+
+
+ vshl.s16 d31, d7, #1 @
+ vshl.s16 d30, d5, #1 @
+
+ vadd.s16 d0, d1, d3 @x0 + x1;
+ vsub.s16 d4, d1, d3 @x0 - x1;
+
+ vadd.s16 d2, d31, d5 @U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d6, d7, d30 @x3 - U_SHIFT(x2,1,shft);
+
+ @taking transform again so as to make do vert transform
+ vtrn.16 d0, d2
+ vtrn.16 d4, d6
+
+ vtrn.32 d0, d4
+ vtrn.32 d2, d6
+
+ @let us do vertical transform
+ @same code as horiz
+
+ vadd.s16 d1, d0, d6 @x0 = x4+x7
+ vadd.s16 d3, d2, d4 @x1 = x5+x6
+ vsub.s16 d7, d0, d6 @x3 = x4-x7;
+ vsub.s16 d5, d2, d4 @x2 = x5-x6
+
+
+@Since the scaling multiplies 16 bit coefficients by 16 bit factors, the products
+@are produced directly as 32 bit values with vmull below
+
+ @VADDL.S16 Q12,D1,D3;x0 + x1
+ @VSUBL.S16 Q14,D1,D3;x0 - x1
+
+ @VSHL.S16 D8,D5,#1;
+ @VSHL.S16 D9,D7,#1;
+
+ @VADDL.S16 Q13,D9,D5 ; + x2
+ @VSUBL.S16 Q15,D7,D8 ;x3 - U_SHIFT(x2,1,shft)
+
+@scaling follows
+
+@now we need to do the scaling, so load the scaling matrix
+@multiply by the scaling coefficients; the results are kept in q4-q7
+
+ vadd.s16 d24, d3, d1 @x4 = x0 + x1
+ vsub.s16 d28, d1, d3 @x6 = x0 - x1
+
+ vshl.s16 d0, d7, #1 @ U_SHIFT(x3,1,shft)
+ vmull.s16 q4, d24, d20 @x4*s0
+
+ vshl.s16 d2, d5, #1 @ U_SHIFT(x2,1,shft)
+
+ vadd.s16 d26, d0, d5 @x5 = U_SHIFT(x3,1,shft) + x2
+ vmull.s16 q5, d26, d21 @x5*s1
+
+ vst1.s32 {q4}, [r2], r5 @save 4 pixels of row1 current buffer and increment pointer by stride
+
+ vld1.s16 {q10}, [r6] @load 8-16 scaling coeffcients
+
+ vsub.s16 d30, d7, d2 @x7 = x3 - U_SHIFT(x2,1,shft)
+
+ vmull.s16 q6, d28, d20 @x6*s2
+ vst1.s32 {q5}, [r2], r5
+
+ vmull.s16 q7, d30, d21 @x7*s3
+
+
+ vst1.s32 {q6}, [r2], r5
+ vst1.s32 {q7}, [r2]
+
+ pop {r4-r12, pc} @pop back all variables
+
+
+
+
+@*****************************************************************************
+@* Function Name : ih264_resi_trans_8x8_a9
+@* Description       : This function does cf8 followed by an approximate normalization of H264
+@*
+@* Arguments :
+@* R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :src_stride
+@                  STACK :pred_stride, dst_stride
+@*
+@*
+@* Values Returned : NONE
+@*
+@* Register Usage :
+@* Stack Usage :
+@* Cycles : Around
+@* Interruptibility  : Interruptible
+@*
+@* Known Limitations
+@* \Assumptions :
+@*
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 30 12 2009 100633 First version
+@*
+@*****************************************************************************
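+
+@ The 1-D 8-point stage used twice below, written out in C for reference
+@ (illustrative sketch matching the per-instruction comments in the code):
+@
+@     a0 = r0 + r7;  a1 = r1 + r6;  a2 = r2 + r5;  a3 = r3 + r4;
+@     b0 = r0 - r7;  b1 = r1 - r6;  b2 = r2 - r5;  b3 = r3 - r4;
+@     a4 = a0 + a3;  a5 = a1 + a2;  a6 = a0 - a3;  a7 = a1 - a2;
+@     o0 = a4 + a5;              o2 = a6 + (a7 >> 1);
+@     o4 = a4 - a5;              o6 = (a6 >> 1) - a7;
+@     b4 = b1 + b2 + ((b0 >> 1) + b0);
+@     b5 = b0 - b3 - ((b2 >> 1) + b2);
+@     b6 = b0 + b3 - ((b1 >> 1) + b1);
+@     b7 = b1 - b2 + ((b3 >> 1) + b3);
+@     o1 = b4 + (b7 >> 2);       o3 = b5 + (b6 >> 2);
+@     o5 = b6 - (b5 >> 2);       o7 = (b4 >> 2) - b7;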
+
+
+ .global ih264_resi_trans_8x8_a9
+ .extern g_scal_coff_h264_8x8
+g_scal_coff_h264_8x8_addr:
+ .long g_scal_coff_h264_8x8 - 8x8lbl - 8
+
+
+ih264_resi_trans_8x8_a9:
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :src_stride
+ @STACk :pred_stride,dst_stride
+
+ push {r4-r12, lr} @push all the variables first
+
+ mov r6, sp
+    add           r6, r6, #40         @point r6 to the stack arguments (10 registers pushed = 40 bytes)
+ ldmfd r6, {r4-r5} @load the strides into registers
+ @R4 pred_stride
+ @R5 dst_stride
+
+    @the stride is given as a post increment to vst1
+    @the dst holds 32 bit values, so the dst stride is scaled by 4 below
+ @--------------------function loading done------------------------
+
+ @lets find residual
+ @data is like 1a -> d0[1:31] d0[32:64]
+ @ a b c d # # # #
+ vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer
+ vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer
+
+ vld1.u8 d28, [r0], r3 @src rw2
+ vld1.u8 d29, [r1], r4 @pred rw2
+ vsubl.u8 q0, d30, d31 @src-pred rw1
+
+ vld1.u8 d26, [r0], r3
+ vld1.u8 d27, [r1], r4
+ vsubl.u8 q1, d28, d29
+
+ vld1.u8 d24, [r0], r3
+ vld1.u8 d25, [r1], r4
+ vsubl.u8 q2, d26, d27
+
+ vld1.u8 d22, [r0], r3
+ vld1.u8 d23, [r1], r4
+ vsubl.u8 q3, d24, d25
+
+ vld1.u8 d20, [r0], r3
+ vld1.u8 d21, [r1], r4
+ vsubl.u8 q4, d22, d23
+
+ vld1.u8 d18, [r0], r3
+ vld1.u8 d19, [r1], r4
+ vsubl.u8 q5, d20, d21
+
+ vld1.u8 d16, [r0], r3
+ vld1.u8 d17, [r1], r4
+ vsubl.u8 q6, d18, d19
+
+ lsl r5, r5, #2
+
+
+ vsubl.u8 q7, d16, d17
+
+ @after this
+ @Q0 -> 1a
+ @Q1 -> 2a
+ @Q2 -> 3a
+ @Q3 -> 4a
+ @Q4 -> 5a
+ @Q5 -> 6a
+ @Q6 -> 7a
+ @Q7 -> 8a
+
+ @transpose the matrix so that we can do the horizontal transform first
+
+ @transpose the inner 2x2 blocks
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ @transpose the inner 4x4 blocks
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+
+ @transpose the outer 8x8 blocks
+ vswp d1, d8
+ vswp d7, d14
+ vswp d3, d10
+ vswp d5, d12
+ @transpose done
+
+@@at this point we will have data in Q0-Q7
+@Q7 will be populated within 2 clock cycles
+@all others are available at this clock cycle
+
+ @we have loaded the residuals into the registers , now we need to add and subtract them
+ @let us do the horiz transform first
+
+ vadd.s16 q8, q0, q7 @ a0 = r0 + r7;
+ vadd.s16 q9, q1, q6 @ a1 = r1 + r6;
+ vadd.s16 q10, q2, q5 @ a2 = r2 + r5;
+ vadd.s16 q11, q3, q4 @ a3 = r3 + r4;
+
+ vsub.s16 q12, q0, q7 @ b0 = r0 - r7;
+ vsub.s16 q13, q1, q6 @ b1 = r1 - r6;
+ vsub.s16 q15, q3, q4 @ b3 = r3 - r4;
+ vsub.s16 q14, q2, q5 @ b2 = r2 - r5;
+
+ vadd.s16 q1, q8, q11 @ a4 = a0 + a3;
+ vadd.s16 q3, q9, q10 @ a5 = a1 + a2;
+ vsub.s16 q7, q9, q10 @ a7 = a1 - a2;
+ vsub.s16 q5, q8, q11 @ a6 = a0 - a3;
+
+ ldr r6, g_scal_coff_h264_8x8_addr
+8x8lbl:
+ add r6, r6, pc @ load the address of global array
+
+ vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5;
+ vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
+
+ vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5;
+
+ vadd.s16 q2, q5, q8 @
+
+
+ vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
+ vsub.s16 q6, q9, q7 @
+
+@do not change Q0,Q2.Q4,Q6 they contain results
+@Q1,Q3,Q5,Q7 TO STORE RESULTS
+@Q8 Q9 Q10 Q11 USE @WILL
+
+ vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft)
+ vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft)
+ vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft)
+ vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft)
+
+ vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0);
+ vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0);
+ vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2);
+ vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
+ vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft)
+ vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft);
+ vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft);
+ vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft);
+
+
+ vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
+ vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
+ vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
+ vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
+
+ @------------horiz transform done-------------------------
+ @results are in Q0-Q7
+ @all other neon registes can be used at will
+
+@doing vertical transform
+@code exact copy of horiz transform above
+
+ @transpose the inner 2x2 blocks
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ @transpose the inner 4x4 blocks
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+
+ @transpose the outer 8x8 blocks
+ vswp d1, d8
+ vswp d3, d10
+ vswp d5, d12
+ vswp d7, d14
+
+ @transpose done
+
+ vadd.s16 q8, q0, q7 @ a0 = r0 + r7;
+ vadd.s16 q9, q1, q6 @ a1 = r1 + r6;
+ vadd.s16 q10, q2, q5 @ a2 = r2 + r5;
+ vadd.s16 q11, q3, q4 @ a3 = r3 + r4;
+
+ vsub.s16 q12, q0, q7 @ b0 = r0 - r7;
+ vsub.s16 q13, q1, q6 @ b1 = r1 - r6;
+ vsub.s16 q14, q2, q5 @ b2 = r2 - r5;
+ vsub.s16 q15, q3, q4 @ b3 = r3 - r4;
+
+ vadd.s16 q1, q8, q11 @ a4 = a0 + a3;
+ vadd.s16 q3, q9, q10 @ a5 = a1 + a2;
+ vsub.s16 q5, q8, q11 @ a6 = a0 - a3;
+ vsub.s16 q7, q9, q10 @ a7 = a1 - a2;
+
+
+ vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5;
+
+ vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
+ @DSHIFT_TO_0 Q8,Q7,#1,#0
+ vadd.s16 q2, q5, q8 @
+
+ vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5;
+
+ vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
+ vsub.s16 q6, q9, q7 @
+
+@do not change Q0,Q2.Q4,Q6 they contain results
+@Q1,Q3,Q5,Q7 TO STORE RESULTS
+@Q8 Q9 Q10 Q11 USE @WILL
+
+ vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft)
+ vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft)
+ vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft)
+ vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft)
+
+
+ vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0);
+ vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0);
+ vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2);
+ vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1);
+ vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
+ vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft)
+ vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft);
+ vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft);
+ vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft);
+
+
+@since we are going to scale by small values, the results need not be widened to 32 bit here
+ vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
+ vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
+ vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
+ vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
+
+ @------------vert transform done-------------------------
+ @results are in Q0-Q7
+ @all other neon registes can be used at will
+
+ @scaling
+ @since the 8x8 scaling matrix repeats in 1x4,1x4 block ,
+ @we need only load 4 values for each row and in total 4 rows
+ vld1.s16 {q14-q15}, [r6] @
+
+ @since we need to get a 32 bit o/p for two 16 bit multiplications
+ @we need a VMULL instruction
+@-----------------------------first and second row
+
+    vmull.s16     q8, d0, d28         @scale the first row first 4 elems
+    vmull.s16     q9, d28, d1         @scale the first row last 4 elems
+
+ vmull.s16 q10, d2, d29 @ scale second row first 4 elem
+ vmull.s16 q11, d29, d3 @scale the second row last 4 elem
+ vmull.s16 q12, d4, d30 @scale third row first 4 elem
+
+ vst1.s32 {q8, q9}, [r2], r5 @ write the first row complete
+
+ vmull.s16 q13, d30, d5 @scale the third row last 4 elem
+ vmull.s16 q8, d6, d31 @scale the fourth row first 4 elem
+
+
+ vst1.s32 {q10, q11}, [r2], r5 @store the second row complete
+
+@------------------------------- 3rd and 4th row
+
+    vmull.s16     q9, d31, d7         @scale the fourth row last 4 elems
+
+ vst1.s32 {q12, q13}, [r2], r5 @store the third row complete
+
+    vmull.s16     q10, d8, d28        @scale the 5th row first 4 elems
+    vmull.s16     q11, d28, d9        @scale the 5th row last 4 elems
+
+    vmull.s16     q12, d10, d29       @scale the 6th row first 4 elems
+
+
+    vst1.s32      {q8, q9}, [r2], r5  @store the fourth row complete
+
+@--------------------------------5th and 6th row
+
+    vmull.s16     q13, d29, d11       @scale the 6th row last 4 elems
+
+    vmull.s16     q8, d12, d30        @scale the 7th row first 4 elems
+
+    vst1.s32      {q10, q11}, [r2], r5 @store the fifth row complete
+
+    vmull.s16     q9, d30, d13        @scale the 7th row last 4 elems
+    vmull.s16     q10, d14, d31       @scale the 8th row first 4 elems
+
+
+ vst1.s32 {q12, q13}, [r2], r5 @store 6th row
+
+@----------------------------------7th and 8th row
+ vmull.s16 q11, d31, d15 @scale 8th row second 4 elms
+
+ vst1.s32 {q8, q9}, [r2], r5 @store 7th row
+ vst1.s32 {q10, q11}, [r2], r5 @store 8th row
+
+@----------------------------------done writing
+
+ pop {r4-r12, pc} @pop back all variables
+
+
+
+
+
+
diff --git a/common/arm/ih264_resi_trans_quant_a9.s b/common/arm/ih264_resi_trans_quant_a9.s
new file mode 100755
index 0000000..caf362e
--- /dev/null
+++ b/common/arm/ih264_resi_trans_quant_a9.s
@@ -0,0 +1,694 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@*******************************************************************************
+@* @file
+@* ih264_resi_trans_quant_a9.s
+@*
+@* @brief
+@* Contains function definitions for residual and forward trans
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@* ih264_resi_trans_quant_4x4_a9
+@* ih264_resi_trans_quant_8x8_a9
+@* ih264_resi_trans_quant_chroma_4x4_a9
+@* ih264_hadamard_quant_4x4_a9
+@* ih264_hadamard_quant_2x2_uv_a9
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+
+
+.text
+.p2align 2
+@*****************************************************************************
+@*
+@* Function Name : ih264_resi_trans_quant_4x4_a9
+@* Description : This function does cf4 of H264
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :source stride
+@ STACK : pred stride,
+@ dst stride,
+@ pointer to scaling matrix,
+@ pointer to threshold matrix,
+@ qbits,
+@ rounding factor,
+@ pointer to store nnz
+@ pointer to store non quantized dc value
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 40 bytes
+@ Cycles : Around
+@ Interruptibility  : Interruptible
+@
+@ Known Limitations
+@ \Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 1 12 2013 100633 First version
+@ 20 1 2014 100633 Changes the API, Optimization
+@
+@*****************************************************************************
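+
+@ After the forward transform, each coefficient is quantized roughly as below
+@ (illustrative C sketch; the names are placeholders for the stack arguments
+@ listed above, not the library's C reference):
+@
+@     sign  = (coeff < 0);
+@     level = (ABS(coeff) * pu2_scale_matrix[i] + u4_round_factor) >> u4_qbits;
+@     pi2_out[i] = sign ? -level : level;
+@     /* nnz is 16 minus the count of zero levels; the transform-domain DC is
+@        also stored, before quantization, to the alternate dc address        */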
+
+ .global ih264_resi_trans_quant_4x4_a9
+ih264_resi_trans_quant_4x4_a9:
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+    @STACK :pred stride
+    @      :scale matrix,
+ @ :threshold matrix
+ @ :qbits
+ @ :round factor
+ @ :nnz
+
+ push {r4-r12, lr} @push all the variables first
+
+    add           r11, sp, #40        @point r11 to the stack arguments (10 registers pushed = 40 bytes)
+ ldmfd r11, {r4-r10} @load the strides into registers
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+ @R4 :Pred stride
+    @R5 :scale matrix,
+ @R6 :threshold matrix
+ @R7 :qbits
+ @R8 :round factor
+ @R9 :nnz
+
+ vpush {d8-d15}
+
+ mov r11, #0
+    sub           r7, r11, r7         @Negate the qbit value for using LSL
+
+    @------------Function loading done----------------;
+
+ vld1.u8 d30, [r0], r3 @load first 8 pix src row 1
+
+ vld1.u8 d31, [r1], r4 @load first 8 pix pred row 1
+
+ vld1.u8 d28, [r0], r3 @load first 8 pix src row 2
+
+ vld1.u8 d29, [r1], r4 @load first 8 pix pred row 2
+
+ vld1.u8 d26, [r0], r3 @load first 8 pix src row 3
+
+ vld1.u8 d27, [r1], r4 @load first 8 pix pred row 3
+ vsubl.u8 q0, d30, d31 @find residue row 1
+
+ vld1.u8 d24, [r0], r3 @load first 8 pix src row 4
+
+ vld1.u8 d25, [r1], r4 @load first 8 pix pred row 4
+ vsubl.u8 q1, d28, d29 @find residue row 2
+
+ vsubl.u8 q2, d26, d27 @find residue row 3
+ vsubl.u8 q3, d24, d25 @find residue row 4
+
+ vtrn.16 d0, d2 @T12
+ vtrn.16 d4, d6 @T23
+ vtrn.32 d0, d4 @T13
+ vtrn.32 d2, d6 @T14
+
+ vadd.s16 d8 , d0, d6 @x0 = x4+x7
+ vadd.s16 d9 , d2, d4 @x1 = x5+x6
+ vsub.s16 d10, d2, d4 @x2 = x5-x6
+ vsub.s16 d11, d0, d6 @x3 = x4-x7
+
+ vshl.s16 d12, d10, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d13, d11, #1 @U_SHIFT(x3,1,shft)
+
+ vadd.s16 d14, d8, d9 @x4 = x0 + x1;
+ vsub.s16 d16, d8, d9 @x6 = x0 - x1;
+ vadd.s16 d15, d13, d10 @x5 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d17, d11, d12 @x7 = x3 - U_SHIFT(x2,1,shft);
+
+ @taking transpose again so as to make do vert transform
+ vtrn.16 d14, d15 @T12
+ vtrn.16 d16, d17 @T23
+ vtrn.32 d14, d16 @T13
+ vtrn.32 d15, d17 @T24
+
+ @let us do vertical transform
+ @same code as horiz
+ vadd.s16 d18, d14, d17 @x0 = x4+x7
+ vadd.s16 d19, d15, d16 @x1 = x5+x6
+ vsub.s16 d20, d15, d16 @x2 = x5-x6
+ vsub.s16 d21, d14, d17 @x3 = x4-x7
+
+ vshl.s16 d22, d20, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d23, d21, #1 @U_SHIFT(x3,1,shft)
+
+ vdup.s32 q4, r8 @Load rounding value row 1
+
+ vadd.s16 d24, d18, d19 @x5 = x0 + x1;
+ vsub.s16 d26, d18, d19 @x7 = x0 - x1;
+ vadd.s16 d25, d23, d20 @x6 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d27, d21, d22 @x8 = x3 - U_SHIFT(x2,1,shft);
+ vdup.s32 q10, r7 @Load qbit values
+
+    vst1.s16      d24[0], [r10]       @Store the dc value to the alternate dc address
+
+@core tranform is done for 4x8 block 1
+ vld1.s16 {q14-q15}, [r5] @load the scaling values
+
+ vabs.s16 q0, q12 @Abs val of row 1 blk 1
+
+ vabs.s16 q1, q13 @Abs val of row 2 blk 1
+
+ vmov.s32 q5, q4 @copy round fact for row 2
+
+ vmov.s32 q6, q4 @copy round fact for row 2
+ vclt.s16 q2, q12, #0 @Get the sign of row 1 blk 1
+
+ vmov.s32 q7, q4 @copy round fact for row 2
+ vclt.s16 q3, q13, #0 @Get the sign of row 2 blk 1
+
+ vmlal.s16 q4, d0, d28 @Multiply and add row 1
+ vmlal.s16 q5, d1, d29 @Multiply and add row 2
+ vmlal.s16 q6, d2, d30 @Multiply and add row 3
+ vmlal.s16 q7, d3, d31 @Multiply and add row 4
+
+ vshl.s32 q11, q4, q10 @Shift row 1
+ vshl.s32 q12, q5, q10 @Shift row 2
+ vshl.s32 q13, q6, q10 @Shift row 3
+ vshl.s32 q14, q7, q10 @Shift row 4
+
+ vmovn.s32 d30, q11 @Narrow row 1
+ vmovn.s32 d31, q12 @Narrow row 2
+ vmovn.s32 d0 , q13 @Narrow row 3
+ vmovn.s32 d1 , q14 @Narrow row 4
+
+ vneg.s16 q1, q15 @Get negative
+ vneg.s16 q4, q0 @Get negative
+
+ vceq.s16 q5, q15, #0 @I compare with zero row 1 and 2 blk 1
+ vceq.s16 q6, q0 , #0 @I compare with zero row 1 and 2 blk 1
+
+ vbsl.s16 q2, q1, q15 @Restore sign of row 1 and 2
+ vbsl.s16 q3, q4, q0 @Restore sign of row 3 and 4
+
+
+ vmovn.u16 d14, q5 @I Narrow the comparison for row 1 and 2 blk 1
+ vmovn.u16 d15, q6 @I Narrow the comparison for row 1 and 2 blk 2
+
+    vshr.u8       q8, q7, #7          @I Reduce comparison to a single bit, rows 1-2 and 3-4 [ keep the value for later use ]
+
+ vpadd.u8 d18, d16, d17 @I pair add nnz 1
+ vpadd.u8 d20, d18, d19 @I Pair add nnz 2
+ vpadd.u8 d22, d20, d21 @I Pair add nnz 3
+ vpadd.u8 d24, d22, d23 @I Pair add nnz4
+ vst1.s16 {q2-q3}, [r2] @Store blk
+
+ vmov.u8 d25, #16 @I Get max nnz
+ vsub.u8 d26, d25, d24 @I invert current nnz
+
+ vst1.u8 d26[0], [r9] @I Write nnz
+
+ vpop {d8-d15}
+ pop {r4-r12, pc}
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_resi_trans_quant_chroma_4x4_a9
+@* Description : This function does residue calculation, forward transform
+@* and quantization for 4x4 chroma block.
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :source stride
+@ STACK : pred stride,
+@ dst stride,
+@ pointer to scaling matrix,
+@ pointer to threshold matrix,
+@ qbits,
+@ rounding factor,
+@ pointer to store nnz
+@ pointer to store unquantized dc values
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 40 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 11 2 2015 100664 First version
+@
+@*****************************************************************************
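+@ Hedged C sketch of the whole routine, for orientation only; helper and
+@ local names (fwd_transform_4x4, resi, trans) are illustrative and not part
+@ of this library:
+@
+@     WORD16 resi[16], trans[16];
+@     /* chroma is interleaved UV, so one plane is every second byte */
+@     for (i = 0; i < 4; i++)
+@         for (j = 0; j < 4; j++)
+@             resi[4 * i + j] = pu1_src[i * src_strd + 2 * j]
+@                             - pu1_pred[i * pred_strd + 2 * j];
+@     fwd_transform_4x4(resi, trans);  /* H.264 4x4 forward integer transform */
+@     /* trans[0] is also written out unquantized through the dc pointer */
+@     for (i = 0, nnz = 0; i < 16; i++)
+@     {
+@         sign   = (trans[i] < 0) ? -1 : 1;
+@         level  = (ABS(trans[i]) * scale_matrix[i] + round_factor) >> qbits;
+@         dst[i] = (WORD16)(sign * level);
+@         nnz   += (level != 0);
+@     }
+@     *nnz_ptr = nnz;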
+
+ .global ih264_resi_trans_quant_chroma_4x4_a9
+ih264_resi_trans_quant_chroma_4x4_a9:
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+ @STACK :pred stride
+ @ :scale matrix,
+ @ :threshold matrix
+ @ :qbits
+ @ :round factor
+ @ :nnz
+ @ :pu1_dc_alt_addr
+ push {r4-r12, lr} @push all the variables first
+
+ add r11, sp, #40 @point r11 to the stack arguments (above the 40 bytes of saved registers)
+ ldmfd r11, {r4-r10} @load the seven stack arguments into r4-r10
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+ @R4 :Pred stride
+ @R5 :scale matrix,
+ @R6 :threshold matrix
+ @R7 :qbits
+ @R8 :round factor
+ @R9 :nnz
+ vpush {d8-d15}
+ mov r11, #0
+ sub r7, r11, r7 @Negate the qbit value for using LSL
+
+ @------------Function Loading done----------------;
+
+ vld2.u8 {d10, d11}, [r0], r3 @load first 8 pix src row 1
+
+ vld2.u8 {d11, d12}, [r1], r4 @load first 8 pix pred row 1
+
+ vld2.u8 {d28, d29}, [r0], r3 @load first 8 pix src row 2
+
+ vld2.u8 {d29, d30}, [r1], r4 @load first 8 pix pred row 2
+
+ vld2.u8 {d25, d26}, [r0], r3 @load first 8 pix src row 3
+
+ vld2.u8 {d26, d27}, [r1], r4 @load first 8 pix pred row 3
+ vsubl.u8 q0, d10, d11 @find residue row 1
+
+ vld2.u8 {d22, d23}, [r0], r3 @load first 8 pix src row 4
+
+ vld2.u8 {d23, d24}, [r1], r4 @load first 8 pix pred row 4
+ vsubl.u8 q1, d28, d29 @find residue row 2
+
+ vsubl.u8 q2, d25, d26 @find residue row 3
+ vsubl.u8 q3, d22, d23 @find residue row 4
+
+ vtrn.16 d0, d2 @T12
+ vtrn.16 d4, d6 @T23
+ vtrn.32 d0, d4 @T13
+ vtrn.32 d2, d6 @T14
+
+ vadd.s16 d8 , d0, d6 @x0 = x4+x7
+ vadd.s16 d9 , d2, d4 @x1 = x5+x6
+ vsub.s16 d10, d2, d4 @x2 = x5-x6
+ vsub.s16 d11, d0, d6 @x3 = x4-x7
+
+ vshl.s16 d12, d10, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d13, d11, #1 @U_SHIFT(x3,1,shft)
+
+ vadd.s16 d14, d8, d9 @x4 = x0 + x1;
+ vsub.s16 d16, d8, d9 @x6 = x0 - x1;
+ vadd.s16 d15, d13, d10 @x5 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d17, d11, d12 @x7 = x3 - U_SHIFT(x2,1,shft);
+
+ @taking transpose again so as to make do vert transform
+ vtrn.16 d14, d15 @T12
+ vtrn.16 d16, d17 @T23
+ vtrn.32 d14, d16 @T13
+ vtrn.32 d15, d17 @T24
+
+ @let us do vertical transform
+ @same code as horiz
+ vadd.s16 d18, d14, d17 @x0 = x4+x7
+ vadd.s16 d19, d15, d16 @x1 = x5+x6
+ vsub.s16 d20, d15, d16 @x2 = x5-x6
+ vsub.s16 d21, d14, d17 @x3 = x4-x7
+
+ vshl.s16 d22, d20, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d23, d21, #1 @U_SHIFT(x3,1,shft)
+
+ vdup.s32 q4, r8 @Load rounding value row 1
+
+ vadd.s16 d24, d18, d19 @x5 = x0 + x1;
+ vsub.s16 d26, d18, d19 @x7 = x0 - x1;
+ vadd.s16 d25, d23, d20 @x6 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d27, d21, d22 @x8 = x3 - U_SHIFT(x2,1,shft);
+ vdup.s32 q10, r7 @Load qbit values
+
+ vst1.s16 d24[0], [r10] @Store unquantized dc value to dc alternate address
+
+@core transform is done for the 4x4 block
+ vld1.s16 {q14-q15}, [r5] @load the scaling values
+
+ vabs.s16 q0, q12 @Abs val of rows 1,2
+
+ vabs.s16 q1, q13 @Abs val of rows 3,4
+
+ vmov.s32 q5, q4 @copy round fact for row 2
+
+ vmov.s32 q6, q4 @copy round fact for row 3
+ vclt.s16 q2, q12, #0 @Get the sign of rows 1,2
+
+ vmov.s32 q7, q4 @copy round fact for row 4
+ vclt.s16 q3, q13, #0 @Get the sign of rows 3,4
+
+ vmlal.s16 q4, d0, d28 @Multiply and add row 1
+ vmlal.s16 q5, d1, d29 @Multiply and add row 2
+ vmlal.s16 q6, d2, d30 @Multiply and add row 3
+ vmlal.s16 q7, d3, d31 @Multiply and add row 4
+
+ vshl.s32 q11, q4, q10 @Shift row 1
+ vshl.s32 q12, q5, q10 @Shift row 2
+ vshl.s32 q13, q6, q10 @Shift row 3
+ vshl.s32 q14, q7, q10 @Shift row 4
+
+ vmovn.s32 d30, q11 @Narrow row 1
+ vmovn.s32 d31, q12 @Narrow row 2
+ vmovn.s32 d0 , q13 @Narrow row 3
+ vmovn.s32 d1 , q14 @Narrow row 4
+
+ vneg.s16 q1, q15 @Get negative
+ vneg.s16 q4, q0 @Get negative
+
+ vceq.s16 q5, q15, #0 @I compare with zero row 1 and 2 blk 1
+ vceq.s16 q6, q0 , #0 @I compare with zero row 3 and 4 blk 1
+
+ vbsl.s16 q2, q1, q15 @Restore sign of row 1 and 2
+ vbsl.s16 q3, q4, q0 @Restore sign of row 3 and 4
+
+ vmovn.u16 d14, q5 @I Narrow the comparison for row 1 and 2 blk 1
+ vmovn.u16 d15, q6 @I Narrow the comparison for row 3 and 4 blk 1
+
+ vshr.u8 q8, q7, #7 @I Reduce comparison result to a single bit for rows 1-4 [ keep the value for later use ]
+
+ vpadd.u8 d18, d16, d17 @I pair add nnz 1
+ vpadd.u8 d20, d18, d19 @I Pair add nnz 2
+ vpadd.u8 d22, d20, d21 @I Pair add nnz 3
+ vpadd.u8 d24, d22, d23 @I Pair add nnz4
+ vst1.s16 {q2-q3}, [r2] @Store blk
+
+ vmov.u8 d25, #16 @I Get max nnz
+ vsub.u8 d26, d25, d24 @I invert current nnz
+
+ vst1.u8 d26[0], [r9] @I Write nnz
+
+ vpop {d8-d15}
+ pop {r4-r12, pc}
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_hadamard_quant_4x4_a9
+@* Description : This function does forward hadamard transform and
+@* quantization for luma dc block
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to dst buffer
+@ R2 :pu2_scale_matrix
+@ R3 :pu2_threshold_matrix
+@ STACK : u4_qbits
+@ u4_round_factor
+@ pu1_nnz
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 0 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 20 2 2015 100633 First version
+@
+@*****************************************************************************
+@ih264_hadamard_quant_4x4_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
+@ const UWORD16 *pu2_scale_matrix,
+@ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
+@ UWORD32 u4_round_factor,UWORD8 *pu1_nnz
+@ )
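+@ Hedged C sketch of the operation (illustrative only; note that only
+@ pu2_scale_matrix[0] is used and the threshold matrix pointer is not
+@ referenced by this routine):
+@
+@     /* 4x4 Hadamard of the 16 luma DC terms: horizontal pass, then a
+@        vertical pass whose final butterfly is halved, e.g. (a + b) >> 1 */
+@     hadamard_4x4(pi2_src, tmp);
+@     for (i = 0, nnz = 0; i < 16; i++)
+@     {
+@         sign       = (tmp[i] < 0) ? -1 : 1;
+@         level      = (ABS(tmp[i]) * pu2_scale_matrix[0] + u4_round_factor)
+@                          >> u4_qbits;
+@         pi2_dst[i] = (WORD16)(sign * level);
+@         nnz       += (level != 0);
+@     }
+@     *pu1_nnz = nnz;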
+ .global ih264_hadamard_quant_4x4_a9
+ih264_hadamard_quant_4x4_a9:
+
+@Register usage
+@ r0 : src
+@ r1 : dst
+@ r2 : *pu2_scale_matrix
+@ r3 : *pu2_threshold_matrix
+
+ vld4.s16 {d0, d1, d2, d3}, [r0]! @Load 4x4 block
+ vpush {d8-d15}
+
+ vld1.u16 d30[0], [r2] @load pu2_scale_matrix[0]
+
+ vaddl.s16 q3, d0, d3 @x0 = x4 + x7;
+ vaddl.s16 q4, d1, d2 @x1 = x5 + x6;
+ vsubl.s16 q5, d1, d2 @x2 = x5 - x6;
+ vsubl.s16 q6, d0, d3 @x3 = x4 - x7;
+
+ vdup.u16 d30, d30[0] @pu2_scale_matrix[0]
+
+ vadd.s32 q7, q3, q4 @pi2_dst[0] = x0 + x1;
+ vadd.s32 q8, q6, q5 @pi2_dst[1] = x3 + x2;
+ add r3, sp, #68 @Get address of u4_round_factor
+ vsub.s32 q9, q3, q4 @pi2_dst[2] = x0 - x1;
+ vsub.s32 q10, q6, q5 @pi2_dst[3] = x3 - x2;
+
+ vtrn.s32 q7, q8 @transpose 4x4 block
+ vtrn.s32 q9, q10
+ vld1.s32 d0[0], [r3] @load u4_round_factor
+ vswp d15, d18
+ vswp d17, d20
+
+ add r3, sp, #64 @Get address of u4_qbits
+ vadd.s32 q11, q7, q10 @x0 = x4 + x7;
+ vadd.s32 q12, q8, q9 @x1 = x5 + x6;
+ vld1.s32 d31[0], [r3] @load u4_qbits
+ vsub.s32 q13, q8, q9 @x2 = x5 - x6;
+ vsub.s32 q14, q7, q10 @x3 = x4 - x7;
+
+ vdup.s32 q7, d0[0] @u4_round_factor
+
+ vadd.s32 q0, q11, q12 @(x0 + x1)
+ vadd.s32 q1, q14, q13 @(x3 + x2)
+ vsub.s32 q2, q11, q12 @(x0 - x1)
+ vsub.s32 q3, q14, q13 @(x3 - x2)
+
+ vdup.s32 q11, d31[0] @u4_qbits
+
+ vshrn.s32 d0, q0, #1 @i4_value = (x0 + x1) >> 1;
+ vshrn.s32 d1, q1, #1 @i4_value = (x3 + x2) >> 1;
+ vshrn.s32 d2, q2, #1 @i4_value = (x0 - x1) >> 1;
+ vshrn.s32 d3, q3, #1 @i4_value = (x3 - x2) >> 1;
+
+ vabs.s16 q5, q0
+ vabs.s16 q6, q1
+
+ vmov.s32 q8, q7 @Get the round fact
+ vmov.s32 q9, q7
+ vmov.s32 q10, q7
+
+ vclt.s16 q3, q0, #0 @get the sign row 1,2
+ vclt.s16 q4, q1, #0
+
+ vneg.s32 q11, q11 @-u4_qbits (negated for right shift)
+
+ vmlal.u16 q7, d10, d30
+ vmlal.u16 q8, d11, d30
+ vmlal.u16 q9, d12, d30
+ vmlal.u16 q10, d13, d30
+
+ vshl.u32 q7, q7, q11
+ vshl.u32 q8, q8, q11
+ vshl.u32 q9, q9, q11
+ vshl.u32 q10, q10, q11
+
+ vqmovn.u32 d22, q7
+ vqmovn.u32 d23, q8
+ vqmovn.u32 d24, q9
+ vqmovn.u32 d25, q10
+
+ vneg.s16 q13, q11
+ vneg.s16 q14, q12
+
+ vbsl.s16 q3, q13, q11
+ vbsl.s16 q4, q14, q12
+
+ vceq.s16 q5, q11, #0
+ vceq.s16 q6, q12, #0
+
+ vst1.s16 {q3}, [r1]!
+
+ vshrn.u16 d14, q5, #8
+ vshrn.u16 d15, q6, #8
+
+ ldr r3, [sp, #72] @Load *pu1_nnz
+
+ vshr.u8 q7, q7, #7
+
+ vst1.s16 {q4}, [r1]!
+
+ vadd.u8 d16, d14, d15
+ vmov.u8 d20, #16
+ vpadd.u8 d17, d16, d16
+ vpadd.u8 d18, d17, d17
+ vpadd.u8 d19, d18, d18
+ vsub.u8 d20, d20, d19
+ vst1.u8 d20[0], [r3]
+
+ vpop {d8-d15}
+ bx lr
+
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_hadamard_quant_2x2_uv_a9
+@* Description : This function does forward hadamard transform and
+@* quantization for the chroma dc blocks of both planes
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to dst buffer
+@ R2 :pu2_scale_matrix
+@ R3 :pu2_threshold_matrix
+@ STACK : u4_qbits
+@ u4_round_factor
+@ pu1_nnz
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 0 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 20 2 2015 100633 First version
+@
+@*****************************************************************************
+@ ih264_hadamard_quant_2x2_uv_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
+@ const UWORD16 *pu2_scale_matrix,
+@ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
+@ UWORD32 u4_round_factor,UWORD8 *pu1_nnz
+@ )
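+@ Hedged C sketch of the operation (illustrative only). pi2_src holds the
+@ interleaved U/V DC terms; each plane gets a 2x2 Hadamard followed by the
+@ same scale/round/shift quantization with pu2_scale_matrix[0], and one nnz
+@ byte per plane is written through pu1_nnz (output ordering illustrative):
+@
+@     for (plane = 0; plane < 2; plane++)
+@     {
+@         /* c0..c3: the plane's four DC terms */
+@         y[0] = c0 + c1 + c2 + c3;    y[1] = c0 - c1 + c2 - c3;
+@         y[2] = c0 + c1 - c2 - c3;    y[3] = c0 - c1 - c2 + c3;
+@         for (i = 0, nnz = 0; i < 4; i++)
+@         {
+@             sign  = (y[i] < 0) ? -1 : 1;
+@             level = (ABS(y[i]) * pu2_scale_matrix[0] + u4_round_factor)
+@                         >> u4_qbits;
+@             pi2_dst[4 * plane + i] = (WORD16)(sign * level);
+@             nnz  += (level != 0);
+@         }
+@         pu1_nnz[plane] = nnz;
+@     }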
+
+ .global ih264_hadamard_quant_2x2_uv_a9
+ih264_hadamard_quant_2x2_uv_a9:
+
+ vpush {d8-d15}
+ vld2.s16 {d0-d1}, [r0] @load src
+
+ add r3, sp, #68 @Get address of u4_round_factor
+
+ vaddl.s16 q3, d0, d1 @x0 = x4 + x5;, x2 = x6 + x7;
+ vld1.u16 d30[0], [r2] @load pu2_scale_matrix[0]
+ vsubl.s16 q4, d0, d1 @x1 = x4 - x5; x3 = x6 - x7;
+
+ add r0, sp, #64 @Get address of u4_qbits
+ vld1.s32 d28[0], [r3] @load u4_round_factor
+ vtrn.s32 q3, q4 @q3 -> x0 x1, q4 -> x2 x3
+
+ vadd.s32 q0, q3, q4 @ (x0 + x2) (x1 + x3) (y0 + y2); (y1 + y3);
+ vld1.s32 d24[0], [r0] @load u4_qbits
+ vsub.s32 q1, q3, q4 @ (x0 - x2) (x1 - x3) (y0 - y2); (y1 - y3);
+
+ vdup.u16 d30, d30[0] @pu2_scale_matrix
+
+ vabs.s32 q2, q0
+ vabs.s32 q3, q1
+
+ vdup.s32 q14, d28[0] @u4_round_factor
+
+ vmovl.u16 q15, d30 @pu2_scale_matrix
+
+ vclt.s32 q4, q0, #0 @get the sign row 1,2
+ vdup.s32 q12, d24[0] @u4_qbits
+ vclt.s32 q5, q1, #0
+
+ vqmovn.u32 d8, q4
+ vqmovn.s32 d9, q5
+
+ vmov.s32 q13, q14 @Get the round fact
+ vneg.s32 q12, q12 @-u4_qbits (negated for right shift)
+
+ vmla.u32 q13, q2, q15
+ vmla.u32 q14, q3, q15
+
+ vshl.u32 q13, q13, q12 @>>qbit
+ vshl.u32 q14, q14, q12 @>>qbit
+
+ vqmovn.u32 d10, q13
+ vqmovn.u32 d11, q14
+
+ vneg.s16 q6, q5
+
+ vbsl.s16 q4, q6, q5 @*sign
+
+ vtrn.s32 d8, d9
+
+ vceq.s16 q7, q4, #0 @Compute nnz
+
+ vshrn.u16 d14, q7, #8 @reduce nnz comparison to 1 bit
+
+ ldr r3, [sp, #72] @Load *pu1_nnz
+ vshr.u8 d14, d14, #7 @reduce nnz comparison to 1 bit
+ vmov.u8 d20, #4 @Since we sum the zero flags, subtract from 4 to get nnz
+ vpadd.u8 d17, d14, d14 @Sum up nnz
+
+ vst1.s16 {q4}, [r1]! @Store the block
+
+ vpadd.u8 d17, d17, d17 @Sum up nnz
+ vsub.u8 d20, d20, d17 @4- numzeros
+ vst1.u16 d20[0], [r3] @store nnz
+
+ vpop {d8-d15}
+ bx lr
+
+
+
+
+
diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s
new file mode 100755
index 0000000..ccae779
--- /dev/null
+++ b/common/arm/ih264_weighted_bi_pred_a9q.s
@@ -0,0 +1,642 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_weighted_bi_pred_a9q.s
+@*
+@* @brief
+@* Contains function definitions for weighted biprediction.
+@*
+@* @author
+@* Kaushik Senthoor R
+@*
+@* @par List of Functions:
+@*
+@* - ih264_weighted_bi_pred_luma_a9q()
+@* - ih264_weighted_bi_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@*******************************************************************************
+@* @function
+@* ih264_weighted_bi_pred_luma_a9q()
+@*
+@* @brief
+@* This routine performs the weighted biprediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates the weighted samples,
+@* rounds off, adds offset and stores it in the destination block.
+@*
+@* @param[in] pu1_src1
+@* UWORD8 Pointer to the buffer containing the input block 1.
+@*
+@* @param[in] pu1_src2
+@* UWORD8 Pointer to the buffer containing the input block 2.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the input buffer 1
+@*
+@* @param[in] src_strd2
+@* Stride of the input buffer 2
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt1
+@* weight for the weighted prediction
+@*
+@* @param[in] wt2
+@* weight for the weighted prediction
+@*
+@* @param[in] ofst1
+@* offset 1 used after rounding off
+@*
+@* @param[in] ofst2
+@* offset 2 used after rounding off
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt1,
+@ WORD32 wt2,
+@ WORD32 ofst1,
+@ WORD32 ofst2,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => log_wd (r6)
+@ [sp+12] => wt1 (r7)
+@ [sp+16] => wt2 (r8)
+@ [sp+20] => ofst1 (r9)
+@ [sp+24] => ofst2 (r10)
+@ [sp+28] => ht (r11)
+@ [sp+32] => wd (r12)
+@
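+@ Hedged C sketch of the per-sample computation below (illustrative only;
+@ CLIP_U8 denotes clipping to [0, 255]):
+@
+@     ofst = (ofst1 + ofst2 + 1) >> 1;
+@     for (every sample i of the ht x wd block)
+@     {
+@         pred       = (pu1_src1[i] * wt1 + pu1_src2[i] * wt2
+@                           + (1 << log_wd)) >> (log_wd + 1);
+@         pu1_dst[i] = CLIP_U8(pred + ofst);
+@     }
+@ The rounding right shift is implemented with vrshl by -(log_wd + 1).
+@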
+.text
+.p2align 2
+
+ .global ih264_weighted_bi_pred_luma_a9q
+
+ih264_weighted_bi_pred_luma_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r6, [sp, #48] @Load log_wd in r6
+ ldr r7, [sp, #52] @Load wt1 in r7
+ ldr r8, [sp, #56] @Load wt2 in r8
+ ldr r9, [sp, #60] @Load ofst1 in r9
+
+ add r6, r6, #1 @r6 = log_wd + 1
+ sxtb r7, r7 @sign-extend 8-bit wt1 to 32-bit
+ ldr r4, [sp, #40] @Load src_strd2 in r4
+ ldr r5, [sp, #44] @Load dst_strd in r5
+ sxtb r9, r9 @sign-extend 8-bit ofst1 to 32-bit
+ rsb r10, r6, #0 @r10 = -(log_wd + 1)
+ ldr r11, [sp, #68] @Load ht in r11
+ ldr r12, [sp, #72] @Load wd in r12
+ vdup.16 q0, r10 @Q0 = -(log_wd + 1) (16-bit)
+ add r9, r9, #1 @r9 = ofst1 + 1
+
+ ldr r10, [sp, #64] @Load ofst2 in r10
+ sxtb r8, r8 @sign-extend 8-bit wt2 to 32-bit
+ cmp r12, #16 @check if wd is 16
+ vpush {d8-d15}
+ sxtb r10, r10 @sign-extend 8-bit ofst2 to 32-bit
+ add r9, r9, r10 @r9 = ofst1 + ofst2 + 1
+ vmov d2, r7, r8 @D2 = {wt1(32-bit), wt2(32-bit)}
+ asr r9, r9, #1 @r9 = ofst = (ofst1 + ofst2 + 1) >> 1
+ vdup.8 d3, r9 @D3 = ofst (8-bit)
+ beq loop_16 @branch if wd is 16
+
+ cmp r12, #8 @check if wd is 8
+ beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+ vld1.32 d4[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d4[1], [r0], r3 @load row 2 in source 1
+ vld1.32 d6[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d6[1], [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q2, d4 @converting rows 1,2 in source 1 to 16-bit
+ vld1.32 d8[0], [r0], r3 @load row 3 in source 1
+ vld1.32 d8[1], [r0], r3 @load row 4 in source 1
+ vmovl.u8 q3, d6 @converting rows 1,2 in source 2 to 16-bit
+ vld1.32 d10[0], [r1], r4 @load row 3 in source 2
+ vld1.32 d10[1], [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d8 @converting rows 3,4 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting rows 3,4 in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for rows 1,2
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for rows 1,2
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for rows 3,4
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for rows 3,4
+
+ subs r11, r11, #4 @decrement ht by 4
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from rows 1,2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from rows 3,4
+
+ vaddw.s8 q2, q2, d3 @adding offset for rows 1,2
+ vaddw.s8 q4, q4, d3 @adding offset for rows 3,4
+
+ vqmovun.s16 d4, q2 @saturating rows 1,2 to unsigned 8-bit
+ vqmovun.s16 d8, q4 @saturating rows 3,4 to unsigned 8-bit
+
+ vst1.32 d4[0], [r2], r5 @store row 1 in destination
+ vst1.32 d4[1], [r2], r5 @store row 2 in destination
+ vst1.32 d8[0], [r2], r5 @store row 3 in destination
+ vst1.32 d8[1], [r2], r5 @store row 4 in destination
+
+ bgt loop_4 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: @each iteration processes four rows
+
+ vld1.8 d4, [r0], r3 @load row 1 in source 1
+ vld1.8 d6, [r1], r4 @load row 1 in source 2
+ vld1.8 d8, [r0], r3 @load row 2 in source 1
+ vld1.8 d10, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q2, d4 @converting row 1 in source 1 to 16-bit
+ vld1.8 d12, [r0], r3 @load row 3 in source 1
+ vld1.8 d14, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q3, d6 @converting row 1 in source 2 to 16-bit
+ vld1.8 d16, [r0], r3 @load row 4 in source 1
+ vld1.8 d18, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d8 @converting row 2 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting row 2 in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for row 1
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for row 1
+ vmovl.u8 q6, d12 @converting row 3 in source 1 to 16-bit
+ vmovl.u8 q7, d14 @converting row 3 in source 2 to 16-bit
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for row 2
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for row 2
+ vmovl.u8 q8, d16 @converting row 4 in source 1 to 16-bit
+ vmovl.u8 q9, d18 @converting row 4 in source 2 to 16-bit
+
+ vmul.s16 q6, q6, d2[0] @weight 1 mult. for row 3
+ vmla.s16 q6, q7, d2[2] @weight 2 mult. for row 3
+ vmul.s16 q8, q8, d2[0] @weight 1 mult. for row 4
+ vmla.s16 q8, q9, d2[2] @weight 2 mult. for row 4
+
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 3
+ vaddw.s8 q2, q2, d3 @adding offset for row 1
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 4
+ vaddw.s8 q4, q4, d3 @adding offset for row 2
+
+ vaddw.s8 q6, q6, d3 @adding offset for row 3
+ vqmovun.s16 d4, q2 @saturating row 1 to unsigned 8-bit
+ vaddw.s8 q8, q8, d3 @adding offset for row 4
+ vqmovun.s16 d8, q4 @saturating row 2 to unsigned 8-bit
+
+ vqmovun.s16 d12, q6 @saturating row 3 to unsigned 8-bit
+ vqmovun.s16 d16, q8 @saturating row 4 to unsigned 8-bit
+
+ vst1.8 d4, [r2], r5 @store row 1 in destination
+ vst1.8 d8, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 d12, [r2], r5 @store row 3 in destination
+ vst1.8 d16, [r2], r5 @store row 4 in destination
+
+ bgt loop_8 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: @each iteration processes two rows
+
+ vld1.8 {q2}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q3}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q4}, [r0], r3 @load row 2 in source 1
+ vld1.8 {q5}, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q10, d4 @converting row 1L in source 1 to 16-bit
+ vld1.8 {q6}, [r0], r3 @load row 3 in source 1
+ vld1.8 {q7}, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q11, d6 @converting row 1L in source 2 to 16-bit
+ vld1.8 {q8}, [r0], r3 @load row 4 in source 1
+ vld1.8 {q9}, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q2, d5 @converting row 1H in source 1 to 16-bit
+ vmovl.u8 q3, d7 @converting row 1H in source 2 to 16-bit
+
+ vmul.s16 q10, q10, d2[0] @weight 1 mult. for row 1L
+ vmla.s16 q10, q11, d2[2] @weight 2 mult. for row 1L
+ vmovl.u8 q12, d8 @converting row 2L in source 1 to 16-bit
+ vmovl.u8 q13, d10 @converting row 2L in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for row 1H
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for row 1H
+ vmovl.u8 q4, d9 @converting row 2H in source 1 to 16-bit
+ vmovl.u8 q5, d11 @converting row 2H in source 2 to 16-bit
+
+ vmul.s16 q12, q12, d2[0] @weight 1 mult. for row 2L
+ vmla.s16 q12, q13, d2[2] @weight 2 mult. for row 2L
+ vmovl.u8 q14, d12 @converting row 3L in source 1 to 16-bit
+ vmovl.u8 q15, d14 @converting row 3L in source 2 to 16-bit
+
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for row 2H
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for row 2H
+ vmovl.u8 q6, d13 @converting row 3H in source 1 to 16-bit
+ vmovl.u8 q7, d15 @converting row 3H in source 2 to 16-bit
+
+ vmul.s16 q14, q14, d2[0] @weight 1 mult. for row 3L
+ vmla.s16 q14, q15, d2[2] @weight 2 mult. for row 3L
+ vmovl.u8 q11, d16 @converting row 4L in source 1 to 16-bit
+ vmovl.u8 q3, d18 @converting row 4L in source 2 to 16-bit
+
+ vmul.s16 q6, q6, d2[0] @weight 1 mult. for row 3H
+ vmla.s16 q6, q7, d2[2] @weight 2 mult. for row 3H
+ vmovl.u8 q8, d17 @converting row 4H in source 1 to 16-bit
+ vmovl.u8 q9, d19 @converting row 4H in source 2 to 16-bit
+
+ vmul.s16 q11, q11, d2[0] @weight 1 mult. for row 4L
+ vmla.s16 q11, q3, d2[2] @weight 2 mult. for row 4L
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 1L
+
+ vmul.s16 q8, q8, d2[0] @weight 1 mult. for row 4H
+ vmla.s16 q8, q9, d2[2] @weight 2 mult. for row 4H
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1H
+
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 2L
+ vaddw.s8 q10, q10, d3 @adding offset for row 1L
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2H
+ vaddw.s8 q2, q2, d3 @adding offset for row 1H
+ vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 3L
+ vaddw.s8 q12, q12, d3 @adding offset for row 2L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 3H
+ vaddw.s8 q4, q4, d3 @adding offset for row 2H
+ vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 4L
+ vaddw.s8 q14, q14, d3 @adding offset for row 3L
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 4H
+ vaddw.s8 q6, q6, d3 @adding offset for row 3H
+
+ vqmovun.s16 d26, q10 @saturating row 1L to unsigned 8-bit
+ vaddw.s8 q11, q11, d3 @adding offset for row 4L
+ vqmovun.s16 d27, q2 @saturating row 1H to unsigned 8-bit
+ vaddw.s8 q8, q8, d3 @adding offset for row 4H
+
+ vqmovun.s16 d10, q12 @saturating row 2L to unsigned 8-bit
+ vqmovun.s16 d11, q4 @saturating row 2H to unsigned 8-bit
+ vqmovun.s16 d30, q14 @saturating row 3L to unsigned 8-bit
+ vqmovun.s16 d31, q6 @saturating row 3H to unsigned 8-bit
+ vst1.8 {q13}, [r2], r5 @store row 1 in destination
+ vqmovun.s16 d14, q11 @saturating row 4L to unsigned 8-bit
+ vqmovun.s16 d15, q8 @saturating row 4H to unsigned 8-bit
+
+ vst1.8 {q5}, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 {q15}, [r2], r5 @store row 3 in destination
+ vst1.8 {q7}, [r2], r5 @store row 4 in destination
+
+ bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from sp
+
+
+@*******************************************************************************
+@* @function
+@* ih264_weighted_bi_pred_chroma_a9q()
+@*
+@* @brief
+@* This routine performs the weighted biprediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates the weighted samples,
+@* rounds off, adds offset and stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src1
+@* UWORD8 Pointer to the buffer containing the input block 1.
+@*
+@* @param[in] pu1_src2
+@* UWORD8 Pointer to the buffer containing the input block 2.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the input buffer 1
+@*
+@* @param[in] src_strd2
+@* Stride of the input buffer 2
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt1
+@* weights for the weighted prediction in U and V
+@*
+@* @param[in] wt2
+@* weights for the weighted prediction in U and V
+@*
+@* @param[in] ofst1
+@* offset 1 used after rounding off for U and V
+@*
+@* @param[in] ofst2
+@* offset 2 used after rounding off for U and V
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt1,
+@ WORD32 wt2,
+@ WORD32 ofst1,
+@ WORD32 ofst2,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => log_wd (r6)
+@ [sp+12] => wt1 (r7)
+@ [sp+16] => wt2 (r8)
+@ [sp+20] => ofst1 (r9)
+@ [sp+24] => ofst2 (r10)
+@ [sp+28] => ht (r11)
+@ [sp+32] => wd (r12)
+@
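+@ Hedged C sketch (illustrative only). wt1 and wt2 pack the U and V weights
+@ in their low and high halfwords, ofst1 and ofst2 pack the U and V offsets
+@ in their low and high bytes; the interleaved UV samples then follow the
+@ same formula as luma, per component c (U or V):
+@
+@     ofst_c = (ofst1_c + ofst2_c + 1) >> 1;
+@     pred_c = (src1_c * wt1_c + src2_c * wt2_c + (1 << log_wd)) >> (log_wd + 1);
+@     dst_c  = CLIP_U8(pred_c + ofst_c);
+@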
+
+
+ .global ih264_weighted_bi_pred_chroma_a9q
+
+ih264_weighted_bi_pred_chroma_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r6, [sp, #48] @Load log_wd in r6
+ ldr r7, [sp, #52] @Load wt1 in r7
+ ldr r8, [sp, #56] @Load wt2 in r8
+ add r6, r6, #1 @r6 = log_wd + 1
+ ldr r9, [sp, #60] @Load ofst1 in r9
+ ldr r10, [sp, #64] @Load ofst2 in r10
+
+ rsb r12, r6, #0 @r12 = -(log_wd + 1)
+ ldr r4, [sp, #40] @Load src_strd2 in r4
+ ldr r5, [sp, #44] @Load dst_strd in r5
+ vdup.16 q0, r12 @Q0 = -(log_wd + 1) (16-bit)
+
+ ldr r11, [sp, #68] @Load ht in r11
+ vdup.32 q1, r7 @Q1 = (wt1_u, wt1_v) (32-bit)
+ ldr r12, [sp, #72] @Load wd in r12
+ vdup.32 q2, r8 @Q2 = (wt2_u, wt2_v) (32-bit)
+ asr r7, r9, #8 @r7 = ofst1_v
+ asr r8, r10, #8 @r8 = ofst2_v
+ vpush {d8-d15}
+ sxtb r9, r9 @sign-extend 8-bit ofst1_u to 32-bit
+ sxtb r10, r10 @sign-extend 8-bit ofst2_u to 32-bit
+ sxtb r7, r7 @sign-extend 8-bit ofst1_v to 32-bit
+ sxtb r8, r8 @sign-extend 8-bit ofst2_v to 32-bit
+
+ add r9, r9, #1 @r9 = ofst1_u + 1
+ add r7, r7, #1 @r7 = ofst1_v + 1
+ add r9, r9, r10 @r9 = ofst1_u + ofst2_u + 1
+ add r7, r7, r8 @r7 = ofst1_v + ofst2_v + 1
+ asr r9, r9, #1 @r9 = ofst_u = (ofst1_u + ofst2_u + 1) >> 1
+ asr r7, r7, #1 @r7 = ofst_v = (ofst1_v + ofst2_v + 1) >> 1
+ cmp r12, #8 @check if wd is 8
+ pkhbt r9, r9, r7, lsl #16 @r9 = {ofst_u(16-bit), ofst_v(16-bit)}
+ vdup.32 q3, r9 @Q3 = {ofst_u(16-bit), ofst_v(16-bit)}
+ beq loop_8_uv @branch if wd is 8
+
+ cmp r12, #4 @check if wd is 4
+ beq loop_4_uv @branch if wd is 4
+
+loop_2_uv: @each iteration processes two rows
+
+ vld1.32 d8[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d8[1], [r0], r3 @load row 2 in source 1
+ vld1.32 d10[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d10[1], [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q4, d8 @converting rows 1,2 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting rows 1,2 in source 2 to 16-bit
+
+ vmul.s16 q4, q4, q1 @weight 1 mult. for rows 1,2
+ vmla.s16 q4, q5, q2 @weight 2 mult. for rows 1,2
+
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from rows 1,2
+
+ vadd.s16 q4, q4, q3 @adding offset for rows 1,2
+
+ vqmovun.s16 d8, q4 @saturating rows 1,2 to unsigned 8-bit
+
+ vst1.32 d8[0], [r2], r5 @store row 1 in destination
+ vst1.32 d8[1], [r2], r5 @store row 2 in destination
+
+ subs r11, r11, #2 @decrement ht by 2
+ bgt loop_2_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_4_uv: @each iteration processes two rows
+
+ vld1.8 d8, [r0], r3 @load row 1 in source 1
+ vld1.8 d10, [r1], r4 @load row 1 in source 2
+ vmovl.u8 q4, d8 @converting row 1 in source 1 to 16-bit
+ vld1.8 d12, [r0], r3 @load row 2 in source 1
+ vmovl.u8 q5, d10 @converting row 1 in source 2 to 16-bit
+ vld1.8 d14, [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q6, d12 @converting row 2 in source 1 to 16-bit
+ vmul.s16 q4, q4, q1 @weight 1 mult. for row 1
+ vmla.s16 q4, q5, q2 @weight 2 mult. for row 1
+ vmovl.u8 q7, d14 @converting row 2 in source 2 to 16-bit
+
+ vmul.s16 q6, q6, q1 @weight 1 mult. for row 2
+ vmla.s16 q6, q7, q2 @weight 2 mult. for row 2
+
+ subs r11, r11, #2 @decrement ht by 2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 2
+ vadd.s16 q4, q4, q3 @adding offset for row 1
+ vadd.s16 q6, q6, q3 @adding offset for row 2
+
+ vqmovun.s16 d8, q4 @saturating row 1 to unsigned 8-bit
+ vqmovun.s16 d12, q6 @saturating row 2 to unsigned 8-bit
+
+ vst1.8 d8, [r2], r5 @store row 1 in destination
+ vst1.8 d12, [r2], r5 @store row 2 in destination
+
+ bgt loop_4_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_8_uv: @each iteration processes two rows
+
+ vld1.8 {q4}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q5}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q6}, [r0], r3 @load row 2 in source 1
+ vld1.8 {q7}, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q12, d8 @converting row 1L in source 1 to 16-bit
+ vld1.8 {q8}, [r0], r3 @load row 3 in source 1
+ vld1.8 {q9}, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q13, d10 @converting row 1L in source 2 to 16-bit
+ vld1.8 {q10}, [r0], r3 @load row 4 in source 1
+ vld1.8 {q11}, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d9 @converting row 1H in source 1 to 16-bit
+ vmovl.u8 q5, d11 @converting row 1H in source 2 to 16-bit
+
+ vmul.s16 q12, q12, q1 @weight 1 mult. for row 1L
+ vmla.s16 q12, q13, q2 @weight 2 mult. for row 1L
+ vmovl.u8 q14, d12 @converting row 2L in source 1 to 16-bit
+ vmovl.u8 q15, d14 @converting row 2L in source 2 to 16-bit
+
+ vmul.s16 q4, q4, q1 @weight 1 mult. for row 1H
+ vmla.s16 q4, q5, q2 @weight 2 mult. for row 1H
+ vmovl.u8 q6, d13 @converting row 2H in source 1 to 16-bit
+ vmovl.u8 q7, d15 @converting row 2H in source 2 to 16-bit
+
+ vmul.s16 q14, q14, q1 @weight 1 mult. for row 2L
+ vmla.s16 q14, q15, q2 @weight 2 mult. for row 2L
+ vmovl.u8 q13, d16 @converting row 3L in source 1 to 16-bit
+ vmovl.u8 q5, d18 @converting row 3L in source 2 to 16-bit
+
+ vmul.s16 q6, q6, q1 @weight 1 mult. for row 2H
+ vmla.s16 q6, q7, q2 @weight 2 mult. for row 2H
+ vmovl.u8 q8, d17 @converting row 3H in source 1 to 16-bit
+ vmovl.u8 q9, d19 @converting row 3H in source 2 to 16-bit
+
+ vmul.s16 q13, q13, q1 @weight 1 mult. for row 3L
+ vmla.s16 q13, q5, q2 @weight 2 mult. for row 3L
+ vmovl.u8 q15, d20 @converting row 4L in source 1 to 16-bit
+ vmovl.u8 q7, d22 @converting row 4L in source 2 to 16-bit
+
+ vmul.s16 q8, q8, q1 @weight 1 mult. for row 3H
+ vmla.s16 q8, q9, q2 @weight 2 mult. for row 3H
+ vmovl.u8 q10, d21 @converting row 4H in source 1 to 16-bit
+ vmovl.u8 q11, d23 @converting row 4H in source 2 to 16-bit
+
+ vmul.s16 q15, q15, q1 @weight 1 mult. for row 4L
+ vmla.s16 q15, q7, q2 @weight 2 mult. for row 4L
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 1L
+
+ vmul.s16 q10, q10, q1 @weight 1 mult. for row 4H
+ vmla.s16 q10, q11, q2 @weight 2 mult. for row 4H
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 1H
+
+ vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 2L
+ vadd.s16 q12, q12, q3 @adding offset for row 1L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 2H
+ vadd.s16 q4, q4, q3 @adding offset for row 1H
+ vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 3L
+ vadd.s16 q14, q14, q3 @adding offset for row 2L
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 3H
+ vadd.s16 q6, q6, q3 @adding offset for row 2H
+ vrshl.s16 q15, q15, q0 @rounds off the weighted samples from row 4L
+ vadd.s16 q13, q13, q3 @adding offset for row 3L
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 4H
+ vadd.s16 q8, q8, q3 @adding offset for row 3H
+
+ vqmovun.s16 d10, q12 @saturating row 1L to unsigned 8-bit
+ vadd.s16 q15, q15, q3 @adding offset for row 4L
+ vqmovun.s16 d11, q4 @saturating row 1H to unsigned 8-bit
+ vadd.s16 q10, q10, q3 @adding offset for row 4H
+
+ vqmovun.s16 d18, q14 @saturating row 2L to unsigned 8-bit
+ vqmovun.s16 d19, q6 @saturating row 2H to unsigned 8-bit
+ vqmovun.s16 d14, q13 @saturating row 3L to unsigned 8-bit
+ vqmovun.s16 d15, q8 @saturating row 3H to unsigned 8-bit
+ vst1.8 {q5}, [r2], r5 @store row 1 in destination
+ vqmovun.s16 d22, q15 @saturating row 4L to unsigned 8-bit
+ vqmovun.s16 d23, q10 @saturating row 4H to unsigned 8-bit
+
+ vst1.8 {q9}, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 {q7}, [r2], r5 @store row 3 in destination
+ vst1.8 {q11}, [r2], r5 @store row 4 in destination
+
+ bgt loop_8_uv @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from sp
+
+
diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s
new file mode 100755
index 0000000..1ce94d0
--- /dev/null
+++ b/common/arm/ih264_weighted_pred_a9q.s
@@ -0,0 +1,479 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_weighted_pred_a9q.s
+@*
+@* @brief
+@* Contains function definitions for weighted prediction.
+@*
+@* @author
+@* Kaushik Senthoor R
+@*
+@* @par List of Functions:
+@*
+@* - ih264_weighted_pred_luma_a9q()
+@* - ih264_weighted_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@*******************************************************************************
+@* @function
+@* ih264_weighted_pred_luma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
+@*
+@* @par Description:
+@* This function gets a ht x wd block, calculates the weighted sample, rounds
+@* off, adds offset and stores it in the destination block.
+@*
+@* @param[in] pu1_src:
+@* UWORD8 Pointer to the buffer containing the input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd
+@* Stride of the input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt
+@* weight for the weighted prediction
+@*
+@* @param[in] ofst
+@* offset used after rounding off
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_pred_luma_a9q(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt,
+@ WORD32 ofst,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src
+@ r1 => pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ [sp] => log_wd (r4)
+@ [sp+4] => wt (r5)
+@ [sp+8] => ofst (r6)
+@ [sp+12] => ht (r7)
+@ [sp+16] => wd (r8)
+@
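+@ Hedged C sketch of the per-sample computation below (illustrative only;
+@ CLIP_U8 denotes clipping to [0, 255]):
+@
+@     if (log_wd >= 1)
+@         pred = (pu1_src[i] * wt + (1 << (log_wd - 1))) >> log_wd;
+@     else
+@         pred = pu1_src[i] * wt;
+@     pu1_dst[i] = CLIP_U8(pred + ofst);
+@ Both cases are covered by a single vrshl by -log_wd (a rounding right
+@ shift, which is a plain copy when log_wd is 0).
+@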
+.text
+.p2align 2
+
+ .global ih264_weighted_pred_luma_a9q
+
+ih264_weighted_pred_luma_a9q:
+
+ stmfd sp!, {r4-r9, r14} @stack stores the values of the arguments
+ ldr r5, [sp, #32] @Load wt
+ ldr r4, [sp, #28] @Load log_wd in r4
+ ldr r6, [sp, #36] @Load ofst
+ ldr r7, [sp, #40] @Load ht
+ ldr r8, [sp, #44] @Load wd
+ vpush {d8-d15}
+
+ vdup.16 d2, r5 @D2 = wt (16-bit)
+ rsb r9, r4, #0 @r9 = -log_wd
+ vdup.8 d3, r6 @D3 = ofst (8-bit)
+ cmp r8, #16 @check if wd is 16
+ vdup.16 q0, r9 @Q0 = -log_wd (16-bit)
+ beq loop_16 @branch if wd is 16
+
+ cmp r8, #8 @check if wd is 8
+ beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+ vld1.32 d4[0], [r0], r2 @load row 1 in source
+ vld1.32 d4[1], [r0], r2 @load row 2 in source
+ vld1.32 d6[0], [r0], r2 @load row 3 in source
+ vld1.32 d6[1], [r0], r2 @load row 4 in source
+
+ vmovl.u8 q2, d4 @converting rows 1,2 to 16-bit
+ vmovl.u8 q3, d6 @converting rows 3,4 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight mult. for rows 1,2
+ vmul.s16 q3, q3, d2[0] @weight mult. for rows 3,4
+
+ subs r7, r7, #4 @decrement ht by 4
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from rows 1,2
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from rows 3,4
+
+ vaddw.s8 q2, q2, d3 @adding offset for rows 1,2
+ vaddw.s8 q3, q3, d3 @adding offset for rows 3,4
+
+ vqmovun.s16 d4, q2 @saturating rows 1,2 to unsigned 8-bit
+ vqmovun.s16 d6, q3 @saturating rows 3,4 to unsigned 8-bit
+
+ vst1.32 d4[0], [r1], r3 @store row 1 in destination
+ vst1.32 d4[1], [r1], r3 @store row 2 in destination
+ vst1.32 d6[0], [r1], r3 @store row 3 in destination
+ vst1.32 d6[1], [r1], r3 @store row 4 in destination
+
+ bgt loop_4 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: @each iteration processes four rows
+
+ vld1.8 d4, [r0], r2 @load row 1 in source
+ vld1.8 d6, [r0], r2 @load row 2 in source
+ vld1.8 d8, [r0], r2 @load row 3 in source
+ vmovl.u8 q2, d4 @converting row 1 to 16-bit
+ vld1.8 d10, [r0], r2 @load row 4 in source
+ vmovl.u8 q3, d6 @converting row 2 to 16-bit
+
+ vmovl.u8 q4, d8 @converting row 3 to 16-bit
+ vmul.s16 q2, q2, d2[0] @weight mult. for row 1
+ vmovl.u8 q5, d10 @converting row 4 to 16-bit
+ vmul.s16 q3, q3, d2[0] @weight mult. for row 2
+ vmul.s16 q4, q4, d2[0] @weight mult. for row 3
+ vmul.s16 q5, q5, d2[0] @weight mult. for row 4
+
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from row 2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 3
+ vaddw.s8 q2, q2, d3 @adding offset for row 1
+ vrshl.s16 q5, q5, q0 @rounds off the weighted samples from row 4
+ vaddw.s8 q3, q3, d3 @adding offset for row 2
+
+ vaddw.s8 q4, q4, d3 @adding offset for row 3
+ vqmovun.s16 d4, q2 @saturating row 1 to unsigned 8-bit
+ vaddw.s8 q5, q5, d3 @adding offset for row 4
+ vqmovun.s16 d6, q3 @saturating row 2 to unsigned 8-bit
+ vqmovun.s16 d8, q4 @saturating row 3 to unsigned 8-bit
+ vqmovun.s16 d10, q5 @saturating row 4 to unsigned 8-bit
+
+ vst1.8 d4, [r1], r3 @store row 1 in destination
+ vst1.8 d6, [r1], r3 @store row 2 in destination
+ subs r7, r7, #4 @decrement ht by 4
+ vst1.8 d8, [r1], r3 @store row 3 in destination
+ vst1.8 d10, [r1], r3 @store row 4 in destination
+
+ bgt loop_8 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: @each iteration processes two rows
+
+ vld1.8 {q2}, [r0], r2 @load row 1 in source
+ vld1.8 {q3}, [r0], r2 @load row 2 in source
+ vmovl.u8 q6, d4 @converting row 1L to 16-bit
+ vld1.8 {q4}, [r0], r2 @load row 3 in source
+ vmovl.u8 q7, d5 @converting row 1H to 16-bit
+ vld1.8 {q5}, [r0], r2 @load row 4 in source
+
+ vmovl.u8 q8, d6 @converting row 2L to 16-bit
+ vmul.s16 q6, q6, d2[0] @weight mult. for row 1L
+ vmovl.u8 q9, d7 @converting row 2H to 16-bit
+ vmul.s16 q7, q7, d2[0] @weight mult. for row 1H
+ vmovl.u8 q10, d8 @converting row 3L to 16-bit
+ vmul.s16 q8, q8, d2[0] @weight mult. for row 2L
+ vmovl.u8 q11, d9 @converting row 3H to 16-bit
+ vmul.s16 q9, q9, d2[0] @weight mult. for row 2H
+ vmovl.u8 q12, d10 @converting row 4L to 16-bit
+ vmul.s16 q10, q10, d2[0] @weight mult. for row 3L
+ vmovl.u8 q13, d11 @converting row 4H to 16-bit
+ vmul.s16 q11, q11, d2[0] @weight mult. for row 3H
+
+ vmul.s16 q12, q12, d2[0] @weight mult. for row 4L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 1L
+ vmul.s16 q13, q13, d2[0] @weight mult. for row 4H
+
+ vrshl.s16 q7, q7, q0 @rounds off the weighted samples from row 1H
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 2L
+ vaddw.s8 q6, q6, d3 @adding offset for row 1L
+ vrshl.s16 q9, q9, q0 @rounds off the weighted samples from row 2H
+ vaddw.s8 q7, q7, d3 @adding offset for row 1H
+ vqmovun.s16 d4, q6 @saturating row 1L to unsigned 8-bit
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 3L
+ vaddw.s8 q8, q8, d3 @adding offset for row 2L
+ vqmovun.s16 d5, q7 @saturating row 1H to unsigned 8-bit
+ vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 3H
+ vaddw.s8 q9, q9, d3 @adding offset for row 2H
+ vqmovun.s16 d6, q8 @saturating row 2L to unsigned 8-bit
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 4L
+ vaddw.s8 q10, q10, d3 @adding offset for row 3L
+ vqmovun.s16 d7, q9 @saturating row 2H to unsigned 8-bit
+ vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 4H
+ vaddw.s8 q11, q11, d3 @adding offset for row 3H
+
+ vqmovun.s16 d8, q10 @saturating row 3L to unsigned 8-bit
+ vaddw.s8 q12, q12, d3 @adding offset for row 4L
+ vqmovun.s16 d9, q11 @saturating row 3H to unsigned 8-bit
+ vaddw.s8 q13, q13, d3 @adding offset for row 4H
+
+ vqmovun.s16 d10, q12 @saturating row 4L to unsigned 8-bit
+ vst1.8 {q2}, [r1], r3 @store row 1 in destination
+ vqmovun.s16 d11, q13 @saturating row 4H to unsigned 8-bit
+ vst1.8 {q3}, [r1], r3 @store row 2 in destination
+ subs r7, r7, #4 @decrement ht by 4
+ vst1.8 {q4}, [r1], r3 @store row 3 in destination
+ vst1.8 {q5}, [r1], r3 @store row 4 in destination
+
+ bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r9, r15} @Reload the registers from sp
+
+
+@*******************************************************************************
+@* @function
+@* ih264_weighted_pred_chroma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@* This function gets a ht x wd block, calculates the weighted sample, rounds
+@* off, adds offset and stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src:
+@* UWORD8 Pointer to the buffer containing the input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd
+@* Stride of the input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt
+@* weights for the weighted prediction for U and V
+@*
+@* @param[in] ofst
+@* offsets used after rounding off for U and V
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_pred_chroma_a9q(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt,
+@ WORD32 ofst,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src
+@ r1 => pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ [sp] => log_wd (r4)
+@ [sp+4] => wt (r5)
+@ [sp+8] => ofst (r6)
+@ [sp+12] => ht (r7)
+@ [sp+16] => wd (r8)
+@
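+@ Hedged C sketch (illustrative only). wt packs the U and V weights in its
+@ low and high halfwords, ofst packs the U and V offsets in its low and high
+@ bytes; each interleaved UV sample follows the luma formula per component c:
+@
+@     pred_c = (pu1_src[i] * wt_c + (1 << (log_wd - 1))) >> log_wd;  /* log_wd >= 1 */
+@     pu1_dst[i] = CLIP_U8(pred_c + ofst_c);
+@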
+
+
+ .global ih264_weighted_pred_chroma_a9q
+
+ih264_weighted_pred_chroma_a9q:
+
+ stmfd sp!, {r4-r9, r14} @stack stores the values of the arguments
+
+ ldr r4, [sp, #28] @Load log_wd in r4
+ ldr r5, [sp, #32] @Load wt = {wt_u (16-bit), wt_v (16-bit)}
+ ldr r6, [sp, #36] @Load ofst = {ofst_u (8-bit), ofst_v (8-bit)}
+ ldr r8, [sp, #44] @Load wd
+
+ rsb r9, r4, #0 @r9 = -log_wd
+ vdup.32 q1, r5 @Q1 = {wt_u (16-bit), wt_v (16-bit)}
+ ldr r7, [sp, #40] @Load ht
+ vpush {d8-d15}
+ vdup.16 d4, r6 @D4 = {ofst_u (8-bit), ofst_v (8-bit)}
+ cmp r8, #8 @check if wd is 8
+ vdup.16 q0, r9 @Q0 = -log_wd (16-bit)
+ beq loop_8_uv @branch if wd is 8
+
+ cmp r8, #4 @check if wd is 4
+ beq loop_4_uv @branch if wd is 4
+
+loop_2_uv: @each iteration processes two rows
+
+ vld1.32 d6[0], [r0], r2 @load row 1 in source
+ vld1.32 d6[1], [r0], r2 @load row 2 in source
+
+ vmovl.u8 q3, d6 @converting rows 1,2 to 16-bit
+
+ vmul.s16 q3, q3, q1 @weight mult. for rows 1,2
+
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from rows 1,2
+
+ vaddw.s8 q3, q3, d4 @adding offset for rows 1,2
+
+ vqmovun.s16 d6, q3 @saturating rows 1,2 to unsigned 8-bit
+
+ subs r7, r7, #2 @decrement ht by 2
+ vst1.32 d6[0], [r1], r3 @store row 1 in destination
+ vst1.32 d6[1], [r1], r3 @store row 2 in destination
+
+ bgt loop_2_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_4_uv: @each iteration processes two rows
+
+ vld1.8 d6, [r0], r2 @load row 1 in source
+ vld1.8 d8, [r0], r2 @load row 2 in source
+
+ vmovl.u8 q3, d6 @converting row 1 to 16-bit
+ vmovl.u8 q4, d8 @converting row 2 to 16-bit
+
+ vmul.s16 q3, q3, q1 @weight mult. for row 1
+ vmul.s16 q4, q4, q1 @weight mult. for row 2
+
+ subs r7, r7, #2 @decrement ht by 2
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2
+
+ vaddw.s8 q3, q3, d4 @adding offset for row 1
+ vaddw.s8 q4, q4, d4 @adding offset for row 2
+
+ vqmovun.s16 d6, q3 @saturating row 1 to unsigned 8-bit
+ vqmovun.s16 d8, q4 @saturating row 2 to unsigned 8-bit
+
+ vst1.8 d6, [r1], r3 @store row 1 in destination
+ vst1.8 d8, [r1], r3 @store row 2 in destination
+
+ bgt loop_4_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_8_uv: @each iteration processes two rows
+
+ vld1.8 {q3}, [r0], r2 @load row 1 in source
+ vld1.8 {q4}, [r0], r2 @load row 2 in source
+ vmovl.u8 q7, d6 @converting row 1L to 16-bit
+ vld1.8 {q5}, [r0], r2 @load row 3 in source
+ vmovl.u8 q8, d7 @converting row 1H to 16-bit
+ vld1.8 {q6}, [r0], r2 @load row 4 in source
+
+ vmul.s16 q7, q7, q1 @weight mult. for row 1L
+ vmovl.u8 q9, d8 @converting row 2L to 16-bit
+ vmul.s16 q8, q8, q1 @weight mult. for row 1H
+ vmovl.u8 q10, d9 @converting row 2H to 16-bit
+ vmul.s16 q9, q9, q1 @weight mult. for row 2L
+ vmovl.u8 q11, d10 @converting row 3L to 16-bit
+ vmul.s16 q10, q10, q1 @weight mult. for row 2H
+ vmovl.u8 q12, d11 @converting row 3H to 16-bit
+ vmul.s16 q11, q11, q1 @weight mult. for row 3L
+ vmovl.u8 q13, d12 @converting row 4L to 16-bit
+ vmul.s16 q12, q12, q1 @weight mult. for row 3H
+ vmovl.u8 q14, d13 @converting row 4H to 16-bit
+
+ vmul.s16 q13, q13, q1 @weight mult. for row 4L
+ vrshl.s16 q7, q7, q0 @rounds off the weighted samples from row 1L
+ vmul.s16 q14, q14, q1 @weight mult. for row 4H
+
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 1H
+ vrshl.s16 q9, q9, q0 @rounds off the weighted samples from row 2L
+ vaddw.s8 q7, q7, d4 @adding offset for row 1L
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 2H
+ vaddw.s8 q8, q8, d4 @adding offset for row 1H
+ vqmovun.s16 d6, q7 @saturating row 1L to unsigned 8-bit
+ vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 3L
+ vaddw.s8 q9, q9, d4 @adding offset for row 2L
+ vqmovun.s16 d7, q8 @saturating row 1H to unsigned 8-bit
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 3H
+ vaddw.s8 q10, q10, d4 @adding offset for row 2H
+ vqmovun.s16 d8, q9 @saturating row 2L to unsigned 8-bit
+ vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 4L
+ vaddw.s8 q11, q11, d4 @adding offset for row 3L
+ vqmovun.s16 d9, q10 @saturating row 2H to unsigned 8-bit
+ vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 4H
+ vaddw.s8 q12, q12, d4 @adding offset for row 3H
+
+ vqmovun.s16 d10, q11 @saturating row 3L to unsigned 8-bit
+ vaddw.s8 q13, q13, d4 @adding offset for row 4L
+ vqmovun.s16 d11, q12 @saturating row 3H to unsigned 8-bit
+ vaddw.s8 q14, q14, d4 @adding offset for row 4H
+
+ vqmovun.s16 d12, q13 @saturating row 4L to unsigned 8-bit
+ vst1.8 {q3}, [r1], r3 @store row 1 in destination
+ vqmovun.s16 d13, q14 @saturating row 4H to unsigned 8-bit
+ vst1.8 {q4}, [r1], r3 @store row 2 in destination
+ subs r7, r7, #4 @decrement ht by 4
+ vst1.8 {q5}, [r1], r3 @store row 3 in destination
+ vst1.8 {q6}, [r1], r3 @store row 4 in destination
+
+ bgt loop_8_uv @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r9, r15} @Reload the registers from sp
+
+