path: root/encoder/arm
author     Hamsalekha S <hamsalekha.s@ittiam.com>  2015-03-13 21:24:58 +0530
committer  Hamsalekha S <hamsalekha.s@ittiam.com>  2015-04-02 15:59:02 +0530
commit     8d3d303c7942ced6a987a52db8977d768dc3605f (patch)
tree       cc806c96794356996b13ba9970941d0aed74a97e /encoder/arm
parent     3956d913d37327dcb340f836e604b04bd478b158 (diff)
Initial version
Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
Diffstat (limited to 'encoder/arm')
-rwxr-xr-x  encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s       313
-rwxr-xr-x  encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s         529
-rwxr-xr-x  encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s     346
-rwxr-xr-x  encoder/arm/ih264e_fmt_conv.s                            329
-rwxr-xr-x  encoder/arm/ih264e_function_selector.c                   170
-rwxr-xr-x  encoder/arm/ih264e_function_selector_a9q.c               252
-rwxr-xr-x  encoder/arm/ih264e_function_selector_av8.c               259
-rwxr-xr-x  encoder/arm/ih264e_half_pel.s                            951
-rwxr-xr-x  encoder/arm/ih264e_platform_macros.h                     143
-rwxr-xr-x  encoder/arm/ime_distortion_metrics_a9q.s                1353
-rwxr-xr-x  encoder/arm/ime_platform_macros.h                         51
11 files changed, 4696 insertions, 0 deletions
diff --git a/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s b/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s
new file mode 100755
index 0000000..fe0ce17
--- /dev/null
+++ b/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s
@@ -0,0 +1,313 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+
+@/**
+@******************************************************************************
+@*
+@* @brief :Evaluate the best intra 16x16 mode (among VERT, HORZ and DC)
+@*                and do the prediction.
+@*
+@* @par Description
+@*   This function evaluates the first three 16x16 modes, computes the corresponding SADs
+@*   and returns the buffer predicted with the best mode.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@** @param[in] pu1_ngbr_pels_i16
+@* UWORD8 pointer to neighbouring pels
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] u4_n_avblty
+@* availability of neighbouring pixels
+@*
+@* @param[in] u4_intra_mode
+@* Pointer to the variable in which best mode is returned
+@*
+@* @param[in] pu4_sadmin
+@* Pointer to the variable in which minimum sad is returned
+@*
+@* @param[in] u4_valid_intra_modes
+@*  Indicates which of the above modes are valid
+@*
+@*
+@* @return none
+@*
+@******************************************************************************
+@*/
+@
+@void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
+@ UWORD8 *pu1_ngbr_pels_i16,
+@ UWORD8 *pu1_dst,
+@ UWORD32 src_strd,
+@ UWORD32 dst_strd,
+@ WORD32 u4_n_avblty,
+@ UWORD32 *u4_intra_mode,
+@ WORD32 *pu4_sadmin,
+@ UWORD32 u4_valid_intra_modes)
+@
+.text
+.p2align 2
+
+ .global ih264e_evaluate_intra16x16_modes_a9q
+
+ih264e_evaluate_intra16x16_modes_a9q:
+
+@r0 = pu1_src,
+@r1 = pu1_ngbr_pels_i16,
+@r2 = pu1_dst,
+@r3 = src_strd,
+@r4 = dst_strd,
+@r5 = u4_n_avblty,
+@r6 = u4_intra_mode,
+@r7 = pu4_sadmin
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ ldr r5, [sp, #44]
+
+
+ vpush {d8-d15}
+ vld1.32 {q4}, [r1]!
+ sub r6, r1, #1
+ add r1, r1, #1
+ mov r10, #0
+ vld1.32 {q5}, [r1]!
+ mov r11, #0
+ mov r4, #0
+ @/* Left available ????
+ ands r7, r5, #01
+ movne r10, #1
+
+ @/* Top available ????
+ ands r8, r5, #04
+ lsl r9, r10, #3
+ movne r11, #1
+ lsl r12, r11, #3
+ adds r8, r9, r12
+
+
+ @/* None available :(
+ moveq r4, #128
+
+
+
+@/* Finding DC value */
+ @----------------------
+ vaddl.u8 q15, d8, d9
+
+ vaddl.u8 q14, d10, d11
+
+ vadd.u16 q15, q14, q15
+ @ VLD1.32 {q2},[r0],r3;row 2
+ vadd.u16 d30, d31, d30
+ vpadd.u16 d30, d30
+ @ VLD1.32 {q3},[r0],r3 ;row 3
+ vpadd.u16 d30, d30
+ @---------------------
+
+
+ vmov.u16 r7, d30[0]
+ add r7, r7, r8
+ add r11, r11, #3
+ add r8, r10, r11
+
+ lsr r7, r8
+ add r7, r4, r7
+ vld1.32 {q0}, [r0], r3 @ source r0w 0
+ vdup.8 q15, r7 @dc val
+
+@/* computing SADs for all three modes*/
+ ldrb r7, [r6]
+ vdup.8 q10, r7 @/HORIZONTAL VALUE ROW=0;
+ @/vertical row 0;
+ vabdl.u8 q8, d0, d10
+ vabdl.u8 q9, d1, d11
+ sub r6, r6, #1
+ @/HORZ row 0;
+ vabdl.u8 q13, d0, d20
+ vabdl.u8 q14, d1, d21
+ mov r1, #15
+ @/dc row 0;
+ vabdl.u8 q11, d0, d30
+ vabdl.u8 q12, d1, d31
+
+
+loop:
+ vld1.32 {q1}, [r0], r3 @row i
+ @/dc row i;
+ vabal.u8 q11, d2, d30
+ ldrb r7, [r6]
+ vabal.u8 q12, d3, d31
+
+ @/vertical row i;
+ vabal.u8 q8, d2, d10
+ vdup.8 q10, r7 @/HORIZONTAL VALUE ROW=i;
+ sub r6, r6, #1
+ vabal.u8 q9, d3, d11
+
+ subs r1, r1, #1
+ @/HORZ row i;
+ vabal.u8 q13, d2, d20
+ vabal.u8 q14, d3, d21
+ bne loop
+
+ @------------------------------------------------------------------------------
+
+ vadd.i16 q9, q9, q8 @/VERT
+ vadd.i16 d18, d19, d18 @/VERT
+ vpaddl.u16 d18, d18 @/VERT
+ vadd.i16 q14, q13, q14 @/HORZ
+ vadd.i16 d28, d29, d28 @/HORZ
+ vpaddl.u32 d18, d18 @/VERT
+ vpaddl.u16 d28, d28 @/HORZ
+
+ vpaddl.u32 d28, d28 @/HORZ
+ vmov.u32 r8, d18[0] @ vert
+ vadd.i16 q12, q11, q12 @/DC
+ vmov.u32 r9, d28[0] @horz
+ mov r11, #1
+ vadd.i16 d24, d24, d25 @/DC
+ lsl r11 , #30
+
+ @-----------------------
+ ldr r0, [sp, #120] @ u4_valid_intra_modes
+ @--------------------------------------------
+ ands r7, r0, #01 @ vert mode valid????????????
+ moveq r8, r11
+ vpaddl.u16 d24, d24 @/DC
+
+ ands r6, r0, #02 @ horz mode valid????????????
+ moveq r9, r11
+ vpaddl.u32 d24, d24 @/DC
+
+ vmov.u32 r10, d24[0] @dc
+@--------------------------------
+ ldr r4, [sp, #104] @r4 = dst_strd,
+ ldr r7, [sp, #116] @r7 = pu4_sadmin
+@----------------------------------------------
+ ands r6, r0, #04 @ dc mode valid????????????
+ moveq r10, r11
+
+ @---------------------------
+ ldr r6, [sp, #112] @ R6 =MODE
+ @--------------------------
+
+ cmp r8, r9
+ bgt not_vert
+ cmp r8, r10
+ bgt do_dc
+
+ @/----------------------
+ @DO VERTICAL PREDICTION
+ str r8 , [r7] @MIN SAD
+ mov r8, #0
+ str r8 , [r6] @ MODE
+ vmov q15, q5
+
+ b do_dc_vert
+ @-----------------------------
+not_vert:
+ cmp r9, r10
+ bgt do_dc
+
+ @/----------------------
+ @DO HORIZONTAL
+ vdup.8 q5, d9[7] @0
+ str r9 , [r7] @MIN SAD
+ vdup.8 q6, d9[6] @1
+ mov r9, #1
+ vdup.8 q7, d9[5] @2
+ vst1.32 {d10, d11} , [r2], r4 @0
+ vdup.8 q8, d9[4] @3
+ str r9 , [r6] @ MODE
+ vdup.8 q9, d9[3] @4
+ vst1.32 {d12, d13} , [r2], r4 @1
+ vdup.8 q10, d9[2] @5
+ vst1.32 {d14, d15} , [r2], r4 @2
+ vdup.8 q11, d9[1] @6
+ vst1.32 {d16, d17} , [r2], r4 @3
+ vdup.8 q12, d9[0] @7
+ vst1.32 {d18, d19} , [r2], r4 @4
+ vdup.8 q13, d8[7] @8
+ vst1.32 {d20, d21} , [r2], r4 @5
+ vdup.8 q14, d8[6] @9
+ vst1.32 {d22, d23} , [r2], r4 @6
+ vdup.8 q15, d8[5] @10
+ vst1.32 {d24, d25} , [r2], r4 @7
+ vdup.8 q1, d8[4] @11
+ vst1.32 {d26, d27} , [r2], r4 @8
+ vdup.8 q2, d8[3] @12
+ vst1.32 {d28, d29} , [r2], r4 @9
+ vdup.8 q3, d8[2] @13
+ vst1.32 {d30, d31}, [r2], r4 @10
+ vdup.8 q5, d8[1] @14
+ vst1.32 {d2, d3} , [r2], r4 @11
+ vdup.8 q6, d8[0] @15
+ vst1.32 {d4, d5} , [r2], r4 @12
+
+ vst1.32 {d6, d7} , [r2], r4 @13
+
+ vst1.32 {d10, d11} , [r2], r4 @14
+
+ vst1.32 {d12, d13} , [r2], r4 @15
+ b end_func
+
+
+ @/-----------------------------
+
+do_dc: @/---------------------------------
+ @DO DC
+ str r10 , [r7] @MIN SAD
+ mov r10, #2
+ str r10 , [r6] @ MODE
+do_dc_vert:
+ vst1.32 {d30, d31}, [r2], r4 @0
+ vst1.32 {d30, d31}, [r2], r4 @1
+ vst1.32 {d30, d31}, [r2], r4 @2
+ vst1.32 {d30, d31}, [r2], r4 @3
+ vst1.32 {d30, d31}, [r2], r4 @4
+ vst1.32 {d30, d31}, [r2], r4 @5
+ vst1.32 {d30, d31}, [r2], r4 @6
+ vst1.32 {d30, d31}, [r2], r4 @7
+ vst1.32 {d30, d31}, [r2], r4 @8
+ vst1.32 {d30, d31}, [r2], r4 @9
+ vst1.32 {d30, d31}, [r2], r4 @10
+ vst1.32 {d30, d31}, [r2], r4 @11
+ vst1.32 {d30, d31}, [r2], r4 @12
+ vst1.32 {d30, d31}, [r2], r4 @13
+ vst1.32 {d30, d31}, [r2], r4 @14
+ vst1.32 {d30, d31}, [r2], r4 @15
+ @/------------------
+end_func:
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
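Editor's note (not part of the patch): the decision logic the NEON routine above implements can be summarised in plain C as "build the VERT, HORZ and DC predictions, accumulate a 16x16 SAD for each, skip modes not flagged in u4_valid_intra_modes, and report the winner and its SAD". The sketch below is an illustrative reference only; the function and parameter names are assumptions, not the library's API.

/* Hedged C sketch of the 16x16 mode decision; not the shipped implementation. */
#include <stdint.h>

static uint32_t sad_16x16(const uint8_t *src, int src_strd, const uint8_t *pred)
{
    uint32_t sad = 0;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
        {
            int d = src[y * src_strd + x] - pred[y * 16 + x];
            sad += (d < 0) ? -d : d;
        }
    return sad;
}

/* valid_modes bit0 = VERT, bit1 = HORZ, bit2 = DC, mirroring u4_valid_intra_modes */
static int pick_intra16x16_mode(const uint8_t *src, int src_strd,
                                const uint8_t pred[3][16 * 16],
                                uint32_t valid_modes, uint32_t *pu4_sadmin)
{
    int best_mode = -1;
    uint32_t best_sad = UINT32_MAX;
    for (int mode = 0; mode < 3; mode++)
    {
        if (!(valid_modes & (1u << mode)))
            continue;
        uint32_t sad = sad_16x16(src, src_strd, pred[mode]);
        if (sad < best_sad)
        {
            best_sad = sad;
            best_mode = mode;
        }
    }
    *pu4_sadmin = best_sad;
    return best_mode;
}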
diff --git a/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s b/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s
new file mode 100755
index 0000000..568e623
--- /dev/null
+++ b/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s
@@ -0,0 +1,529 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+
+@/**
+
+.data
+.p2align 2
+
+scratch_intrapred_luma_4x4_prediction:
+ .long ver, hor, d_c, dia_dl
+ .long dia_dr, ver_r, hor_d, ver_l
+ .long hor_u
+
+
+.text
+.p2align 2
+
+scratch_intrapred_luma_4x4_prediction_addr1:
+ .long scratch_intrapred_luma_4x4_prediction - scrintra_4x4 - 8
+
+
+
+@/**
+@/**
+@******************************************************************************
+@*
+@* @brief :Evaluate the best intra 4x4 mode
+@*                and do the prediction.
+@*
+@* @par Description
+@*   This function evaluates the 4x4 modes, computes the corresponding SAD-based costs
+@*   and returns the buffer predicted with the best mode.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@** @param[in] pu1_ngbr_pels
+@* UWORD8 pointer to neighbouring pels
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] u4_n_avblty
+@* availability of neighbouring pixels
+@*
+@* @param[in] u4_intra_mode
+@* Pointer to the variable in which best mode is returned
+@*
+@* @param[in] pu4_sadmin
+@* Pointer to the variable in which minimum cost is returned
+@*
+@* @param[in] u4_valid_intra_modes
+@*  Indicates which modes are valid
+@*
+@* @param[in] u4_lambda
+@*  Lambda value for computing cost from SAD
+@*
+@* @param[in] u4_predictd_mode
+@* Predicted mode for cost computation
+@*
+@*
+@*
+@* @return none
+@*
+@******************************************************************************
+@*/
+@void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
+@ UWORD8 *pu1_ngbr_pels,
+@ UWORD8 *pu1_dst,
+@ UWORD32 src_strd,
+@ UWORD32 dst_strd,
+@ WORD32 u4_n_avblty,
+@ UWORD32 *u4_intra_mode,
+@ WORD32 *pu4_sadmin,
+@ UWORD32 u4_valid_intra_modes,
+@ UWORD32 u4_lambda,
+@ UWORD32 u4_predictd_mode)
+
+
+
+ .global ih264e_evaluate_intra_4x4_modes_a9q
+
+ih264e_evaluate_intra_4x4_modes_a9q:
+
+@r0 = pu1_src,
+@r1 = pu1_ngbr_pels_i16,
+@r2 = pu1_dst,
+@r3 = src_strd,
+@r4 = dst_strd,
+@r5 = u4_n_avblty,
+@r6 = u4_intra_mode,
+@r7 = pu4_sadmin
+@r8 = u4_valid_intra_modes
+@r0 =u4_lambda
+@r1 = u4_predictd_mode
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+
+@--------------------
+ ldr r5, [sp, #44] @r5 = u4_n_avblty,
+@----------------------
+ vpush {d8-d15}
+@Loading neighbours
+ vld1.32 {q0}, [r1]
+ add r4, r1, #12
+ vld1.8 d1[5], [r4]
+ vld1.8 d1[7], [r1]
+ @--------------------------------
+ ldr r8, [sp, #120] @u4_valid_intra_modes
+@----------------------------------------------
+
+
+
+@ LOADING pu1_src
+ vld1.32 {d20[0]}, [r0], r3
+ vext.8 q1, q0, q0, #1
+ vld1.32 {d20[1]}, [r0], r3
+ mov r11, #1
+ vld1.32 {d21[0]}, [r0], r3
+ lsl r11, r11, #30
+ vld1.32 {d21[1]}, [r0], r3
+
+
+
+@--------------------------------
+ ldr r0, [sp, #124] @r0 =u4_lambda
+ ldr r1, [sp, #128] @r1 = u4_predictd_mode
+@------
+
+
+vert:
+ ands r10, r8, #01 @VERT sad ??
+ beq horz
+ vdup.32 q2, d2[1]
+ vabdl.u8 q14, d4, d20
+ vabal.u8 q14, d4, d21
+ vadd.i16 d28, d29, d28
+ subs r6, r1, #0
+ vpaddl.u16 d28, d28 @
+ lslne r6, r0, #2
+ vpaddl.u32 d28, d28 @/
+ moveq r6, r0 @
+ vmov.u32 r9, d28[0] @ vert
+ add r9, r6, r9
+
+ subs r6, r11, r9
+ movgt r11, r9
+ movgt r12, #0
+
+horz:
+ ands r10, r8, #02 @HORZ sad ??
+ beq dc
+ vdup.32 q3, d0[0]
+ vmov.32 q4, q3
+ vtrn.8 q3, q4
+ vtrn.16 d7, d6
+ vtrn.16 d9, d8
+ vtrn.32 d9, d7
+ vtrn.32 d8, d6
+ vabdl.u8 q14, d6, d20
+ subs r6, r1, #1
+ vabal.u8 q14, d7, d21
+ vadd.i16 d28, d29, d28
+ lslne r6, r0, #2
+ vpaddl.u16 d28, d28 @
+ vpaddl.u32 d28, d28 @/
+ vmov.u32 r9, d28[0] @
+ moveq r6, r0 @
+ add r9, r6, r9
+
+ subs r6, r11, r9
+ movgt r11, r9
+ movgt r12, #1
+
+dc:
+ ands r10, r8, #04 @DC sad ??
+ beq diags
+ vext.8 q4, q0, q0, #5
+ vaddl.u8 q4, d0, d8
+ vpaddl.u16 d8, d8 @
+ vpaddl.u32 d8, d8 @/
+ vmov.u32 r4, d8[0] @
+ mov r14, #1
+ ands r10, r5, #1
+ addne r4, r4, #2
+ addne r14, r14, #1
+ ands r10, r5, #4
+ addne r4, r4, #2
+ addne r14, r14, #1
+ ands r10, r5, #5
+ moveq r4, #128
+ moveq r14, #0
+ subs r6, r1, #2
+ lsr r4, r4, r14
+ vdup.8 q4, r4
+ lslne r6, r0, #2
+ vabdl.u8 q14, d8, d20
+ vabal.u8 q14, d9, d21
+ vadd.i16 d28, d29, d28
+ vpaddl.u16 d28, d28 @
+ vpaddl.u32 d28, d28 @/
+ vmov.u32 r9, d28[0] @
+
+ moveq r6, r0 @
+ add r9, r6, r9
+
+ subs r6, r11, r9
+ movgt r11, r9
+ movgt r12, #2
+
+diags:
+ ands r10, r8, #504 @/* if modes other than VERT, HORZ and DC are valid ????*/
+ beq pred
+ @/* Performing FILT11 and FILT121 operation for all neighbour values*/
+ vext.8 q5, q0, q0, #2
+ vaddl.u8 q6, d0, d2
+ vaddl.u8 q7, d1, d3
+ vaddl.u8 q8, d10, d2
+ vaddl.u8 q9, d11, d3
+ vadd.u16 q12, q10, q11
+ vqrshrun.s16 d10, q6, #1
+ vqrshrun.s16 d11, q7, #1
+ vadd.u16 q11, q6, q8
+ vadd.u16 q12, q7, q9
+ vqrshrun.s16 d12, q11, #2
+ vqrshrun.s16 d13, q12, #2
+ mov r14, #0
+ vdup.32 q13 , r14
+ mov r14, #-1
+ vmov.i32 d26[0], r14
+
+diag_dl:
+ ands r10, r8, #0x08 @DIAG_DL sad ??
+ beq diag_dr
+
+ vext.8 q15, q6, q6, #5
+ vbit.32 d14, d30, d26
+ vext.8 q15, q6, q6, #15
+ vbit.32 d15, d31, d26
+ vext.8 q15, q6, q6, #2
+ vext.32 q14, q13, q13, #3
+ vbit.32 d14, d30, d28
+ vext.8 q15, q6, q6, #4
+ vbit.32 d15, d30, d28
+ vabdl.u8 q14, d14, d20
+ subs r6, r1, #3
+ vabal.u8 q14, d15, d21
+ vadd.i16 d28, d29, d28
+ vpaddl.u16 d28, d28 @
+ lslne r6, r0, #2
+ vpaddl.u32 d28, d28 @/
+ vmov.u32 r9, d28[0] @
+
+ moveq r6, r0 @
+ add r9, r6, r9
+
+ subs r6, r11, r9
+ movgt r11, r9
+ movgt r12, #3
+
+diag_dr:
+ ands r10, r8, #16 @DIAG_DR sad ??
+ beq vert_r
+
+ vext.8 q15, q6, q6, #3
+ vbit.32 d16, d30, d26
+ vext.8 q15, q6, q6, #1
+ vbit.32 d17, d30, d26
+ vext.8 q15, q6, q6, #4
+ vext.32 q14, q13, q13, #3
+ vbit.32 d17, d31, d28
+ vext.8 q15, q6, q6, #6
+ vbit.32 d16, d31, d28
+ vabdl.u8 q14, d16, d20
+ subs r6, r1, #4
+ vabal.u8 q14, d17, d21
+ vadd.i16 d28, d29, d28
+ vpaddl.u16 d28, d28 @
+ lslne r6, r0, #2
+ vpaddl.u32 d28, d28 @/
+ vmov.u32 r9, d28[0] @
+
+ moveq r6, r0 @
+ add r9, r6, r9
+
+ subs r6, r11, r9
+ movgt r11, r9
+ movgt r12, #4
+
+vert_r:
+ ands r10, r8, #32 @VERT_R sad ??
+ beq horz_d
+ vext.8 q15, q5, q5, #4
+ vbit.32 d18, d30, d26
+ vext.8 q15, q5, q5, #3
+ vbit.32 d19, d30, d26
+ vext.32 q14, q13, q13, #3
+ vext.8 q15, q6, q6, #15
+ vbit.32 d18, d30, d28
+ vext.8 q15, q6, q6, #14
+ vbit.32 d19, d30, d28
+ mov r14, #0
+ vdup.32 q14 , r14
+ mov r14, #0xff
+ vmov.i8 d28[0], r14
+ vext.8 q15, q6, q6, #2
+ vbit.32 d19, d30, d28
+ vext.32 q14, q14, q14, #3
+ subs r6, r1, #5
+ vext.8 q15, q6, q6, #13
+ vbit.32 d19, d30, d28
+ lslne r6, r0, #2
+ vabdl.u8 q14, d18, d20
+ vabal.u8 q14, d19, d21
+ vadd.i16 d28, d29, d28
+ vpaddl.u16 d28, d28 @
+ vpaddl.u32 d28, d28 @/
+ vmov.u32 r9, d28[0] @
+
+
+ moveq r6, r0 @
+ add r9, r6, r9
+
+ subs r6, r11, r9
+ movgt r11, r9
+ movgt r12, #5
+
+horz_d:
+ vmov.8 q1, q5
+ vmov.8 q15, q6
+ vzip.8 q1, q15
+
+ ands r10, r8, #64 @HORZ_D sad ??
+ beq vert_l
+ vext.8 q15, q6, q6, #2
+ vbit.32 d8, d30, d26
+ mov r14, #0
+ vdup.32 q14 , r14
+ mov r14, #0xff
+ vmov.i8 d28[0], r14
+ vext.8 q15, q5, q5, #3
+ vbit.32 d8, d30, d28
+ vext.8 q15, q1, q1, #2
+ vbit.32 d9, d30, d26
+ vext.32 q14, q13, q13, #3
+ vbit.32 d8, d2, d28
+ subs r6, r1, #6
+ vext.8 q15, q1, q1, #12
+ vbit.32 d9, d30, d28
+ vabdl.u8 q14, d8, d20
+ vabal.u8 q14, d9, d21
+ vadd.i16 d28, d29, d28
+ vpaddl.u16 d28, d28 @
+ lslne r6, r0, #2
+ vpaddl.u32 d28, d28 @/
+ vmov.u32 r9, d28[0] @
+
+
+ moveq r6, r0 @
+ add r9, r6, r9
+
+ subs r6, r11, r9
+ movgt r11, r9
+ movgt r12, #6
+vert_l:
+ ands r10, r8, #128 @VERT_L sad ??
+ beq horz_u
+ vext.8 q15, q5, q5, #5
+ vbit.32 d24, d30, d26
+ vext.8 q15, q15, q15, #1
+ vbit.32 d25, d30, d26
+ vext.8 q15, q6, q6, #1
+ vext.32 q14, q13, q13, #3
+ vbit.32 d24, d30, d28
+ vext.8 q15, q15, q15, #1
+ subs r6, r1, #7
+ vbit.32 d25, d30, d28
+ vabdl.u8 q14, d24, d20
+ vabal.u8 q14, d25, d21
+ vadd.i16 d28, d29, d28
+ vpaddl.u16 d28, d28 @
+ lslne r6, r0, #2
+ vpaddl.u32 d28, d28 @/
+ vmov.u32 r9, d28[0] @
+
+ moveq r6, r0 @
+ add r9, r6, r9
+
+ subs r6, r11, r9
+ movgt r11, r9
+ movgt r12, #7
+
+horz_u:
+ ands r10, r8, #256 @HORZ_U sad ??
+ beq pred
+ vrev64.8 q5, q1
+ vdup.8 q1, d0[0]
+ vext.8 q6, q6, #7
+ mov r14, #0
+ vdup.32 q14 , r14
+ mov r14, #0xff
+ vmov.i8 d28[0], r14
+ vbit.32 d11, d13, d28
+ movw r14, #0xffff
+ vmov.i16 d28[0], r14
+ vext.8 q6, q5, q5, #7
+ subs r6, r1, #8
+ vbit.32 d3, d12, d28
+ vext.8 q6, q5, q5, #3
+ vbit.32 d2, d12, d26
+ vext.32 q14, q13, q13, #3
+ vext.8 q6, q5, q5, #1
+ vbit.32 d2, d12, d28
+ vabdl.u8 q14, d2, d20
+ vabal.u8 q14, d3, d21
+ vadd.i16 d28, d29, d28
+ vpaddl.u16 d28, d28 @
+ lslne r6, r0, #2
+ vpaddl.u32 d28, d28 @/
+ vmov.u32 r9, d28[0] @
+
+
+ moveq r6, r0 @
+ add r9, r6, r9
+
+ subs r6, r11, r9
+ movgt r11, r9
+ movgt r12, #8
+
+pred:            @/* Doing final prediction */
+@---------------------------
+ ldr r7, [sp, #116] @r7 = pu4_sadmin
+ ldr r6, [sp, #112] @ R6 =MODE
+@--------------------------
+ str r11, [r7] @/STORING MIN SAD*/
+ str r12, [r6] @/FINAL MODE*/
+
+
+ ldr r3, scratch_intrapred_luma_4x4_prediction_addr1
+scrintra_4x4:
+ add r3, r3, pc
+ lsl r12, r12, #2
+ add r3, r3, r12
+
+ ldr r5, [r3]
+ and r5, r5, #0xfffffffe
+
+ bx r5
+
+
+ver:
+ vext.8 q0, q0, q0, #1
+ vdup.32 q15, d0[1]
+ b store
+
+hor:
+ vmov.32 q15, q3
+ b store
+
+d_c:
+ vdup.8 q15, r4
+ b store
+
+dia_dl:
+ vmov.32 q15, q7
+ b store
+
+dia_dr:
+ vmov.32 q15, q8
+ b store
+
+ver_r:
+ vmov.32 q15, q9
+ b store
+
+hor_d:
+ vmov.32 q15, q4
+ b store
+
+ver_l:
+ vmov.32 q15, q12
+ b store
+
+hor_u:
+ vmov.32 q15, q1
+
+store: @/* storing to pu1_dst*/
+
+ ldr r4, [sp, #104] @r4 = dst_strd,
+
+ vst1.32 {d30[0]}, [r2], r4
+ vst1.32 {d30[1]}, [r2], r4
+ vst1.32 {d31[0]}, [r2], r4
+ vst1.32 {d31[1]}, [r2], r4
+
+
+end_func:
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
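Editor's note (not part of the patch): unlike the 16x16 evaluator, this routine minimises a cost rather than a raw SAD. Reading the assembly (moveq r6, r0 when the candidate equals u4_predictd_mode, lslne r6, r0, #2 otherwise), the penalty added to each mode's SAD appears to be u4_lambda for the predicted mode and 4 * u4_lambda for any other mode. A hedged C rendering of that assumed cost model, with illustrative names:

/* Assumed cost model for the 4x4 mode search; a sketch, not the library API. */
#include <stdint.h>

static uint32_t intra4x4_mode_cost(uint32_t sad, uint32_t lambda,
                                   uint32_t candidate_mode, uint32_t predicted_mode)
{
    /* Predicted mode pays one lambda of overhead, every other mode pays four. */
    uint32_t overhead = (candidate_mode == predicted_mode) ? lambda : 4u * lambda;
    return sad + overhead;
}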
diff --git a/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s b/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s
new file mode 100755
index 0000000..e4dfca8
--- /dev/null
+++ b/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s
@@ -0,0 +1,346 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+
+@/**
+@******************************************************************************
+@*
+@* @brief :Evaluate the best intra chroma mode (among VERT, HORZ and DC)
+@*                and do the prediction.
+@*
+@* @par Description
+@*   This function evaluates the first three intra chroma modes, computes the corresponding SADs
+@*   and returns the buffer predicted with the best mode.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@** @param[in] pu1_ngbr_pels
+@* UWORD8 pointer to neighbouring pels
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] u4_n_avblty
+@* availability of neighbouring pixels
+@*
+@* @param[in] u4_intra_mode
+@* Pointer to the variable in which best mode is returned
+@*
+@* @param[in] pu4_sadmin
+@* Pointer to the variable in which minimum sad is returned
+@*
+@* @param[in] u4_valid_intra_modes
+@*  Indicates which modes are valid
+@*
+@*
+@* @return none
+@*
+@******************************************************************************
+@*/
+@
+@void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
+@ UWORD8 *pu1_ngbr_pels_i16,
+@ UWORD8 *pu1_dst,
+@ UWORD32 src_strd,
+@ UWORD32 dst_strd,
+@ WORD32 u4_n_avblty,
+@ UWORD32 *u4_intra_mode,
+@ WORD32 *pu4_sadmin,
+@ UWORD32 u4_valid_intra_modes)
+@
+.text
+.p2align 2
+
+ .global ih264e_evaluate_intra_chroma_modes_a9q
+
+ih264e_evaluate_intra_chroma_modes_a9q:
+
+@r0 = pu1_src,
+@r1 = pu1_ngbr_pels_i16,
+@r2 = pu1_dst,
+@r3 = src_strd,
+@r4 = dst_strd,
+@r5 = u4_n_avblty,
+@r6 = u4_intra_mode,
+@r7 = pu4_sadmin
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ @-----------------------
+ ldr r5, [sp, #44] @r5 = u4_n_avblty,
+ @-------------------------
+ mov r12, r1 @
+ vpush {d8-d15}
+ vld1.32 {q4}, [r1]!
+ add r1, r1, #2
+ vld1.32 {q5}, [r1]!
+
+ vuzp.u8 q4, q5 @
+
+ vpaddl.u8 d8, d8
+ vpadd.u16 d8, d8
+
+ vpaddl.u8 d9, d9
+ vpadd.u16 d9, d9
+
+ vpaddl.u8 d10, d10
+ vpadd.u16 d10, d10
+
+ vpaddl.u8 d11, d11
+
+ and r7, r5, #5
+ vpadd.u16 d11, d11
+ subs r8, r7, #5
+ beq all_available
+ subs r8, r7, #4
+ beq top_available
+ subs r8, r7, #1
+ beq left_available
+ mov r10, #128
+ vdup.8 q14, r10
+ vdup.8 q15, r10
+ b sad
+
+all_available:
+ vzip.u16 q4, q5
+ vext.16 q6, q4, q4, #2
+ vadd.u16 q7, q5, q6
+ vqrshrn.u16 d14, q7, #3
+ vqrshrn.u16 d15, q4, #2
+ vqrshrn.u16 d16, q5, #2
+ vdup.16 d28, d14[0]
+ vdup.16 d29, d16[1]
+ vdup.16 d30, d15[0]
+ vdup.16 d31, d14[1]
+ b sad
+top_available:
+ vzip.u16 q4, q5
+ vqrshrn.u16 d16, q5, #2
+ vdup.16 d28, d16[0]
+ vdup.16 d29, d16[1]
+ vdup.16 d30, d16[0]
+ vdup.16 d31, d16[1]
+ b sad
+left_available:
+ vzip.u16 q4, q5
+ vqrshrn.u16 d16, q4, #2
+ vdup.16 d28, d16[3]
+ vdup.16 d29, d16[3]
+ vdup.16 d30, d16[2]
+ vdup.16 d31, d16[2]
+
+
+sad:
+ vld1.32 {q4}, [r12]!
+ sub r8, r12, #2
+ add r12, r12, #2
+ vld1.32 {q5}, [r12]!
+ add r12, r0, r3, lsl #2
+ sub r10, r8, #8
+ vld1.32 {q0}, [r0], r3
+ ldrh r9, [r8]
+ vdup.16 q10, r9 @ row 0
+
+ @/vertical row 0;
+ vabdl.u8 q8, d0, d10
+ vabdl.u8 q9, d1, d11
+ sub r8, r8, #2
+ vld1.32 {q1}, [r12], r3
+
+ @/HORZ row 0;
+ vabdl.u8 q13, d0, d20
+ vabdl.u8 q7, d1, d21
+ ldrh r9, [r10]
+ @/dc row 0;
+ vabdl.u8 q11, d0, d28
+ vabdl.u8 q12, d1, d29
+
+
+ vdup.16 q10, r9 @ row 4
+ @/vertical row 4;
+ vabal.u8 q8, d2, d10
+ vabal.u8 q9, d3, d11
+ sub r10, r10, #2
+
+ @/HORZ row 4;
+ vabal.u8 q13, d2, d20
+ vabal.u8 q7, d3, d21
+ @/dc row 4;
+ vabal.u8 q11, d2, d30
+ vabal.u8 q12, d3, d31
+
+ mov r11, #3
+
+loop:
+ vld1.32 {q0}, [r0], r3
+ ldrh r9, [r8]
+
+
+ @/vertical row i;
+ vabal.u8 q8, d0, d10
+ vabal.u8 q9, d1, d11
+
+ vdup.16 q10, r9 @ row i
+ vld1.32 {q1}, [r12], r3
+ sub r8, r8, #2
+ @/HORZ row i;
+ vabal.u8 q13, d0, d20
+ vabal.u8 q7, d1, d21
+ ldrh r9, [r10]
+ @/dc row i;
+ vabal.u8 q11, d0, d28
+ vabal.u8 q12, d1, d29
+ sub r10, r10, #2
+
+ vdup.16 q10, r9 @ row i+4
+ @/vertical row 4;
+ vabal.u8 q8, d2, d10
+ vabal.u8 q9, d3, d11
+ subs r11, r11, #1
+
+ @/HORZ row i+4;
+ vabal.u8 q13, d2, d20
+ vabal.u8 q7, d3, d21
+ @/dc row i+4;
+ vabal.u8 q11, d2, d30
+ vabal.u8 q12, d3, d31
+ bne loop
+
+
+
+@-------------------------------------------
+
+ vadd.i16 q9, q9, q8 @/VERT
+ vadd.i16 q7, q13, q7 @/HORZ
+ vadd.i16 q12, q11, q12 @/DC
+ vadd.i16 d18, d19, d18 @/VERT
+ vadd.i16 d14, d15, d14 @/HORZ
+ vadd.i16 d24, d24, d25 @/DC
+ vpaddl.u16 d18, d18 @/VERT
+ vpaddl.u16 d14, d14 @/HORZ
+ vpaddl.u16 d24, d24 @/DC
+ vpaddl.u32 d18, d18 @/VERT
+ vpaddl.u32 d14, d14 @/HORZ
+ vpaddl.u32 d24, d24 @/DC
+
+
+
+ vmov.u32 r8, d18[0] @ vert
+ vmov.u32 r9, d14[0] @horz
+ vmov.u32 r10, d24[0] @dc
+
+ mov r11, #1
+@-----------------------
+ ldr r0, [sp, #120] @ u4_valid_intra_modes
+@--------------------------------------------
+
+
+ lsl r11 , #30
+
+ ands r7, r0, #04 @ vert mode valid????????????
+ moveq r8, r11
+
+ ands r6, r0, #02 @ horz mode valid????????????
+ moveq r9, r11
+
+ ands r6, r0, #01 @ dc mode valid????????????
+ moveq r10, r11
+
+
+ @---------------------------
+ ldr r4, [sp, #104] @r4 = dst_strd,
+ ldr r6, [sp, #112] @ R6 =MODE
+ ldr r7, [sp, #116] @r7 = pu4_sadmin
+
+ @--------------------------
+
+ cmp r10, r9
+ bgt not_dc
+ cmp r10, r8
+ bgt do_vert
+
+ @/----------------------
+ @DO DC PREDICTION
+ str r10 , [r7] @MIN SAD
+ mov r10, #0
+ str r10 , [r6] @ MODE
+ b do_dc_vert
+ @-----------------------------
+
+not_dc:
+ cmp r9, r8
+ bgt do_vert
+ @/----------------------
+ @DO HORIZONTAL
+
+ vdup.16 q10, d9[3] @/HORIZONTAL VALUE ROW=0;
+ str r9 , [r7] @MIN SAD
+ mov r9, #1
+ vdup.16 q11, d9[2] @/HORIZONTAL VALUE ROW=1;
+ str r9 , [r6] @ MODE
+ vdup.16 q12, d9[1] @/HORIZONTAL VALUE ROW=2;
+ vst1.32 {d20, d21} , [r2], r4 @0
+ vdup.16 q13, d9[0] @/HORIZONTAL VALUE ROW=3;
+ vst1.32 {d22, d23} , [r2], r4 @1
+ vdup.16 q14, d8[3] @/HORIZONTAL VALUE ROW=4;
+ vst1.32 {d24, d25} , [r2], r4 @2
+ vdup.16 q15, d8[2] @/HORIZONTAL VALUE ROW=5;
+ vst1.32 {d26, d27} , [r2], r4 @3
+ vdup.16 q1, d8[1] @/HORIZONTAL VALUE ROW=6;
+ vst1.32 {d28, d29} , [r2], r4 @4
+ vdup.16 q2, d8[0] @/HORIZONTAL VALUE ROW=7;
+ vst1.32 {d30, d31} , [r2], r4 @5
+ vst1.32 {d2, d3} , [r2], r4 @6
+ vst1.32 {d4, d5} , [r2], r4 @7
+ b end_func
+
+do_vert:
+ @DO VERTICAL PREDICTION
+ str r8 , [r7] @MIN SAD
+ mov r8, #2
+ str r8 , [r6] @ MODE
+ vmov q15, q5
+ vmov q14, q5
+
+do_dc_vert:
+ vst1.32 {d28, d29} , [r2], r4 @0
+ vst1.32 {d28, d29} , [r2], r4 @1
+ vst1.32 {d28, d29} , [r2], r4 @2
+ vst1.32 {d28, d29} , [r2], r4 @3
+ vst1.32 {d30, d31} , [r2], r4 @4
+ vst1.32 {d30, d31} , [r2], r4 @5
+ vst1.32 {d30, d31} , [r2], r4 @6
+ vst1.32 {d30, d31} , [r2], r4 @7
+
+
+end_func:
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
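Editor's note (not part of the patch): the chroma evaluator works on the 8x8 Cb/Cr block in its interleaved 420SP layout, so every row it loads and compares is 16 bytes of alternating Cb and Cr samples. A scalar sketch of the SAD accumulation over such a block, assuming interleaved source and prediction buffers (names are illustrative):

/* SAD over an 8x8 chroma block stored CbCrCbCr... (8 rows x 16 bytes); sketch only. */
#include <stdint.h>

static uint32_t sad_chroma_8x8_interleaved(const uint8_t *src, int src_strd,
                                           const uint8_t *pred, int pred_strd)
{
    uint32_t sad = 0;
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 16; x++)   /* 8 Cb/Cr pairs per row */
        {
            int d = src[y * src_strd + x] - pred[y * pred_strd + x];
            sad += (d < 0) ? -d : d;
        }
    return sad;
}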
diff --git a/encoder/arm/ih264e_fmt_conv.s b/encoder/arm/ih264e_fmt_conv.s
new file mode 100755
index 0000000..2bf1479
--- /dev/null
+++ b/encoder/arm/ih264e_fmt_conv.s
@@ -0,0 +1,329 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+
+.text
+.p2align 2
+@/**
+
+@/*****************************************************************************
+@* *
+@*  Function Name         : ih264e_fmt_conv_420p_to_420sp_a9q()               *
+@* *
+@*  Description           : This function converts the image from YUV420P     *
+@*                          color space to 420SP color space (UV interleaved). *
+@* *
+@* Arguments : R0 pu1_y *
+@* R1 pu1_u *
+@* R2 pu1_v *
+@* R3 pu1_dest_y *
+@* [R13 #40] pu1_dest_uv *
+@* [R13 #44] u2_height *
+@* [R13 #48] u2_width *
+@* [R13 #52] u2_stridey *
+@* [R13 #56] u2_strideu *
+@* [R13 #60] u2_stridev *
+@* [R13 #64] u2_dest_stride_y *
+@* [R13 #68] u2_dest_stride_uv *
+@* [R13 #72] convert_uv_only *
+@* *
+@* Values Returned : None *
+@* *
+@* Register Usage : R0 - R14 *
+@* *
+@* Stack Usage : 40 Bytes *
+@* *
+@* Interruptibility : Interruptible *
+@* *
+@* Known Limitations *
+@* Assumptions: Image Width: Assumed to be multiple of 16 and *
+@* greater than or equal to 16 *
+@* Image Height: Assumed to be even. *
+@* *
+@* Revision History : *
+@* DD MM YYYY Author(s) Changes (Describe the changes made) *
+@* 07 06 2010 Varshita Draft *
+@* 07 06 2010 Naveen Kr T Completed *
+@* *
+@*****************************************************************************/
+ .global ih264e_fmt_conv_420p_to_420sp_a9q
+
+ih264e_fmt_conv_420p_to_420sp_a9q:
+
+ @// push the registers on the stack
+ stmfd sp!, {r4-r12, lr}
+
+ ldr r4, [sp, #72] @// Load convert_uv_only
+
+ cmp r4, #1
+ beq yuv420sp_uv_chroma
+ @/* Do the preprocessing before the main loops start */
+ @// Load the parameters from stack
+ ldr r4, [sp, #44] @// Load u2_height from stack
+ ldr r5, [sp, #48] @// Load u2_width from stack
+ ldr r7, [sp, #52] @// Load u2_stridey from stack
+ ldr r8, [sp, #64] @// Load u2_dest_stride_y from stack
+ sub r7, r7, r5 @// Source increment
+ sub r8, r8, r5 @// Destination increment
+
+ vpush {d8-d15}
+yuv420sp_uv_row_loop_y:
+ mov r6, r5
+
+yuv420sp_uv_col_loop_y:
+ pld [r0, #128]
+ vld1.8 {d0, d1}, [r0]!
+ vst1.8 {d0, d1}, [r3]!
+ sub r6, r6, #16
+ cmp r6, #15
+ bgt yuv420sp_uv_col_loop_y
+
+ cmp r6, #0
+ beq yuv420sp_uv_row_loop_end_y
+ @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
+ @//Ex if width is 162, above loop will process 160 pixels. And
+ @//Both source and destination will point to 146th pixel and then 16 bytes will be read
+ @// and written using VLD1 and VST1
+ rsb r6, r6, #16
+ sub r0, r0, r6
+ sub r3, r3, r6
+
+ vld1.8 {d0, d1}, [r0]!
+ vst1.8 {d0, d1}, [r3]!
+
+yuv420sp_uv_row_loop_end_y:
+ add r0, r0, r7
+ add r3, r3, r8
+ subs r4, r4, #1
+ bgt yuv420sp_uv_row_loop_y
+
+yuv420sp_uv_chroma:
+
+ ldr r3, [sp, #40] @// Load pu1_dest_uv from stack
+
+ ldr r4, [sp, #44] @// Load u2_height from stack
+
+ ldr r5, [sp, #48] @// Load u2_width from stack
+
+
+ ldr r7, [sp, #56] @// Load u2_strideu from stack
+
+ ldr r8, [sp, #68] @// Load u2_dest_stride_uv from stack
+
+ sub r7, r7, r5, lsr #1 @// Source increment
+
+ sub r8, r8, r5 @// Destination increment
+
+ mov r5, r5, lsr #1
+ mov r4, r4, lsr #1
+ ldr r3, [sp, #40] @// Load pu1_dest_uv from stack
+ vpush {d8-d15}
+yuv420sp_uv_row_loop_uv:
+ mov r6, r5
+
+
+yuv420sp_uv_col_loop_uv:
+ pld [r1, #128]
+ pld [r2, #128]
+ vld1.8 d0, [r1]!
+ vld1.8 d1, [r2]!
+ vst2.8 {d0, d1}, [r3]!
+ sub r6, r6, #8
+ cmp r6, #7
+ bgt yuv420sp_uv_col_loop_uv
+
+ cmp r6, #0
+ beq yuv420sp_uv_row_loop_end_uv
+ @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
+ @//Ex if width is 162, above loop will process 160 pixels. And
+ @//Both source and destination will point to 146th pixel and then 16 bytes will be read
+ @// and written using VLD1 and VST1
+ rsb r6, r6, #8
+ sub r1, r1, r6
+ sub r2, r2, r6
+ sub r3, r3, r6, lsl #1
+
+ vld1.8 d0, [r1]!
+ vld1.8 d1, [r2]!
+ vst2.8 {d0, d1}, [r3]!
+
+yuv420sp_uv_row_loop_end_uv:
+ add r1, r1, r7
+ add r2, r2, r7
+ add r3, r3, r8
+ subs r4, r4, #1
+ bgt yuv420sp_uv_row_loop_uv
+ @//POP THE REGISTERS
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc}
+
+
+
+
+
+@ /**
+@ *******************************************************************************
+@ *
+@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
+@ *  Function used for format conversion or frame copy
+@ *
+@ *
+@ *
+@ *Inputs : r0 - pu1_y - UWORD8 pointer to y plane.
+@ * r1 - pu1_u - UWORD8 pointer to u plane.
+@ * r2 - pu1_v - UWORD8 pointer to u plane.
+@ *           r3 - pu2_yuv422i - UWORD16 pointer to yuv422i image.
+@ * stack + 40 - u4_width - Width of the Y plane.
+@ * 44 - u4_height - Height of the Y plane.
+@ * 48 - u4_stride_y - Stride in pixels of Y plane.
+@ * 52 - u4_stride_u - Stride in pixels of U plane.
+@ * 56 - u4_stride_v - Stride in pixels of V plane.
+@ * 60 - u4_stride_yuv422i- Stride in pixels of yuv422i image.
+@ *
+@ * @par Description
+@ * Function used for copying or converting a reference frame to the display buffer
+@ * in non-shared mode
+@ *
+@ * @param[in] pu1_y_dst
+@ * Output Y pointer
+@ *
+@ * @param[in] pu1_u_dst
+@ * Output U/UV pointer ( UV is interleaved in the same format as that of input)
+@ *
+@ * @param[in] pu1_v_dst
+@ * Output V pointer ( used in 420P output case)
+@ *
+@ * @param[in] u4_dst_y_strd
+@ * Stride of destination Y buffer
+@ *
+@ * @param[in] u4_dst_u_strd
+@ * Stride of destination U/V buffer
+@ *
+@ *
+@ * @param[in] blocking
+@ * To indicate whether format conversion should wait till frame is reconstructed
+@ * and then return after complete copy is done. To be set to 1 when called at the
+@ * end of frame processing and set to 0 when called between frame processing modules
+@ * in order to utilize available MCPS
+@ *
+@ * @returns Error from IH264E_ERROR_T
+@ *
+@ * @remarks
+@ * Assumes that the stride of U and V buffers are same.
+@ * This is correct in most cases
+@ *  If a case comes where this is not true we need to modify the fmt conversion functions called inside also
+@ *  Since we read 4 pixels at a time the width should be aligned to 4
+@ * In assembly width should be aligned to 16 and height to 2.
+@ *
+@ *
+@ * Revision History :
+@ * DD MM YYYY Author(s) Changes (Describe the changes made)
+@ *         07 06 2010   Harinarayanan K K   Adapted to 422p
+@ *
+@ *******************************************************************************
+@ */
+
+@//`
+@*/
+ .global ih264e_fmt_conv_422i_to_420sp_a9q
+ih264e_fmt_conv_422i_to_420sp_a9q:
+ stmfd sp!, {r4-r12, lr} @// Back the register which are used
+
+
+
+ @/* Do the preprocessing before the main loops start */
+ @// Load the parameters from stack
+ ldr r4, [sp, #48] @// Load u4_stride_y from stack
+
+ ldr r5, [sp, #60] @// Load u4_stride_yuv422i from stack
+ add r6, r0, r4 @// pu1_y_nxt_row = pu1_y + u4_stride_y
+
+ ldr r7, [sp, #40] @// Load u4_width from stack
+ add r8, r3, r5, lsl #1 @// pu2_yuv422i_nxt_row = pu2_yuv422i_y + u4_stride_yuv422i(2 Bytes for each pixel)
+
+ ldr r9, [sp, #52] @// Load u4_stride_u from stack
+ sub r12, r4, r7 @// u2_offset1 = u4_stride_y - u4_width
+
+@LDR r10,[sp,#56] ;// Load u4_stride_v from stack
+ sub r14, r5, r7 @// u2_offset_yuv422i = u4_stride_yuv422i - u4_width
+
+ ldr r11, [sp, #44] @// Load u4_height from stack
+ sub r9, r9, r7 @// u2_offset2 = u4_stride_u - u4_width >> 1
+
+@ SUB r10,r10,r7,ASR #1 ;// u2_offset3 = u4_stride_v - u4_width >> 1
+ mov r14, r14, lsl #1 @// u2_offset_yuv422i = u2_offset_yuv422i * 2
+
+ mov r7, r7, asr #4 @// u4_width = u4_width / 16 (u4_width >> 4)
+    mov           r11, r11, asr #1      @// u4_height = u4_height / 2 (u4_height >> 1)
+
+ add r4, r12, r4 @// u2_offset1 = u2_offset1 + u4_stride_y
+ add r5, r14, r5, lsl #1 @// u2_offset_yuv422i = u2_offset_yuv422i + u4_stride_yuv422i
+
+ vpush {d8-d15}
+
+@// Register Assignment
+@// pu1_y - r0
+@// pu1_y_nxt_row - r6
+@// pu1_u - r1
+@// pu1_v - r2
+@// pu2_yuv422i - r3
+@// pu2_yuv422i_nxt_row - r8
+@// u2_offset1 - r4
+@// u2_offset2 - r9
+@// u2_offset3 - r10
+@// u2_offset_yuv422i - r5
+@// u4_width / 16 - r7
+@// u4_height / 2 - r11
+@// inner loop count - r12
+yuv420_to_yuv422i_hight_loop:
+
+ mov r12, r7 @// Inner loop count = u4_width / 16
+
+yuv420_to_yuv422i_width_loop:
+ vld4.8 {d0, d1, d2, d3}, [r3]! @// Load the 16 elements of row 1
+ vld4.8 {d4, d5, d6, d7}, [r8]! @// Load the 16 elements of row 2
+ subs r12, r12, #1
+
+ vrhadd.u8 d0, d0, d4
+ vrhadd.u8 d2, d2, d6
+
+ vst2.8 {d1, d3}, [r0]! @// Store the 16 elements of row1 Y
+ vst2.8 {d5, d7}, [r6]! @// Store the 16 elements of row2 Y
+
+ vst2.8 {d0, d2}, [r1]! @// Store the 8 elements of row1/2 U
+
+ bgt yuv420_to_yuv422i_width_loop
+
+ @// Update the buffer pointer so that they will refer to next pair of rows
+ add r0, r0, r4 @// pu1_y = pu1_y + u2_offset1
+ add r6, r6, r4 @// pu1_y_nxt_row = pu1_y_nxt_row + u2_offset1
+
+ add r1, r1, r9 @// pu1_u = pu1_u + u2_offset2
+ subs r11, r11, #1
+
+ add r3, r3, r5 @// pu2_yuv422i = pu2_yuv422i + u2_offset_yuv422i
+
+ add r8, r8, r5 @// pu2_yuv422i_nxt_row = pu2_yuv422i_nxt_row + u2_offset_yuv422i
+ bgt yuv420_to_yuv422i_hight_loop
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, pc} @// Restore the register which are used
+
+
+
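Editor's note (not part of the patch): ih264e_fmt_conv_420p_to_420sp_a9q copies the luma plane as-is (unless convert_uv_only is set) and interleaves the planar U and V planes into a single UV plane. A scalar reference of the chroma pass is sketched below; parameter names mirror the stack arguments documented above, and the loop bounds assume the width/height restrictions stated in the limitations.

/* Scalar reference for the 420P -> 420SP chroma interleave; a sketch, not the shipped code. */
#include <stdint.h>

static void fmt_conv_420p_to_420sp_chroma_ref(const uint8_t *pu1_u, const uint8_t *pu1_v,
                                              uint8_t *pu1_dest_uv,
                                              int u2_width, int u2_height,
                                              int u2_strideu, int u2_stridev,
                                              int u2_dest_stride_uv)
{
    for (int y = 0; y < u2_height / 2; y++)
    {
        for (int x = 0; x < u2_width / 2; x++)
        {
            pu1_dest_uv[2 * x]     = pu1_u[x];   /* Cb */
            pu1_dest_uv[2 * x + 1] = pu1_v[x];   /* Cr */
        }
        pu1_u += u2_strideu;
        pu1_v += u2_stridev;
        pu1_dest_uv += u2_dest_stride_uv;
    }
}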
diff --git a/encoder/arm/ih264e_function_selector.c b/encoder/arm/ih264e_function_selector.c
new file mode 100755
index 0000000..bb181c1
--- /dev/null
+++ b/encoder/arm/ih264e_function_selector.c
@@ -0,0 +1,170 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+*******************************************************************************
+* @file
+* ih264e_function_selector.c
+*
+* @brief
+* Contains functions to initialize function pointers used in h264
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System Include Files */
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* User Include Files */
+#include "ih264_typedefs.h"
+#include "iv2.h"
+#include "ive2.h"
+#include "ih264_defs.h"
+#include "ih264_size_defs.h"
+#include "ih264e_defs.h"
+#include "ih264e_error.h"
+#include "ih264e_bitstream.h"
+#include "ime_distortion_metrics.h"
+#include "ime_structs.h"
+#include "ih264_defs.h"
+#include "ih264_error.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include "ih264_inter_pred_filters.h"
+#include "ih264_mem_fns.h"
+#include "ih264_padding.h"
+#include "ih264_intra_pred_filters.h"
+#include "ih264_deblk_edge_filters.h"
+
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264e_defs.h"
+#include "irc_cntrl_param.h"
+#include "irc_frame_info_collector.h"
+#include "ih264e_rate_control.h"
+#include "ih264e_structs.h"
+#include "ih264e_platform_macros.h"
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the intra/inter/transform/deblk function pointers of
+* codec context
+*
+* @par Description: This routine initializes the function pointers of the
+* codec context based on the architecture in use
+*
+* @param[in] ps_codec
+* Codec context pointer
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+#ifdef ARMV8
+void ih264e_init_function_ptr(void *pv_codec)
+{
+ codec_t *ps_codec = (codec_t *)pv_codec;
+ ih264e_init_function_ptr_generic(ps_codec);
+ switch(ps_codec->s_cfg.e_arch)
+ {
+ case ARCH_ARM_NONEON:
+ break;
+ case ARCH_ARM_A53:
+ case ARCH_ARM_A57:
+ case ARCH_ARM_V8_NEON:
+ ih264e_init_function_ptr_neon_av8(ps_codec);
+ break;
+ default:
+ ih264e_init_function_ptr_neon_av8(ps_codec);
+ break;
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief Determine the architecture of the encoder executing environment
+*
+* @par Description: This routine returns the architecture of the
+* environment in which the current encoder is running
+*
+* @param[in] void
+*
+* @returns IV_ARCH_T
+* architecture
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+IV_ARCH_T ih264e_default_arch(void)
+{
+ return ARCH_ARM_V8_NEON;
+}
+
+#else
+
+void ih264e_init_function_ptr(void *pv_codec)
+{
+ codec_t *ps_codec = (codec_t *)pv_codec;
+ ih264e_init_function_ptr_generic(ps_codec);
+ switch(ps_codec->s_cfg.e_arch)
+ {
+ case ARCH_ARM_NONEON:
+ break;
+ case ARCH_ARM_A9Q:
+ case ARCH_ARM_A9A:
+ case ARCH_ARM_A9:
+ case ARCH_ARM_A7:
+ case ARCH_ARM_A5:
+ case ARCH_ARM_A15:
+ ih264e_init_function_ptr_neon_a9q(ps_codec);
+ break;
+ default:
+ ih264e_init_function_ptr_neon_a9q(ps_codec);
+ break;
+ }
+}
+
+IV_ARCH_T ih264e_default_arch(void)
+{
+ return ARCH_ARM_A9Q;
+}
+
+#endif
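Editor's note (not part of the patch): a typical caller either sets s_cfg.e_arch explicitly or falls back to ih264e_default_arch(), then lets ih264e_init_function_ptr wire up the leaf-level function pointers. A hypothetical caller sketch follows; only ih264e_default_arch, ih264e_init_function_ptr and the s_cfg.e_arch field are taken from the code above, the wrapper name is illustrative, and the same includes as the file above are assumed.

/* Illustrative usage sketch; codec_t comes from ih264e_structs.h. */
static void setup_codec_fn_ptrs(codec_t *ps_codec)
{
    ps_codec->s_cfg.e_arch = ih264e_default_arch();  /* e.g. ARCH_ARM_A9Q on ARMv7 builds */
    ih264e_init_function_ptr(ps_codec);
}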
diff --git a/encoder/arm/ih264e_function_selector_a9q.c b/encoder/arm/ih264e_function_selector_a9q.c
new file mode 100755
index 0000000..8b2879b
--- /dev/null
+++ b/encoder/arm/ih264e_function_selector_a9q.c
@@ -0,0 +1,252 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264e_function_selector_a9q.c
+*
+* @brief
+* Contains functions to initialize function pointers of codec context
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+*  - ih264e_init_function_ptr_neon_a9q
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System Include files */
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* User Include files */
+#include "ih264_typedefs.h"
+#include "iv2.h"
+#include "ive2.h"
+#include "ih264_defs.h"
+#include "ih264_size_defs.h"
+#include "ih264e_defs.h"
+#include "ih264e_error.h"
+#include "ih264e_bitstream.h"
+#include "ime_distortion_metrics.h"
+#include "ime_structs.h"
+#include "ih264_defs.h"
+#include "ih264_error.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include "ih264_inter_pred_filters.h"
+#include "ih264_mem_fns.h"
+#include "ih264_padding.h"
+#include "ih264_intra_pred_filters.h"
+#include "ih264_deblk_edge_filters.h"
+
+#include "irc_cntrl_param.h"
+#include "irc_frame_info_collector.h"
+#include "ih264e_rate_control.h"
+#include "ih264e_structs.h"
+#include "ih264e_platform_macros.h"
+#include "ih264_intra_pred_filters.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include "ih264e_defs.h"
+#include "ih264e_structs.h"
+#include "ih264_deblk_edge_filters.h"
+#include "ih264e_core_coding.h"
+#include "ih264_cavlc_tables.h"
+#include "ih264e_cavlc.h"
+#include "ih264_padding.h"
+#include "ih264e_intra_modes_eval.h"
+#include "ih264_mem_fns.h"
+#include "ih264e_fmt_conv.h"
+#include "ih264e_half_pel.h"
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the intra/inter/transform/deblk function pointers of
+* codec context
+*
+* @par Description: This routine initializes the function pointers of the
+* codec context based on the architecture in use
+*
+* @param[in] ps_codec
+* Codec context pointer
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264e_init_function_ptr_neon_a9q(codec_t *ps_codec)
+{
+ WORD32 i= 0;
+
+ /* curr proc ctxt */
+ process_ctxt_t *ps_proc = NULL;
+ me_ctxt_t *ps_me_ctxt = NULL;
+
+ /* Init function pointers for intra pred leaf level functions luma
+ * Intra 16x16 */
+ ps_codec->apf_intra_pred_16_l[0] = ih264_intra_pred_luma_16x16_mode_vert_a9q;
+ ps_codec->apf_intra_pred_16_l[1] = ih264_intra_pred_luma_16x16_mode_horz_a9q;
+ ps_codec->apf_intra_pred_16_l[2] = ih264_intra_pred_luma_16x16_mode_dc_a9q;
+ ps_codec->apf_intra_pred_16_l[3] = ih264_intra_pred_luma_16x16_mode_plane_a9q;
+
+ /* Init function pointers for intra pred leaf level functions luma
+ * Intra 4x4 */
+ ps_codec->apf_intra_pred_4_l[0] = ih264_intra_pred_luma_4x4_mode_vert_a9q;
+ ps_codec->apf_intra_pred_4_l[1] = ih264_intra_pred_luma_4x4_mode_horz_a9q;
+ ps_codec->apf_intra_pred_4_l[2] = ih264_intra_pred_luma_4x4_mode_dc_a9q;
+ ps_codec->apf_intra_pred_4_l[3] = ih264_intra_pred_luma_4x4_mode_diag_dl_a9q;
+ ps_codec->apf_intra_pred_4_l[4] = ih264_intra_pred_luma_4x4_mode_diag_dr_a9q;
+ ps_codec->apf_intra_pred_4_l[5] = ih264_intra_pred_luma_4x4_mode_vert_r_a9q;
+ ps_codec->apf_intra_pred_4_l[6] = ih264_intra_pred_luma_4x4_mode_horz_d_a9q;
+ ps_codec->apf_intra_pred_4_l[7] = ih264_intra_pred_luma_4x4_mode_vert_l_a9q;
+ ps_codec->apf_intra_pred_4_l[8] = ih264_intra_pred_luma_4x4_mode_horz_u_a9q;
+
+ /* Init function pointers for intra pred leaf level functions luma
+ * Intra 8x8 */
+ ps_codec->apf_intra_pred_8_l[0] = ih264_intra_pred_luma_8x8_mode_vert_a9q;
+ ps_codec->apf_intra_pred_8_l[2] = ih264_intra_pred_luma_8x8_mode_dc_a9q;
+ ps_codec->apf_intra_pred_8_l[3] = ih264_intra_pred_luma_8x8_mode_diag_dl_a9q;
+ ps_codec->apf_intra_pred_8_l[4] = ih264_intra_pred_luma_8x8_mode_diag_dr_a9q;
+ ps_codec->apf_intra_pred_8_l[5] = ih264_intra_pred_luma_8x8_mode_vert_r_a9q;
+ ps_codec->apf_intra_pred_8_l[6] = ih264_intra_pred_luma_8x8_mode_horz_d_a9q;
+ ps_codec->apf_intra_pred_8_l[7] = ih264_intra_pred_luma_8x8_mode_vert_l_a9q;
+ ps_codec->apf_intra_pred_8_l[8] = ih264_intra_pred_luma_8x8_mode_horz_u_a9q;
+
+ /* Init function pointers for intra pred leaf level functions chroma
+ * Intra 8x8 */
+ ps_codec->apf_intra_pred_c[0] = ih264_intra_pred_chroma_8x8_mode_dc_a9q;
+ ps_codec->apf_intra_pred_c[1] = ih264_intra_pred_chroma_8x8_mode_horz_a9q;
+ ps_codec->apf_intra_pred_c[2] = ih264_intra_pred_chroma_8x8_mode_vert_a9q;
+ ps_codec->apf_intra_pred_c[3] = ih264_intra_pred_chroma_8x8_mode_plane_a9q;
+
+ /* Init forward transform fn ptr */
+ ps_codec->pf_resi_trans_quant_8x8 = ih264_resi_trans_quant_8x8;
+ ps_codec->pf_resi_trans_quant_4x4 = ih264_resi_trans_quant_4x4_a9;
+ ps_codec->pf_resi_trans_quant_chroma_4x4 = ih264_resi_trans_quant_chroma_4x4_a9;
+ ps_codec->pf_hadamard_quant_4x4 = ih264_hadamard_quant_4x4_a9;
+ ps_codec->pf_hadamard_quant_2x2_uv = ih264_hadamard_quant_2x2_uv_a9;
+
+ /* Init inverse transform fn ptr */
+ ps_codec->pf_iquant_itrans_recon_8x8 = ih264_iquant_itrans_recon_8x8;
+ ps_codec->pf_iquant_itrans_recon_4x4 = ih264_iquant_itrans_recon_4x4_a9;
+ ps_codec->pf_iquant_itrans_recon_4x4_dc = ih264_iquant_itrans_recon_4x4_dc_a9;
+ ps_codec->pf_iquant_itrans_recon_chroma_4x4 = ih264_iquant_itrans_recon_chroma_4x4_a9;
+ ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc = ih264_iquant_itrans_recon_chroma_4x4_dc_a9;
+ ps_codec->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_a9;
+ ps_codec->pf_ihadamard_scaling_2x2_uv = ih264_ihadamard_scaling_2x2_uv_a9;
+ ps_codec->pf_interleave_copy = ih264_interleave_copy_a9;
+
+ /* Init fn ptr luma core coding */
+ ps_codec->luma_energy_compaction[0] = ih264e_code_luma_intra_macroblock_16x16;
+ ps_codec->luma_energy_compaction[1] = ih264e_code_luma_intra_macroblock_4x4;
+ ps_codec->luma_energy_compaction[3] = ih264e_code_luma_inter_macroblock_16x16;
+
+ /* Init fn ptr chroma core coding */
+ ps_codec->chroma_energy_compaction[0] = ih264e_code_chroma_intra_macroblock_8x8;
+ ps_codec->chroma_energy_compaction[1] = ih264e_code_chroma_inter_macroblock_8x8;
+
+ /* Init fn ptr luma deblocking */
+ ps_codec->pf_deblk_luma_vert_bs4 = ih264_deblk_luma_vert_bs4_a9;
+ ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_a9;
+ ps_codec->pf_deblk_luma_horz_bs4 = ih264_deblk_luma_horz_bs4_a9;
+ ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_a9;
+
+ /* Init fn ptr chroma deblocking */
+ ps_codec->pf_deblk_chroma_vert_bs4 = ih264_deblk_chroma_vert_bs4_a9;
+ ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_a9;
+ ps_codec->pf_deblk_chroma_horz_bs4 = ih264_deblk_chroma_horz_bs4_a9;
+ ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_a9;
+
+ /* write mb syntax layer */
+ ps_codec->pf_write_mb_syntax_layer[ISLICE] = ih264e_write_islice_mb;
+ ps_codec->pf_write_mb_syntax_layer[PSLICE] = ih264e_write_pslice_mb;
+
+ /* Padding Functions */
+ ps_codec->pf_pad_top = ih264_pad_top_a9q;
+ ps_codec->pf_pad_bottom = ih264_pad_bottom;
+ ps_codec->pf_pad_left_luma = ih264_pad_left_luma_a9q;
+ ps_codec->pf_pad_left_chroma = ih264_pad_left_chroma_a9q;
+ ps_codec->pf_pad_right_luma = ih264_pad_right_luma_a9q;
+ ps_codec->pf_pad_right_chroma = ih264_pad_right_chroma_a9q;
+
+ /* Inter pred leaf level functions */
+ ps_codec->pf_inter_pred_luma_copy = ih264_inter_pred_luma_copy_a9q;
+ ps_codec->pf_inter_pred_luma_horz = ih264_inter_pred_luma_horz_a9q;
+ ps_codec->pf_inter_pred_luma_vert = ih264_inter_pred_luma_vert_a9q;
+ ps_codec->pf_inter_pred_luma_bilinear = ih264_inter_pred_luma_bilinear_a9q;
+ ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_a9q;
+
+ /* sad me level functions */
+ ps_codec->apf_compute_sad_16x16[0] = ime_compute_sad_16x16_a9q;
+ ps_codec->apf_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_a9q;
+ ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8_a9q;
+
+    /* memory handling operations */
+ ps_codec->pf_mem_cpy = ih264_memcpy_a9q;
+ ps_codec->pf_mem_cpy_mul8 = ih264_memcpy_mul_8_a9q;
+ ps_codec->pf_mem_set = ih264_memset_a9q;
+ ps_codec->pf_mem_set_mul8 = ih264_memset_mul_8_a9q;
+
+ /* sad me level functions */
+ for(i = 0; i < (MAX_PROCESS_CTXT); i++)
+ {
+ ps_proc = &ps_codec->as_process[i];
+ ps_me_ctxt = &ps_proc->s_me_ctxt;
+ ps_me_ctxt->pf_ime_compute_sad_16x16[0] = ime_compute_sad_16x16_a9q;
+ ps_me_ctxt->pf_ime_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_a9q;
+ ps_me_ctxt->pf_ime_compute_sad_16x8 = ime_compute_sad_16x8_a9q;
+ ps_me_ctxt->pf_ime_compute_sad4_diamond = ime_calculate_sad4_prog_a9q;
+ ps_me_ctxt->pf_ime_compute_sad3_diamond = ime_calculate_sad3_prog_a9q;
+ ps_me_ctxt->pf_ime_compute_sad2_diamond = ime_calculate_sad2_prog_a9q;
+ ps_me_ctxt->pf_ime_sub_pel_compute_sad_16x16 = ime_sub_pel_compute_sad_16x16_a9q;
+ ps_me_ctxt->pf_ime_compute_sad_stat_luma_16x16 = ime_compute_satqd_16x16_lumainter_a9q;
+ }
+
+ /* intra mode eval -encoder level function */
+ ps_codec->pf_ih264e_evaluate_intra16x16_modes = ih264e_evaluate_intra16x16_modes_a9q;
+ ps_codec->pf_ih264e_evaluate_intra_chroma_modes = ih264e_evaluate_intra_chroma_modes_a9q;
+ ps_codec->pf_ih264e_evaluate_intra_4x4_modes = ih264e_evaluate_intra_4x4_modes_a9q;
+
+ /* csc */
+ ps_codec->pf_ih264e_conv_420p_to_420sp = ih264e_fmt_conv_420p_to_420sp_a9q;
+ ps_codec->pf_ih264e_fmt_conv_422i_to_420sp = ih264e_fmt_conv_422i_to_420sp_a9q;
+
+    /* Half pel generation functions - encoder level */
+ ps_codec->pf_ih264e_sixtapfilter_horz = ih264e_sixtapfilter_horz_a9q;
+ ps_codec->pf_ih264e_sixtap_filter_2dvh_vert = ih264e_sixtap_filter_2dvh_vert_a9q;
+
+ return ;
+ }
+
diff --git a/encoder/arm/ih264e_function_selector_av8.c b/encoder/arm/ih264e_function_selector_av8.c
new file mode 100755
index 0000000..173c2d5
--- /dev/null
+++ b/encoder/arm/ih264e_function_selector_av8.c
@@ -0,0 +1,259 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264e_function_selector_av8.c
+*
+* @brief
+* Contains functions to initialize function pointers of codec context
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+*  - ih264e_init_function_ptr_neon_av8
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System Include files */
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* User Include files */
+#include "ih264_typedefs.h"
+#include "iv2.h"
+#include "ive2.h"
+#include "ih264_defs.h"
+#include "ih264_size_defs.h"
+#include "ih264e_defs.h"
+#include "ih264e_error.h"
+#include "ih264e_bitstream.h"
+#include "ime_distortion_metrics.h"
+#include "ime_structs.h"
+#include "ih264_defs.h"
+#include "ih264_error.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include "ih264_inter_pred_filters.h"
+#include "ih264_mem_fns.h"
+#include "ih264_padding.h"
+#include "ih264_intra_pred_filters.h"
+#include "ih264_deblk_edge_filters.h"
+
+#include "irc_cntrl_param.h"
+#include "irc_frame_info_collector.h"
+#include "ih264e_rate_control.h"
+#include "ih264e_structs.h"
+#include "ih264e_platform_macros.h"
+#include "ih264_intra_pred_filters.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include "ih264e_defs.h"
+#include "ih264e_structs.h"
+#include "ih264_deblk_edge_filters.h"
+#include "ih264e_core_coding.h"
+#include "ih264_cavlc_tables.h"
+#include "ih264e_cavlc.h"
+#include "ih264_padding.h"
+#include "ih264e_intra_modes_eval.h"
+#include "ih264_mem_fns.h"
+#include "ih264e_fmt_conv.h"
+#include "ih264e_half_pel.h"
+
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the intra/inter/transform/deblk function pointers of
+* codec context
+*
+* @par Description: the current routine initializes the function pointers of
+* codec context based on the architecture in use
+*
+* @param[in] ps_codec
+* Codec context pointer
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264e_init_function_ptr_neon_av8(codec_t *ps_codec)
+{
+
+ WORD32 i= 0;
+
+ /* curr proc ctxt */
+ process_ctxt_t *ps_proc = NULL;
+ me_ctxt_t *ps_me_ctxt = NULL;
+
+ /* Init function pointers for intra pred leaf level functions luma
+ * Intra 16x16 */
+ ps_codec->apf_intra_pred_16_l[0] = ih264_intra_pred_luma_16x16_mode_vert_av8;
+ ps_codec->apf_intra_pred_16_l[1] = ih264_intra_pred_luma_16x16_mode_horz_av8;
+ ps_codec->apf_intra_pred_16_l[2] = ih264_intra_pred_luma_16x16_mode_dc_av8;
+ ps_codec->apf_intra_pred_16_l[3] = ih264_intra_pred_luma_16x16_mode_plane_av8;
+
+ /* Init function pointers for intra pred leaf level functions luma
+ * Intra 4x4 */
+ ps_codec->apf_intra_pred_4_l[0] = ih264_intra_pred_luma_4x4_mode_vert_av8;
+ ps_codec->apf_intra_pred_4_l[1] = ih264_intra_pred_luma_4x4_mode_horz_av8;
+ ps_codec->apf_intra_pred_4_l[2] = ih264_intra_pred_luma_4x4_mode_dc_av8;
+ ps_codec->apf_intra_pred_4_l[3] = ih264_intra_pred_luma_4x4_mode_diag_dl_av8;
+ ps_codec->apf_intra_pred_4_l[4] = ih264_intra_pred_luma_4x4_mode_diag_dr_av8;
+ ps_codec->apf_intra_pred_4_l[5] = ih264_intra_pred_luma_4x4_mode_vert_r_av8;
+ ps_codec->apf_intra_pred_4_l[6] = ih264_intra_pred_luma_4x4_mode_horz_d_av8;
+ ps_codec->apf_intra_pred_4_l[7] = ih264_intra_pred_luma_4x4_mode_vert_l_av8;
+ ps_codec->apf_intra_pred_4_l[8] = ih264_intra_pred_luma_4x4_mode_horz_u_av8;
+
+ /* Init function pointers for intra pred leaf level functions luma
+ * Intra 8x8 */
+ ps_codec->apf_intra_pred_8_l[0] = ih264_intra_pred_luma_8x8_mode_vert_av8;
+ ps_codec->apf_intra_pred_8_l[2] = ih264_intra_pred_luma_8x8_mode_dc_av8;
+ ps_codec->apf_intra_pred_8_l[3] = ih264_intra_pred_luma_8x8_mode_diag_dl_av8;
+ ps_codec->apf_intra_pred_8_l[4] = ih264_intra_pred_luma_8x8_mode_diag_dr_av8;
+ ps_codec->apf_intra_pred_8_l[5] = ih264_intra_pred_luma_8x8_mode_vert_r_av8;
+ ps_codec->apf_intra_pred_8_l[6] = ih264_intra_pred_luma_8x8_mode_horz_d_av8;
+ ps_codec->apf_intra_pred_8_l[7] = ih264_intra_pred_luma_8x8_mode_vert_l_av8;
+ ps_codec->apf_intra_pred_8_l[8] = ih264_intra_pred_luma_8x8_mode_horz_u_av8;
+
+ /* Init function pointers for intra pred leaf level functions chroma
+ * Intra 8x8 */
+ ps_codec->apf_intra_pred_c[0] = ih264_intra_pred_chroma_8x8_mode_dc_av8;
+ ps_codec->apf_intra_pred_c[1] = ih264_intra_pred_chroma_8x8_mode_horz_av8;
+ ps_codec->apf_intra_pred_c[2] = ih264_intra_pred_chroma_8x8_mode_vert_av8;
+ ps_codec->apf_intra_pred_c[3] = ih264_intra_pred_chroma_8x8_mode_plane_av8;
+
+
+ /* Init forward transform fn ptr */
+ ps_codec->pf_resi_trans_quant_8x8 = ih264_resi_trans_quant_8x8;
+ ps_codec->pf_resi_trans_quant_4x4 = ih264_resi_trans_quant_4x4_av8;
+ ps_codec->pf_resi_trans_quant_chroma_4x4 = ih264_resi_trans_quant_chroma_4x4_av8;
+ ps_codec->pf_hadamard_quant_4x4 = ih264_hadamard_quant_4x4_av8;
+ ps_codec->pf_hadamard_quant_2x2_uv = ih264_hadamard_quant_2x2_uv_av8;
+
+ /* Init inverse transform fn ptr */
+ ps_codec->pf_iquant_itrans_recon_8x8 = ih264_iquant_itrans_recon_8x8_av8;
+ ps_codec->pf_iquant_itrans_recon_4x4 = ih264_iquant_itrans_recon_4x4_av8;
+ ps_codec->pf_iquant_itrans_recon_4x4_dc = ih264_iquant_itrans_recon_4x4_dc_av8;
+ ps_codec->pf_iquant_itrans_recon_chroma_4x4 = ih264_iquant_itrans_recon_chroma_4x4_av8;
+ ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc = ih264_iquant_itrans_recon_chroma_4x4_dc_av8;
+ ps_codec->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_av8;
+ ps_codec->pf_ihadamard_scaling_2x2_uv = ih264_ihadamard_scaling_2x2_uv_av8;
+ ps_codec->pf_interleave_copy = ih264_interleave_copy_av8;
+
+ /* Init fn ptr luma core coding */
+ ps_codec->luma_energy_compaction[0] = ih264e_code_luma_intra_macroblock_16x16;
+ ps_codec->luma_energy_compaction[1] = ih264e_code_luma_intra_macroblock_4x4;
+ ps_codec->luma_energy_compaction[3] = ih264e_code_luma_inter_macroblock_16x16;
+
+ /* Init fn ptr chroma core coding */
+ ps_codec->chroma_energy_compaction[0] = ih264e_code_chroma_intra_macroblock_8x8;
+ ps_codec->chroma_energy_compaction[1] = ih264e_code_chroma_inter_macroblock_8x8;
+
+ /* Init fn ptr luma deblocking */
+ ps_codec->pf_deblk_luma_vert_bs4 = ih264_deblk_luma_vert_bs4_av8;
+ ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_av8;
+ ps_codec->pf_deblk_luma_horz_bs4 = ih264_deblk_luma_horz_bs4_av8;
+ ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_av8;
+
+ /* Init fn ptr chroma deblocking */
+ ps_codec->pf_deblk_chroma_vert_bs4 = ih264_deblk_chroma_vert_bs4_av8;
+ ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_av8;
+ ps_codec->pf_deblk_chroma_horz_bs4 = ih264_deblk_chroma_horz_bs4_av8;
+ ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_av8;
+
+ /* write mb syntax layer */
+ ps_codec->pf_write_mb_syntax_layer[ISLICE] = ih264e_write_islice_mb;
+ ps_codec->pf_write_mb_syntax_layer[PSLICE] = ih264e_write_pslice_mb;
+
+ /* Padding Functions */
+ ps_codec->pf_pad_top = ih264_pad_top_av8;
+ ps_codec->pf_pad_bottom = ih264_pad_bottom;
+ ps_codec->pf_pad_left_luma = ih264_pad_left_luma_av8;
+ ps_codec->pf_pad_left_chroma = ih264_pad_left_chroma_av8;
+ ps_codec->pf_pad_right_luma = ih264_pad_right_luma_av8;
+ ps_codec->pf_pad_right_chroma = ih264_pad_right_chroma_av8;
+
+ /* Inter pred leaf level functions */
+ ps_codec->pf_inter_pred_luma_copy = ih264_inter_pred_luma_copy_av8;
+ ps_codec->pf_inter_pred_luma_horz = ih264_inter_pred_luma_horz_av8;
+ ps_codec->pf_inter_pred_luma_vert = ih264_inter_pred_luma_vert_av8;
+ ps_codec->pf_inter_pred_luma_bilinear = ih264_inter_pred_luma_bilinear;
+ ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_av8;
+
+ /* sad me level functions */
+ ps_codec->apf_compute_sad_16x16[0] = ime_compute_sad_16x16_av8;
+ ps_codec->apf_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_av8;
+ ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8_av8;
+
+ /* memory handling operations */
+ ps_codec->pf_mem_cpy = ih264_memcpy_av8;
+ ps_codec->pf_mem_cpy_mul8 = ih264_memcpy_mul_8_av8;
+ ps_codec->pf_mem_set = ih264_memset_av8;
+ ps_codec->pf_mem_set_mul8 = ih264_memset_mul_8_av8;
+
+ /* sad me level functions */
+ for(i = 0; i < (MAX_PROCESS_CTXT); i++)
+ {
+ ps_proc = &ps_codec->as_process[i];
+ ps_me_ctxt = &ps_proc->s_me_ctxt;
+ ps_me_ctxt->pf_ime_compute_sad_16x16[0] = ime_compute_sad_16x16_av8;
+ ps_me_ctxt->pf_ime_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_av8;
+ ps_me_ctxt->pf_ime_compute_sad_16x8 = ime_compute_sad_16x8_av8;
+ ps_me_ctxt->pf_ime_compute_sad4_diamond = ime_calculate_sad4_prog_av8;
+ ps_me_ctxt->pf_ime_compute_sad3_diamond = ime_calculate_sad3_prog_av8;
+ ps_me_ctxt->pf_ime_compute_sad2_diamond = ime_calculate_sad2_prog_av8;
+ ps_me_ctxt->pf_ime_sub_pel_compute_sad_16x16 = ime_sub_pel_compute_sad_16x16_av8;
+ ps_me_ctxt->pf_ime_compute_sad_stat_luma_16x16 = ime_compute_satqd_16x16_lumainter_av8;
+ }
+
+ /* intra mode eval - encoder level function */
+ ps_codec->pf_ih264e_evaluate_intra16x16_modes = ih264e_evaluate_intra16x16_modes_av8;
+ ps_codec->pf_ih264e_evaluate_intra_chroma_modes = ih264e_evaluate_intra_chroma_modes_av8;
+ ps_codec->pf_ih264e_evaluate_intra_4x4_modes = ih264e_evaluate_intra_4x4_modes;
+
+ /* csc */
+ ps_codec->pf_ih264e_conv_420p_to_420sp = ih264e_fmt_conv_420p_to_420sp;
+ ps_codec->pf_ih264e_fmt_conv_422i_to_420sp = ih264e_fmt_conv_422i_to_420sp;
+
+ /* Half pel generation functions - encoder level */
+ ps_codec->pf_ih264e_sixtapfilter_horz = ih264e_sixtapfilter_horz_av8;
+ ps_codec->pf_ih264e_sixtap_filter_2dvh_vert = ih264e_sixtap_filter_2dvh_vert_av8;
+
+ return ;
+ }
+
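+/* Illustrative note (not part of the library sources): the per-architecture
+ * initializers above are expected to be picked at run time by a dispatcher such
+ * as ih264e_init_function_ptr(), declared in ih264e_platform_macros.h. A minimal
+ * hypothetical sketch follows; the IV_ARCH_T enumerator names are indicative
+ * only and are an assumption here:
+ *
+ *     switch(ih264e_default_arch())
+ *     {
+ *         case ARCH_ARMV8_GENERIC:
+ *             ih264e_init_function_ptr_neon_av8(ps_codec);
+ *             break;
+ *         case ARCH_ARM_A9Q:
+ *             ih264e_init_function_ptr_neon_a9q(ps_codec);
+ *             break;
+ *         default:
+ *             ih264e_init_function_ptr_generic(ps_codec);
+ *             break;
+ *     }
+ */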
diff --git a/encoder/arm/ih264e_half_pel.s b/encoder/arm/ih264e_half_pel.s
new file mode 100755
index 0000000..1b9a87a
--- /dev/null
+++ b/encoder/arm/ih264e_half_pel.s
@@ -0,0 +1,951 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264e_half_pel.s
+@ *
+@ * @brief
+@ *  Contains assembly routines for half pel plane generation via six tap filtering
+@ *
+@ * @author
+@ * Ittiam
+@ *
+@ * @par List of Functions:
+@ * ih264e_sixtapfilter_horz
+@ * ih264e_sixtap_filter_2dvh_vert
+@
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+
+
+.text
+.p2align 2
+
+@ /**
+@/*******************************************************************************
+@*
+@* @brief
+@* Inter prediction luma filter for horizontal input (filter is run for width = 17 and height = 16)
+@*
+@* @par Description:
+@* Applies a 6 tap horizontal filter. The output is clipped to 8 bits as per
+@* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264e_sixtapfilter_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd);
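+
+@/* Illustrative reference (hedged C sketch, not the library's C code): each half
+@ * pel sample between columns col and col+1 is a six tap sum with weights
+@ * (1, -5, 20, 20, -5, 1), rounded and clipped to 8 bits. CLIP_U8 is a
+@ * hypothetical clamp to [0, 255]; n_rows/n_cols stand for the processed extent.
+@ *
+@ *     for(row = 0; row < n_rows; row++)      // two rows per iteration in the asm
+@ *     {
+@ *         for(col = 0; col < n_cols; col++)  // 17 results needed, 24 computed
+@ *         {
+@ *             WORD32 acc = pu1_src[col - 2] + pu1_src[col + 3]
+@ *                        + 20 * (pu1_src[col] + pu1_src[col + 1])
+@ *                        - 5  * (pu1_src[col - 1] + pu1_src[col + 2]);
+@ *             pu1_dst[col] = CLIP_U8((acc + 16) >> 5);
+@ *         }
+@ *         pu1_src += src_strd;
+@ *         pu1_dst += dst_strd;
+@ *     }
+@ */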
+
+
+.equ HALFPEL_WIDTH , 17 + 1 @ (make it even, two rows are processed at a time)
+
+
+ .global ih264e_sixtapfilter_horz_a9q
+ih264e_sixtapfilter_horz_a9q:
+ stmfd sp!, {lr}
+
+ vmov.i8 d0, #5
+ sub r0, r0, #2
+
+ vmov.i8 d1, #20
+ mov r14, #HALFPEL_WIDTH
+ vpush {d8-d15}
+
+filter_horz_loop:
+
+
+ vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0
+ vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1
+
+ @// Processing row0 and row1
+
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0)
+
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d29, d4, d4, #5 @//extract a[5] (column3,row0)
+ vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0)
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vaddl.u8 q6, d29, d4 @// a0 + a5 (column3,row0)
+ vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d26, d7, d7, #5 @//extract a[5] (column3,row1)
+
+ vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1)
+ vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0)
+ vaddl.u8 q9, d26, d7 @// a0 + a5 (column3,row1)
+ vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vext.8 d29, d4, d4, #2 @//extract a[2] (column3,row0)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1)
+ vmlal.u8 q6, d29, d1 @// a0 + a5 + 20a2 (column3,row0)
+ vext.8 d27, d6, d7, #2 @//extract a[2] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vext.8 d26, d7, d7, #2 @//extract a[2] (column3,row1)
+
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1)
+ vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q9, d26, d1 @// a0 + a5 + 20a2 (column3,row1)
+ vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vext.8 d29, d4, d4, #3 @//extract a[3] (column3,row0)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1)
+ vmlal.u8 q6, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0)
+ vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vext.8 d26, d7, d7, #3 @//extract a[3] (column3,row1)
+
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1)
+ vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0)
+ vmlal.u8 q9, d26, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row1)
+ vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vext.8 d29, d4, d4, #1 @//extract a[1] (column3,row0)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1)
+ vmlsl.u8 q6, d29, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
+ vext.8 d27, d6, d7, #1 @//extract a[1] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vext.8 d26, d7, d7, #1 @//extract a[1] (column3,row1)
+
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
+ vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlsl.u8 q9, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row1)
+ vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vext.8 d29, d4, d4, #4 @//extract a[4] (column3,row0)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+ vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1)
+ vmlsl.u8 q6, d29, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
+ vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vext.8 d26, d7, d7, #4 @//extract a[4] (column3,row1)
+
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
+ vmlsl.u8 q9, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row1)
+
+ vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vqrshrun.s16 d22, q6, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
+ vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vqrshrun.s16 d24, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
+ vqrshrun.s16 d25, q9, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row1)
+
+ vst1.8 {d20, d21, d22}, [r1], r3 @//Store dest row0
+ vst1.8 {d23, d24, d25}, [r1], r3 @//Store dest row1
+
+ subs r14, r14, #2 @ decrement counter
+
+ bne filter_horz_loop
+
+ vpop {d8-d15}
+ ldmfd sp!, {pc}
+
+
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements a two stage cascaded six tap filter. It
+@* applies the six tap filter in the vertical direction on the
+@* predictor values, followed by applying the same filter in the
+@* horizontal direction on the output of the first stage. The six tap
+@* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
+@* interpolation process"
+@* (Filter is run for width = 17 and height = 17)
+@* @par Description:
+@* The function interpolates
+@* the predictors first in the vertical direction and then in the
+@* horizontal direction to output the (1/2,1/2) sample. The output of the first
+@* stage of the filter is stored in the buffer pointed to by pi16_pred1 (only in
+@* the C implementation) in 16 bit precision.
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst1
+@* UWORD8 pointer to the destination(vertical filtered output)
+@*
+@* @param[out] pu1_dst2
+@* UWORD8 pointer to the destination (output after applying the horizontal filter to the intermediate vertical output)
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride of pu1_dst
+@*
+@* @param[in] pi16_pred1
+@* Pointer to the 16 bit intermediate buffer (used only in the C implementation)
+@*
+@* @param[in] pi16_pred1_strd
+@* integer destination stride of pi16_pred1
+@*
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst1,
+@ UWORD8 *pu1_dst2,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 *pi16_pred1,/* Pointer to 16 bit intermediate buffer (used only in C) */
+@ WORD32 pi16_pred1_strd)
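+
+@/* Illustrative reference (hedged C sketch, not the library's C code): stage 1
+@ * applies the vertical six tap filter keeping 16 bit precision (pi2_tmp is a
+@ * hypothetical scratch row), stage 2 filters that intermediate horizontally.
+@ * CLIP_U8 is a hypothetical clamp to [0, 255].
+@ *
+@ *     for(row = 0; row < 17; row++)
+@ *     {
+@ *         for(col = -2; col < 17 + 3; col++)
+@ *             pi2_tmp[col] = pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd]
+@ *                          + 20 * (pu1_src[col] + pu1_src[col + src_strd])
+@ *                          - 5  * (pu1_src[col - src_strd] + pu1_src[col + 2 * src_strd]);
+@ *         for(col = 0; col < 17; col++)
+@ *         {
+@ *             WORD32 acc = pi2_tmp[col - 2] + pi2_tmp[col + 3]
+@ *                        + 20 * (pi2_tmp[col] + pi2_tmp[col + 1])
+@ *                        - 5  * (pi2_tmp[col - 1] + pi2_tmp[col + 2]);
+@ *             pu1_dst1[col] = CLIP_U8((pi2_tmp[col] + 16) >> 5);   // (0, 1/2) sample
+@ *             pu1_dst2[col] = CLIP_U8((acc + 512) >> 10);          // (1/2, 1/2) sample
+@ *         }
+@ *         pu1_src += src_strd; pu1_dst1 += dst_strd; pu1_dst2 += dst_strd;
+@ *     }
+@ */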
+
+
+
+
+ .global ih264e_sixtap_filter_2dvh_vert_a9q
+
+ih264e_sixtap_filter_2dvh_vert_a9q:
+ stmfd sp!, {r10, r11, r12, lr}
+
+@//r0 - pu1_src
+@//r3 - src_strd
+ vpush {d8-d15}
+ @// Load six rows for vertical interpolation
+ lsl r12, r3, #1
+ sub r0, r0, r12
+ sub r0, r0, #2
+ vld1.8 {d2, d3, d4}, [r0], r3
+ vld1.8 {d5, d6, d7}, [r0], r3
+ vld1.8 {d8, d9, d10}, [r0], r3
+ mov r12, #5
+ vld1.8 {d11, d12, d13}, [r0], r3
+ mov r14, #20
+ vld1.8 {d14, d15, d16}, [r0], r3
+ vmov.16 d0[0], r12
+ vmov.16 d0[1], r14
+ vld1.8 {d17, d18, d19}, [r0], r3
+ vmov.i8 d1, #20
+
+@// r12 - u2_buff1_width
+@// r14 - u2_buff2_width
+ ldr r12, [sp, #80]
+ add r11, r1, #6
+
+ mov r14, r12
+
+ mov r10, #3 @loop counter
+
+
+filter_2dvh_loop:
+
+ @// ////////////// ROW 1 ///////////////////////
+
+@// Process first vertical interpolated row
+@// each column is
+ vaddl.u8 q10, d2, d17 @// a0 + a5 (column1,row0)
+ vmov.i8 d31, #5
+ vmlal.u8 q10, d8, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlal.u8 q10, d11, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlsl.u8 q10, d5, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q10, d14, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+
+
+ vaddl.u8 q11, d3, d18 @// a0 + a5 (column2,row0)
+ vmlal.u8 q11, d9, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vmlal.u8 q11, d12, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vmlsl.u8 q11, d6, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vmlsl.u8 q11, d15, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+ vext.16 d30, d20, d21, #2 @//extract a[2] (set1)
+
+ vaddl.u8 q12, d4, d19 @// a0 + a5 (column3,row0)
+ vext.16 d29, d20, d21, #3 @//extract a[3] (set1)
+ vmlal.u8 q12, d10, d1 @// a0 + a5 + 20a2 (column3,row0)
+ vmlal.u8 q12, d13, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0)
+ vmlsl.u8 q12, d7, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
+ vmlsl.u8 q12, d16, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
+
+ vqrshrun.s16 d2, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vext.16 d31, d21, d22, #1 @//extract a[5] (set1)
+ vqrshrun.s16 d3, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vext.16 d28, d20, d21, #1 @//extract a[1] (set1)
+
+ vaddl.s16 q13, d31, d20 @// a0 + a5 (set1)
+ vext.16 d31, d22, d23, #1 @//extract a[5] (set2)
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
+ vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
+ vext.16 d30, d21, d22, #2 @//extract a[2] (set2)
+
+ vqrshrun.s16 d4, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
+ vext.16 d29, d21, d22, #3 @//extract a[3] (set2)
+
+ vext.16 d28, d21, d22, #1 @//extract a[1] (set2)
+ vaddl.s16 q10, d31, d21 @// a0 + a5 (set2)
+ vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2)
+ vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2)
+ vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
+ vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
+ vext.16 d31, d23, d24, #1 @//extract a[5] (set3)
+
+ vext.8 d2, d2, d3, #2
+ vst1.8 {d3, d4}, [r11], r12 @// store row1 - 1,1/2 grid
+ vst1.8 {d2}, [r1], r12 @// store row1 - 1,1/2 grid
+
+ vext.16 d30, d22, d23, #2 @//extract a[2] (set3)
+ vext.16 d29, d22, d23, #3 @//extract a[3] (set3)
+
+ vaddl.s16 q1, d31, d22 @// a0 + a5 (set3)
+ vext.16 d28, d22, d23, #1 @//extract a[1] (set3)
+ vmlal.s16 q1, d30, d0[1] @// a0 + a5 + 20a2 (set3)
+ vmlal.s16 q1, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3)
+ vmlsl.s16 q1, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
+ vmlsl.s16 q1, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
+ vext.16 d31, d24, d25, #1 @//extract a[5] (set4)
+
+ vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2)
+ vext.16 d30, d23, d24, #2 @//extract a[2] (set4)
+ vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1)
+ vext.16 d29, d23, d24, #3 @//extract a[3] (set4)
+
+ vaddl.s16 q13, d31, d23 @// a0 + a5 (set4)
+ vext.16 d28, d23, d24, #1 @//extract a[1] (set4)
+ vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
+ vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
+ vext.16 d30, d24, d25, #2 @//extract a[2] (set5)
+
+ vaddl.s16 q11, d31, d24 @// a0 + a5 (set5)
+ vext.16 d29, d24, d25, #3 @//extract a[3] (set5)
+
+ vext.16 d31, d24, d25, #1 @//extract a[1] (set5)
+ vshrn.s32 d28, q1, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3)
+
+ vld1.8 {d2, d3, d4}, [r0], r3 @// Load next Row data
+ vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5)
+ vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5)
+ vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
+ vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
+ vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4)
+ vqrshrun.s16 d26, q10, #2 @// half,half grid set1,2
+
+
+ @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half grid set3,4
+ @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half grid set5
+
+ @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1/2 grid values
+ @// ////////////// ROW 2 ///////////////////////
+
+@// Process first vertical interpolated row
+@// each column is
+ vaddl.u8 q10, d5, d2 @// a0 + a5 (column1,row0)
+ vmov.i8 d31, #5
+ vmlal.u8 q10, d11, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlal.u8 q10, d14, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlsl.u8 q10, d8, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q10, d17, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+
+ vqrshrun.s16 d27, q14, #2 @// half,half grid set3,4
+ vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ vaddl.u8 q11, d6, d3 @// a0 + a5 (column2,row0)
+ vmlal.u8 q11, d12, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vmlal.u8 q11, d15, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vmlsl.u8 q11, d9, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vmlsl.u8 q11, d18, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+
+ vqrshrun.s16 d28, q14, #2 @// half,half grid set5
+ vext.16 d30, d20, d21, #2 @//extract a[2] (set1)
+
+ vaddl.u8 q12, d7, d4 @// a0 + a5 (column3,row0)
+ vext.16 d29, d20, d21, #3 @//extract a[3] (set1)
+ vmlal.u8 q12, d13, d1 @// a0 + a5 + 20a2 (column3,row0)
+ vmlal.u8 q12, d16, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0)
+ vmlsl.u8 q12, d10, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
+ vmlsl.u8 q12, d19, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
+ vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1/2 grid values
+
+ vqrshrun.s16 d5, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vext.16 d31, d21, d22, #1 @//extract a[5] (set1)
+ vqrshrun.s16 d6, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vext.16 d28, d20, d21, #1 @//extract a[1] (set1)
+
+ vaddl.s16 q13, d31, d20 @// a0 + a5 (set1)
+ vext.16 d31, d22, d23, #1 @//extract a[5] (set2)
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
+ vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
+ vext.16 d30, d21, d22, #2 @//extract a[2] (set2)
+
+ vqrshrun.s16 d7, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
+ vext.16 d29, d21, d22, #3 @//extract a[3] (set2)
+
+ vext.16 d28, d21, d22, #1 @//extract a[1] (set2)
+ vaddl.s16 q10, d31, d21 @// a0 + a5 (set2)
+ vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2)
+ vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2)
+ vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
+ vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
+ vext.16 d31, d23, d24, #1 @//extract a[5] (set3)
+
+ vext.8 d5, d5, d6, #2
+ vst1.8 {d6, d7}, [r11], r12 @// store row1 - 1,1/2 grid
+ vst1.8 {d5}, [r1], r12 @// store row1 - 1,1/2 grid
+
+ vext.16 d30, d22, d23, #2 @//extract a[2] (set3)
+ vext.16 d29, d22, d23, #3 @//extract a[3] (set3)
+
+ vaddl.s16 q3, d31, d22 @// a0 + a5 (set3)
+ vext.16 d28, d22, d23, #1 @//extract a[1] (set3)
+ vmlal.s16 q3, d30, d0[1] @// a0 + a5 + 20a2 (set3)
+ vmlal.s16 q3, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3)
+ vmlsl.s16 q3, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
+ vmlsl.s16 q3, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
+ vext.16 d31, d24, d25, #1 @//extract a[5] (set4)
+
+ vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2)
+ vext.16 d30, d23, d24, #2 @//extract a[2] (set4)
+ vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1)
+ vext.16 d29, d23, d24, #3 @//extract a[3] (set4)
+
+ vaddl.s16 q13, d31, d23 @// a0 + a5 (set4)
+ vext.16 d28, d23, d24, #1 @//extract a[1] (set4)
+ vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
+ vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
+ vext.16 d30, d24, d25, #2 @//extract a[2] (set5)
+
+ vaddl.s16 q11, d31, d24 @// a0 + a5 (set5)
+ vext.16 d29, d24, d25, #3 @//extract a[3] (set5)
+
+ vext.16 d31, d24, d25, #1 @//extract a[1] (set5)
+ vshrn.s32 d28, q3, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3)
+
+ vld1.8 {d5, d6, d7}, [r0], r3 @// Load next Row data
+ vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5)
+ vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5)
+ vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
+ vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
+ vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4)
+ vqrshrun.s16 d26, q10, #2 @// half,half grid set1,2
+
+
+ @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half grid set3,4
+ @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half grid set5
+
+ @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1/2 grid values
+ @// ////////////// ROW 3 ///////////////////////
+
+@// Process first vertical interpolated row
+@// each column is
+ vaddl.u8 q10, d8, d5 @// a0 + a5 (column1,row0)
+ vmov.i8 d31, #5
+ vmlal.u8 q10, d14, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlal.u8 q10, d17, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlsl.u8 q10, d11, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q10, d2, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+
+ vqrshrun.s16 d27, q14, #2 @// half,half grid set3,4
+ vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ vaddl.u8 q11, d9, d6 @// a0 + a5 (column2,row0)
+ vmlal.u8 q11, d15, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vmlal.u8 q11, d18, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vmlsl.u8 q11, d12, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vmlsl.u8 q11, d3, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+
+ vqrshrun.s16 d28, q14, #2 @// half,half grid set5
+ vext.16 d30, d20, d21, #2 @//extract a[2] (set1)
+
+ vaddl.u8 q12, d10, d7 @// a0 + a5 (column3,row0)
+ vext.16 d29, d20, d21, #3 @//extract a[3] (set1)
+ vmlal.u8 q12, d16, d1 @// a0 + a5 + 20a2 (column3,row0)
+ vmlal.u8 q12, d19, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0)
+ vmlsl.u8 q12, d13, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
+ vmlsl.u8 q12, d4, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
+
+ vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1/2 grid values
+
+ vqrshrun.s16 d8, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vext.16 d31, d21, d22, #1 @//extract a[5] (set1)
+ vqrshrun.s16 d9, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vext.16 d28, d20, d21, #1 @//extract a[1] (set1)
+
+ vaddl.s16 q13, d31, d20 @// a0 + a5 (set1)
+ vext.16 d31, d22, d23, #1 @//extract a[5] (set2)
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
+ vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
+ vext.16 d30, d21, d22, #2 @//extract a[2] (set2)
+
+ vqrshrun.s16 d10, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
+ vext.16 d29, d21, d22, #3 @//extract a[3] (set2)
+
+ vext.16 d28, d21, d22, #1 @//extract a[1] (set2)
+ vaddl.s16 q10, d31, d21 @// a0 + a5 (set2)
+ vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2)
+ vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2)
+ vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
+ vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
+ vext.16 d31, d23, d24, #1 @//extract a[5] (set3)
+
+ vext.8 d8, d8, d9, #2
+ vst1.8 {d9, d10}, [r11], r12 @// store row1 - 1,1/2 grid
+ vst1.8 {d8}, [r1], r12 @// store row1 - 1,1/2 grid
+
+ vext.16 d30, d22, d23, #2 @//extract a[2] (set3)
+ vext.16 d29, d22, d23, #3 @//extract a[3] (set3)
+
+ vaddl.s16 q4, d31, d22 @// a0 + a5 (set3)
+ vext.16 d28, d22, d23, #1 @//extract a[1] (set3)
+ vmlal.s16 q4, d30, d0[1] @// a0 + a5 + 20a2 (set3)
+ vmlal.s16 q4, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3)
+ vmlsl.s16 q4, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
+ vmlsl.s16 q4, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
+ vext.16 d31, d24, d25, #1 @//extract a[5] (set4)
+
+ vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2)
+ vext.16 d30, d23, d24, #2 @//extract a[2] (set4)
+ vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1)
+ vext.16 d29, d23, d24, #3 @//extract a[3] (set4)
+
+ vaddl.s16 q13, d31, d23 @// a0 + a5 (set4)
+ vext.16 d28, d23, d24, #1 @//extract a[1] (set4)
+ vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
+ vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
+ vext.16 d30, d24, d25, #2 @//extract a[2] (set5)
+
+ vaddl.s16 q11, d31, d24 @// a0 + a5 (set5)
+ vext.16 d29, d24, d25, #3 @//extract a[3] (set5)
+
+ vext.16 d31, d24, d25, #1 @//extract a[1] (set5)
+ vshrn.s32 d28, q4, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3)
+
+ vld1.8 {d8, d9, d10}, [r0], r3 @// Load next Row data
+ vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5)
+ vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5)
+ vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
+ vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
+ vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4)
+ vqrshrun.s16 d26, q10, #2 @// half,half grid set1,2
+
+
+ @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half grid set3,4
+ @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half grid set5
+
+ @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1/2 grid values
+ @// ////////////// ROW 4 ///////////////////////
+
+@// Process first vertical interpolated row
+@// each column is
+ vaddl.u8 q10, d11, d8 @// a0 + a5 (column1,row0)
+ vmov.i8 d31, #5
+ vmlal.u8 q10, d17, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlal.u8 q10, d2, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlsl.u8 q10, d14, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q10, d5, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+
+ vqrshrun.s16 d27, q14, #2 @// half,half grid set3,4
+ vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ vaddl.u8 q11, d12, d9 @// a0 + a5 (column2,row0)
+ vmlal.u8 q11, d18, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vmlal.u8 q11, d3, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vmlsl.u8 q11, d15, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vmlsl.u8 q11, d6, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+
+ vqrshrun.s16 d28, q14, #2 @// half,half grid set5
+ vext.16 d30, d20, d21, #2 @//extract a[2] (set1)
+
+ vaddl.u8 q12, d13, d10 @// a0 + a5 (column3,row0)
+ vext.16 d29, d20, d21, #3 @//extract a[3] (set1)
+ vmlal.u8 q12, d19, d1 @// a0 + a5 + 20a2 (column3,row0)
+ vmlal.u8 q12, d4, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0)
+ vmlsl.u8 q12, d16, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
+ vmlsl.u8 q12, d7, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
+
+ vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1/2 grid values
+
+ vqrshrun.s16 d11, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vext.16 d31, d21, d22, #1 @//extract a[5] (set1)
+ vqrshrun.s16 d12, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vext.16 d28, d20, d21, #1 @//extract a[1] (set1)
+
+ vaddl.s16 q13, d31, d20 @// a0 + a5 (set1)
+ vext.16 d31, d22, d23, #1 @//extract a[5] (set2)
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
+ vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
+ vext.16 d30, d21, d22, #2 @//extract a[2] (set2)
+
+ vqrshrun.s16 d13, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
+ vext.16 d29, d21, d22, #3 @//extract a[3] (set2)
+
+ vext.16 d28, d21, d22, #1 @//extract a[1] (set2)
+ vaddl.s16 q10, d31, d21 @// a0 + a5 (set2)
+ vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2)
+ vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2)
+ vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
+ vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
+ vext.16 d31, d23, d24, #1 @//extract a[5] (set3)
+
+ vext.8 d11, d11, d12, #2
+ vst1.8 {d12, d13}, [r11], r12 @// store row1 - 1,1/2 grid
+ vst1.8 {d11}, [r1], r12 @// store row1 - 1,1/2 grid
+
+ vext.16 d30, d22, d23, #2 @//extract a[2] (set3)
+ vext.16 d29, d22, d23, #3 @//extract a[3] (set3)
+
+ vaddl.s16 q6, d31, d22 @// a0 + a5 (set3)
+ vext.16 d28, d22, d23, #1 @//extract a[1] (set3)
+ vmlal.s16 q6, d30, d0[1] @// a0 + a5 + 20a2 (set3)
+ vmlal.s16 q6, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3)
+ vmlsl.s16 q6, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
+ vmlsl.s16 q6, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
+ vext.16 d31, d24, d25, #1 @//extract a[5] (set4)
+
+ vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2)
+ vext.16 d30, d23, d24, #2 @//extract a[2] (set4)
+ vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1)
+ vext.16 d29, d23, d24, #3 @//extract a[3] (set4)
+
+ vaddl.s16 q13, d31, d23 @// a0 + a5 (set4)
+ vext.16 d28, d23, d24, #1 @//extract a[1] (set4)
+ vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
+ vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
+ vext.16 d30, d24, d25, #2 @//extract a[2] (set5)
+
+ vaddl.s16 q11, d31, d24 @// a0 + a5 (set5)
+ vext.16 d29, d24, d25, #3 @//extract a[3] (set5)
+
+ vext.16 d31, d24, d25, #1 @//extract a[1] (set5)
+ vshrn.s32 d28, q6, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3)
+
+ vld1.8 {d11, d12, d13}, [r0], r3 @// Load next Row data
+ vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5)
+ vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5)
+ vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
+ vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
+ vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4)
+ vqrshrun.s16 d26, q10, #2 @// half,half grid set1,2
+
+
+ @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half grid set3,4
+ @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half grid set5
+
+ @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1/2 grid values
+ @// ////////////// ROW 5 ///////////////////////
+
+@// Process first vertical interpolated row
+@// each column is
+ vaddl.u8 q10, d14, d11 @// a0 + a5 (column1,row0)
+ vmov.i8 d31, #5
+ vmlal.u8 q10, d2, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlal.u8 q10, d5, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlsl.u8 q10, d17, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q10, d8, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+
+ vqrshrun.s16 d27, q14, #2 @// half,half grid set3,4
+ vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ vaddl.u8 q11, d15, d12 @// a0 + a5 (column2,row0)
+ vmlal.u8 q11, d3, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vmlal.u8 q11, d6, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vmlsl.u8 q11, d18, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vmlsl.u8 q11, d9, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+
+ vqrshrun.s16 d28, q14, #2 @// half,half grid set5
+ vext.16 d30, d20, d21, #2 @//extract a[2] (set1)
+
+ vaddl.u8 q12, d16, d13 @// a0 + a5 (column3,row0)
+ vext.16 d29, d20, d21, #3 @//extract a[3] (set1)
+ vmlal.u8 q12, d4, d1 @// a0 + a5 + 20a2 (column3,row0)
+ vmlal.u8 q12, d7, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0)
+ vmlsl.u8 q12, d19, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
+ vmlsl.u8 q12, d10, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
+
+ vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1/2 grid values
+
+ vqrshrun.s16 d14, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vext.16 d31, d21, d22, #1 @//extract a[5] (set1)
+ vqrshrun.s16 d15, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vext.16 d28, d20, d21, #1 @//extract a[1] (set1)
+
+ vaddl.s16 q13, d31, d20 @// a0 + a5 (set1)
+ vext.16 d31, d22, d23, #1 @//extract a[5] (set2)
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
+ vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
+ vext.16 d30, d21, d22, #2 @//extract a[2] (set2)
+
+ vqrshrun.s16 d16, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
+ vext.16 d29, d21, d22, #3 @//extract a[3] (set2)
+
+ vext.16 d28, d21, d22, #1 @//extract a[1] (set2)
+ vaddl.s16 q10, d31, d21 @// a0 + a5 (set2)
+ vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2)
+ vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2)
+ vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
+ vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
+ vext.16 d31, d23, d24, #1 @//extract a[5] (set3)
+
+ vext.8 d14, d14, d15, #2
+ vst1.8 {d15, d16}, [r11], r12 @// store row1 - 1,1/2 grid
+ vst1.8 {d14}, [r1], r12 @// store row1 - 1,1/2 grid
+
+ vext.16 d30, d22, d23, #2 @//extract a[2] (set3)
+ vext.16 d29, d22, d23, #3 @//extract a[3] (set3)
+
+ vaddl.s16 q7, d31, d22 @// a0 + a5 (set3)
+ vext.16 d28, d22, d23, #1 @//extract a[1] (set3)
+ vmlal.s16 q7, d30, d0[1] @// a0 + a5 + 20a2 (set3)
+ vmlal.s16 q7, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3)
+ vmlsl.s16 q7, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
+ vmlsl.s16 q7, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
+ vext.16 d31, d24, d25, #1 @//extract a[5] (set4)
+
+ vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2)
+ vext.16 d30, d23, d24, #2 @//extract a[2] (set4)
+ vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1)
+ vext.16 d29, d23, d24, #3 @//extract a[3] (set4)
+
+ vaddl.s16 q13, d31, d23 @// a0 + a5 (set4)
+ vext.16 d28, d23, d24, #1 @//extract a[1] (set4)
+ vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
+ vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
+ vext.16 d30, d24, d25, #2 @//extract a[2] (set5)
+
+ vaddl.s16 q11, d31, d24 @// a0 + a5 (set5)
+ vext.16 d29, d24, d25, #3 @//extract a[3] (set5)
+
+ vext.16 d31, d24, d25, #1 @//extract a[1] (set5)
+ vshrn.s32 d28, q7, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3)
+
+ vld1.8 {d14, d15, d16}, [r0], r3 @// Load next Row data
+ vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5)
+ vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5)
+ vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
+ vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
+ vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4)
+ vqrshrun.s16 d26, q10, #2 @// half,half grid set1,2
+
+
+ @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half grid set3,4
+ @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half grid set5
+
+ @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1/2 grid values
+ @// ////////////// ROW 6 ///////////////////////
+
+@// Process first vertical interpolated row
+@// each column is
+
+ cmp r10, #1 @// if 17 rows are complete, skip
+ beq filter_2dvh_skip_row
+ vaddl.u8 q10, d17, d14 @// a0 + a5 (column1,row0)
+ vmov.i8 d31, #5
+ vmlal.u8 q10, d5, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlal.u8 q10, d8, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlsl.u8 q10, d2, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q10, d11, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+
+ vqrshrun.s16 d27, q14, #2 @// half,half grid set3,4
+ vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ vaddl.u8 q11, d18, d15 @// a0 + a5 (column2,row0)
+ vmlal.u8 q11, d6, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vmlal.u8 q11, d9, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vmlsl.u8 q11, d3, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vmlsl.u8 q11, d12, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+
+ vqrshrun.s16 d28, q14, #2 @// half,half grid set5
+ vext.16 d30, d20, d21, #2 @//extract a[2] (set1)
+
+ vaddl.u8 q12, d19, d16 @// a0 + a5 (column3,row0)
+ vext.16 d29, d20, d21, #3 @//extract a[3] (set1)
+ vmlal.u8 q12, d7, d1 @// a0 + a5 + 20a2 (column3,row0)
+ vmlal.u8 q12, d10, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0)
+ vmlsl.u8 q12, d4, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
+ vmlsl.u8 q12, d13, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
+
+ vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1/2 grid values
+
+ vqrshrun.s16 d17, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vext.16 d31, d21, d22, #1 @//extract a[5] (set1)
+ vqrshrun.s16 d18, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vext.16 d28, d20, d21, #1 @//extract a[1] (set1)
+
+ vaddl.s16 q13, d31, d20 @// a0 + a5 (set1)
+ vext.16 d31, d22, d23, #1 @//extract a[5] (set2)
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
+ vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
+ vext.16 d30, d21, d22, #2 @//extract a[2] (set2)
+
+ vqrshrun.s16 d19, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
+ vext.16 d29, d21, d22, #3 @//extract a[3] (set2)
+
+ vext.16 d28, d21, d22, #1 @//extract a[1] (set2)
+ vaddl.s16 q10, d31, d21 @// a0 + a5 (set2)
+ vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2)
+ vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2)
+ vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
+ vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
+ vext.16 d31, d23, d24, #1 @//extract a[5] (set3)
+
+ vext.8 d17, d17, d18, #2
+ vst1.8 {d18, d19}, [r11], r12 @// store row1 - 1,1/2 grid
+ vst1.8 {d17}, [r1], r12 @// store row1 - 1,1/2 grid
+
+ vext.16 d30, d22, d23, #2 @//extract a[2] (set3)
+ vext.16 d29, d22, d23, #3 @//extract a[3] (set3)
+
+ vaddl.s16 q9, d31, d22 @// a0 + a5 (set3)
+ vext.16 d28, d22, d23, #1 @//extract a[1] (set3)
+ vmlal.s16 q9, d30, d0[1] @// a0 + a5 + 20a2 (set3)
+ vmlal.s16 q9, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3)
+ vmlsl.s16 q9, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
+ vmlsl.s16 q9, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
+ vext.16 d31, d24, d25, #1 @//extract a[5] (set4)
+
+ vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2)
+ vext.16 d30, d23, d24, #2 @//extract a[2] (set4)
+ vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1)
+ vext.16 d29, d23, d24, #3 @//extract a[3] (set4)
+
+ vaddl.s16 q13, d31, d23 @// a0 + a5 (set4)
+ vext.16 d28, d23, d24, #1 @//extract a[1] (set4)
+ vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid
+ vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4)
+ vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4)
+ vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
+ vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
+ vext.16 d30, d24, d25, #2 @//extract a[2] (set5)
+
+ vaddl.s16 q11, d31, d24 @// a0 + a5 (set5)
+ vext.16 d29, d24, d25, #3 @//extract a[3] (set5)
+
+ vext.16 d31, d24, d25, #1 @//extract a[1] (set5)
+ vshrn.s32 d28, q9, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3)
+
+ vld1.8 {d17, d18, d19}, [r0], r3 @// Load next Row data
+ vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5)
+ vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5)
+ vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
+ vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
+ vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4)
+ vqrshrun.s16 d26, q10, #2 @// half,half grid set1,2
+
+
+ vqrshrun.s16 d27, q14, #2 @// half,half grid set3,4
+ vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ vqrshrun.s16 d28, q14, #2 @// half,half grid set5
+
+ vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1/2 grid values
+
+ subs r10, r10, #1 @//decrement loop counter
+
+ bne filter_2dvh_loop
+
+
+@// All 17 rows processed; restore registers and return
+ vpop {d8-d15}
+ ldmfd sp!, {r10, r11, r12, pc}
+
+filter_2dvh_skip_row:
+
+ vqrshrun.s16 d27, q14, #2 @// half,half grid set3,4
+ vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+ vqrshrun.s16 d28, q14, #2 @// half,half grid set5
+
+ vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1/2 grid values
+ vpop {d8-d15}
+ ldmfd sp!, {r10, r11, r12, pc}
+
+
+
+
diff --git a/encoder/arm/ih264e_platform_macros.h b/encoder/arm/ih264e_platform_macros.h
new file mode 100755
index 0000000..39cac96
--- /dev/null
+++ b/encoder/arm/ih264e_platform_macros.h
@@ -0,0 +1,143 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264e_platform_macros.h
+*
+* @brief
+* Contains platform specific routines used for codec context initialization
+*
+* @author
+* ittiam
+*
+* @remarks
+* none
+*
+*******************************************************************************
+*/
+
+#ifndef IH264E_PLATFORM_MACROS_H_
+#define IH264E_PLATFORM_MACROS_H_
+
+/*****************************************************************************/
+/* Extern Function Declarations */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the intra/inter/transform/deblk function pointers of
+* codec context
+*
+* @par Description: the current routine initializes the function pointers of
+* codec context based on the architecture in use
+*
+* @param[in] ps_codec
+* Codec context pointer
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264e_init_function_ptr_neon_a9q(codec_t *ps_codec);
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the intra/inter/transform/deblk function pointers of
+* codec context
+*
+* @par Description: the current routine initializes the function pointers of
+* codec context based on the architecture in use
+*
+* @param[in] ps_codec
+* Codec context pointer
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264e_init_function_ptr_neon_av8(codec_t *ps_codec);
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the intra/inter/transform/deblk function pointers of
+* codec context
+*
+* @par Description: the current routine initializes the function pointers of
+* codec context based on the architecture in use
+*
+* @param[in] ps_codec
+* Codec context pointer
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264e_init_function_ptr_generic(codec_t *ps_codec);
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the intra/inter/transform/deblk function pointers of
+* codec context
+*
+* @par Description: the current routine initializes the function pointers of
+* codec context based on the architecture in use
+*
+* @param[in] ps_codec
+* Codec context pointer
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264e_init_function_ptr(void *pv_codec);
+
+/**
+*******************************************************************************
+*
+* @brief Determine the architecture of the encoder executing environment
+*
+* @par Description: This routine returns the architecture of the environment
+* in which the current encoder is being executed
+*
+* @param[in] void
+*
+* @returns IV_ARCH_T
+* architecture
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+IV_ARCH_T ih264e_default_arch(void);
+
+#endif /* IH264E_PLATFORM_MACROS_H_ */
diff --git a/encoder/arm/ime_distortion_metrics_a9q.s b/encoder/arm/ime_distortion_metrics_a9q.s
new file mode 100755
index 0000000..b58911e
--- /dev/null
+++ b/encoder/arm/ime_distortion_metrics_a9q.s
@@ -0,0 +1,1353 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+
+@/**
+@******************************************************************************
+@*
+@*
+@* @brief
+@* This file contains definitions of routines that compute distortion
+@* between two macro/sub blocks of identical dimensions
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@* - ime_compute_sad_16x16_a9q()
+@* - ime_compute_sad_16x16_fast_a9q()
+@* - ime_compute_sad_16x8_a9q()
+@* - ime_compute_sad_16x16_ea8_a9q()
+@* - ime_calculate_sad2_prog_a9q()
+@* - ime_calculate_sad3_prog_a9q()
+@* - ime_calculate_sad4_prog_a9q()
+@* - ime_sub_pel_compute_sad_16x16_a9q()
+@* - ime_compute_satqd_16x16_lumainter_a9q()
+@* -
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@
+
+
+@/**
+@******************************************************************************
+@*
+@* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
+@*
+@* @par Description
+@* This function computes SAD between two 16x16 blocks by evaluating alternate
+@* rows and doubling the partial SAD. There is a provision for early exit if
+@* the SAD computed so far exceeds the maximum allowed SAD. To compute the
+@* distortion of the entire block, set u4_max_sad to USHRT_MAX.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] i4_max_sad
+@* integer maximum allowed distortion
+@*
+@* @param[in] pi4_mb_distortion
+@* integer evaluated sad
+@*
+@* @remarks
+@*
+@******************************************************************************
+@*/
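+
+@/* Illustrative reference (hedged C sketch, not the library's C code): the fast
+@ * variant below evaluates only alternate rows and doubles the partial SAD.
+@ * ABS is a hypothetical absolute-value macro.
+@ *
+@ *     UWORD32 sad = 0;
+@ *     WORD32 row, col;
+@ *     for(row = 0; row < 16; row += 2)
+@ *         for(col = 0; col < 16; col++)
+@ *             sad += ABS(pu1_src[row * src_strd + col] - pu1_dst[row * dst_strd + col]);
+@ *     *pi4_mb_distortion = (WORD32)(sad << 1);
+@ */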
+.text
+.p2align 2
+ .global ime_compute_sad_16x16_fast_a9q
+ime_compute_sad_16x16_fast_a9q:
+
+ stmfd sp!, {r12, lr}
+ lsl r2, r2, #1
+ lsl r3, r3, #1
+
+ @for bringing buffer2 into cache..., dummy load instructions
+ @ LDR r12,[r1]
+
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ mov r12, #6
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d6, d4
+ vabdl.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
+
+loop_sad_16x16_fast:
+
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r12, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
+
+ bne loop_sad_16x16_fast
+
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
+
+ ldr r12, [sp, #12]
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vshl.u32 d0, d0, #1
+ vst1.32 {d0[0]}, [r12]
+
+ ldmfd sp!, {r12, pc}
+
+
+
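+@ For reference, a minimal C-equivalent sketch of the fast SAD above, assuming
+@ the doubled strides sample alternate rows and the final shift scales the
+@ partial SAD back up; pu1_dst/dst_strd follow the parameter names used in the
+@ comment block and ABS is the codec's absolute-value macro:
+@
+@     WORD32 i, j, sad = 0;
+@     for(i = 0; i < 16; i += 2)                 /* alternate rows only        */
+@         for(j = 0; j < 16; j++)
+@             sad += ABS(pu1_src[i * src_strd + j] - pu1_dst[i * dst_strd + j]);
+@     *pi4_mb_distortion = sad << 1;             /* approximate full-block SAD */
+@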
+
+@/**
+@******************************************************************************
+@*
+@* @brief computes distortion (SAD) between 2 16x8 blocks
+@*
+@*
+@* @par Description
+@*  This function computes SAD between two 16x8 blocks. There is a provision
+@*  for early exit if the SAD computed so far exceeds the maximum allowed SAD.
+@*  To compute the distortion of the entire block, set u4_max_sad to USHRT_MAX.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] pu1_dst
+@*  UWORD8 pointer to the reference buffer
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] u4_max_sad
+@* integer maximum allowed distortion
+@*
+@* @param[out] pi4_mb_distortion
+@*  pointer to the computed SAD
+@*
+@* @remarks
+@*
+@******************************************************************************
+@*/
+@
+ .global ime_compute_sad_16x8_a9q
+ime_compute_sad_16x8_a9q:
+
+ stmfd sp!, {r12, lr}
+
+ @for bringing buffer2 into cache..., dummy load instructions
+ @LDR r12,[r1]
+
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ mov r12, #6
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d6, d4
+ vabdl.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
+
+loop_sad_16x8:
+
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r12, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
+
+ bne loop_sad_16x8
+
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
+
+ ldr r12, [sp, #12]
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vst1.32 {d0[0]}, [r12]
+
+ ldmfd sp!, {r12, pc}
+
+
+
+
+
+@/**
+@******************************************************************************
+@*
+@* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
+@*
+@* @par Description
+@*  This function computes SAD between two 16x16 blocks, with a provision for
+@*  early exit: the SAD over eight alternate rows is computed first and compared
+@*  against i4_max_sad before the remaining rows are processed. To compute the
+@*  distortion of the entire block, set i4_max_sad to USHRT_MAX.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] pu1_dst
+@*  UWORD8 pointer to the reference buffer
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] i4_max_sad
+@* integer maximum allowed distortion
+@*
+@* @param[out] pi4_mb_distortion
+@*  pointer to the computed SAD
+@*
+@* @remarks
+@*
+@******************************************************************************
+@*/
+ .global ime_compute_sad_16x16_ea8_a9q
+
+ime_compute_sad_16x16_ea8_a9q:
+
+ stmfd sp!, {r5-r7, lr}
+ lsl r2, r2, #1
+ lsl r3, r3, #1
+
+ @for bringing buffer2 into cache..., dummy load instructions
+ @LDR r12,[r1]
+
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ mov r5, #6
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d6, d4
+ vabdl.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
+ ldrd r6, r7, [sp, #16]
+ @r6 = i4_max_sad, r7 = pi4_mb_distortion
+
+loop_sad_16x16_ea8_1:
+
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r5, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
+
+ bne loop_sad_16x16_ea8_1
+
+ vabal.u8 q0, d10, d8
+ sub r0, r0, r2, lsl #3
+ vabal.u8 q1, d11, d9
+ sub r1, r1, r3, lsl #3
+
+ vadd.i16 q6, q0, q1
+ add r0, r0, r2, asr #1
+ vadd.i16 d12, d12, d13
+ add r1, r1, r3, asr #1
+
+ vpaddl.u16 d12, d12
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ vpaddl.u32 d12, d12
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+
+ vst1.32 {d12[0]}, [r7]
+ ldr r5, [r7]
+ cmp r5, r6
+ bgt end_func_16x16_ea8
+
+ vld1.8 {d10, d11}, [r1], r3
+ mov r5, #6
+
+loop_sad_16x16_ea8_2:
+
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r5, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
+
+ bne loop_sad_16x16_ea8_2
+
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
+
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+
+ vst1.32 {d0[0]}, [r7]
+
+end_func_16x16_ea8:
+
+ ldmfd sp!, {r5-r7, pc}
+
+
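+@ For reference, a rough C sketch of the early-exit structure above, assuming
+@ the first pass covers the even rows and the second pass the odd rows; the
+@ names follow the parameter list in the comment block:
+@
+@     WORD32 i, j, sad = 0;
+@     for(i = 0; i < 16; i += 2)                 /* even rows first            */
+@         for(j = 0; j < 16; j++)
+@             sad += ABS(pu1_src[i * src_strd + j] - pu1_dst[i * dst_strd + j]);
+@     *pi4_mb_distortion = sad;
+@     if(sad > i4_max_sad)                       /* early exit                 */
+@         return;
+@     for(i = 1; i < 16; i += 2)                 /* then the odd rows          */
+@         for(j = 0; j < 16; j++)
+@             sad += ABS(pu1_src[i * src_strd + j] - pu1_dst[i * dst_strd + j]);
+@     *pi4_mb_distortion = sad;
+@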
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : ime_calculate_sad2_prog_a9q()
+@//
+@// Detail Description : This function finds the SAD of the source MB against
+@//                      2 progressive reference MBs in one shot
+@//
+@// Platform : CortexA8/NEON .
+@//
+@//-----------------------------------------------------------------------------
+@*/
+
+ .global ime_calculate_sad2_prog_a9q
+
+ime_calculate_sad2_prog_a9q:
+
+ @ r0 = ref1 <UWORD8 *>
+ @ r1 = ref2 <UWORD8 *>
+ @ r2 = src <UWORD8 *>
+ @ r3 = RefBufferWidth <UWORD32>
+ @ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>
+
+ stmfd sp!, {r4-r5, lr}
+
+ ldr r4, [sp, #8] @ load src stride to r4
+ mov r5, #14
+
+ @Row 1
+ vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
+ vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
+ vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
+
+ @Row 2
+ vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
+ vabdl.u8 q6, d2, d0
+ vabdl.u8 q7, d3, d1
+ vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
+ vabdl.u8 q8, d4, d0
+ vabdl.u8 q9, d5, d1
+ vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
+
+loop_sad2_prog:
+
+ subs r5, #2
+ @Row 1
+ vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
+ vabal.u8 q6, d8, d6
+ vabal.u8 q7, d9, d7
+ vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
+ vabal.u8 q8, d10, d6
+ vabal.u8 q9, d11, d7
+ vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
+
+ @Row 2
+ vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
+ vabal.u8 q6, d2, d0
+ vabal.u8 q7, d3, d1
+ vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
+ vabal.u8 q8, d4, d0
+ vabal.u8 q9, d5, d1
+ vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
+
+ bne loop_sad2_prog
+
+ vabal.u8 q6, d8, d6
+ vabal.u8 q7, d9, d7
+ vabal.u8 q8, d10, d6
+ vabal.u8 q9, d11, d7
+
+ @ Compute SAD
+
+ vadd.u16 q6, q6, q7 @ Q6 : sad_ref1
+ vadd.u16 q8, q8, q9 @ Q8 : sad_ref2
+
+ vadd.u16 d12, d12, d13
+ ldr r5, [sp, #16] @ loading pi4_sad to r5
+ vadd.u16 d16, d16, d17
+
+ vpadd.u16 d12, d12, d16
+ vpaddl.u16 d12, d12
+
+ vst1.64 {d12}, [r5]!
+
+ ldmfd sp!, {r4-r5, pc}
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : ime_calculate_sad3_prog_a9q()
+@//
+@// Detail Description : This function finds the SAD of the source MB against
+@//                      3 progressive reference MBs in one shot
+@//
+@// Platform : CortexA8/NEON .
+@//
+@//-----------------------------------------------------------------------------
+@*/
+
+ .global ime_calculate_sad3_prog_a9q
+
+ime_calculate_sad3_prog_a9q:
+
+ @ r0 = ref1 <UWORD8 *>
+ @ r1 = ref2 <UWORD8 *>
+ @ r2 = ref3 <UWORD8 *>
+ @ r3 = src <UWORD8 *>
+ @ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>
+
+
+ stmfd sp!, {r4-r6, lr}
+
+ ldrd r4, r5, [sp, #16] @ load ref stride to r4, src stride to r5
+ mov r6, #14
+
+ @ Row 1
+ vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
+ vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
+ vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
+ vabdl.u8 q8, d2, d0
+ vabdl.u8 q9, d3, d1
+ vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
+ vabdl.u8 q10, d4, d0
+ vabdl.u8 q11, d5, d1
+
+ @ Row 2
+ vld1.8 {d8, d9}, [r3], r5 @ load src Row 1
+ vabdl.u8 q12, d6, d0
+ vabdl.u8 q13, d7, d1
+ vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1
+ vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1
+ vabal.u8 q8, d10, d8
+ vabal.u8 q9, d11, d9
+ vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1
+ vabal.u8 q10, d12, d8
+ vabal.u8 q11, d13, d9
+
+loop_sad3_prog:
+
+ @Row 1
+ vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
+ vabal.u8 q12, d14, d8
+ vabal.u8 q13, d15, d9
+ vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
+ vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
+ vabal.u8 q8, d2, d0
+ vabal.u8 q9, d3, d1
+ vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
+ vabal.u8 q10, d4, d0
+ vabal.u8 q11, d5, d1
+
+ @Row 2
+ vld1.8 {d8, d9}, [r3], r5 @ load src Row 1
+ vabal.u8 q12, d6, d0
+ vabal.u8 q13, d7, d1
+ vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1
+ subs r6, #2
+ vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1
+ vabal.u8 q8, d10, d8
+ vabal.u8 q9, d11, d9
+ vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1
+ vabal.u8 q10, d12, d8
+ vabal.u8 q11, d13, d9
+
+ bne loop_sad3_prog
+
+ vabal.u8 q12, d14, d8
+ vabal.u8 q13, d15, d9
+
+ @ Compute SAD
+
+ vadd.u16 q8, q8, q9 @ Q8 : sad_ref1
+ vadd.u16 q10, q10, q11 @ Q10 : sad_ref2
+ vadd.u16 q12, q12, q13 @ Q12 : sad_ref3
+
+ vadd.u16 d16, d16, d17
+ vadd.u16 d20, d20, d21
+ vadd.u16 d24, d24, d25
+
+ vpadd.u16 d16, d16, d20
+ vpadd.u16 d24, d24, d24
+
+ ldr r6, [sp, #24] @ loading pi4_sad to r6
+ vpaddl.u16 d16, d16
+ vpaddl.u16 d24, d24
+
+ vst1.64 {d16}, [r6]!
+ vst1.32 {d24[0]}, [r6]
+
+ ldmfd sp!, {r4-r6, pc}
+
+
+
+@/**
+@******************************************************************************
+@*
+@* @brief computes distortion (SAD) for sub-pel motion estimation
+@*
+@* @par Description
+@*  This function computes the SAD at all 8 half-pel points around the full-pel position
+@*
+@* @param[out] pi4_sad
+@* integer evaluated sad
+@* pi4_sad[0] - half x
+@* pi4_sad[1] - half x - 1
+@* pi4_sad[2] - half y
+@* pi4_sad[3] - half y - 1
+@* pi4_sad[4] - half xy
+@* pi4_sad[5] - half xy - 1
+@* pi4_sad[6] - half xy - strd
+@* pi4_sad[7] - half xy - 1 - strd
+@*
+@* @remarks
+@*
+@******************************************************************************
+@*/
+
+.text
+.p2align 2
+
+ .global ime_sub_pel_compute_sad_16x16_a9q
+
+ime_sub_pel_compute_sad_16x16_a9q:
+
+ stmfd sp!, {r4-r11, lr} @store register values to stack
+
+ ldr r9, [sp, #36]
+ ldr r10, [sp, #40]
+
+ sub r4, r1, #1 @ x left
+ sub r5, r2, r10 @ y top
+
+ sub r6, r3, #1 @ xy left
+ sub r7, r3, r10 @ xy top
+
+ sub r8, r7, #1 @ xy top-left
+ mov r11, #15
+
+ @for bringing buffer2 into cache..., dummy load instructions
+ @ LDR r12,[r1]
+ @ LDR r12,[sp,#12]
+
+ vld1.8 {d0, d1}, [r0], r9 @ src
+ vld1.8 {d2, d3}, [r5], r10 @ y top LOAD
+ vld1.8 {d4, d5}, [r7], r10 @ xy top LOAD
+ vld1.8 {d6, d7}, [r8], r10 @ xy top-left LOAD
+
+ vabdl.u8 q6, d2, d0 @ y top ABS1
+ vabdl.u8 q7, d4, d0 @ xy top ABS1
+ vld1.8 {d8, d9}, [r1], r10 @ x LOAD
+ vabdl.u8 q8, d6, d0 @ xy top-left ABS1
+ vabdl.u8 q9, d8, d0 @ x ABS1
+ vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
+
+ vabal.u8 q6, d3, d1 @ y top ABS2
+ vabal.u8 q7, d5, d1 @ xy top ABS2
+ vld1.8 {d2, d3}, [r2], r10 @ y LOAD
+ vabal.u8 q8, d7, d1 @ xy top-left ABS2
+ vabal.u8 q9, d9, d1 @ x ABS2
+ vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
+
+ vabdl.u8 q10, d10, d0 @ x left ABS1
+ vabdl.u8 q11, d2, d0 @ y ABS1
+ vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
+ vabdl.u8 q12, d4, d0 @ xy ABS1
+ vabdl.u8 q13, d6, d0 @ xy left ABS1
+
+loop_sub_pel_16x16:
+
+ vabal.u8 q10, d11, d1 @ x left ABS2
+ vabal.u8 q11, d3, d1 @ y ABS2
+ subs r11, #1
+ vabal.u8 q12, d5, d1 @ xy ABS2
+ vabal.u8 q13, d7, d1 @ xy left ABS2
+
+ vld1.8 {d0, d1}, [r0], r9 @ src
+ vabal.u8 q6, d2, d0 @ y top ABS1
+ vabal.u8 q7, d4, d0 @ xy top ABS1
+ vld1.8 {d8, d9}, [r1], r10 @ x LOAD
+ vabal.u8 q8, d6, d0 @ xy top-left ABS1
+ vabal.u8 q9, d8, d0 @ x ABS1
+ vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
+
+ vabal.u8 q6, d3, d1 @ y top ABS2
+ vabal.u8 q7, d5, d1 @ xy top ABS2
+ vld1.8 {d2, d3}, [r2], r10 @ y LOAD
+ vabal.u8 q8, d7, d1 @ xy top-left ABS2
+ vabal.u8 q9, d9, d1 @ x ABS2
+ vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
+
+ vabal.u8 q10, d10, d0 @ x left ABS1
+ vabal.u8 q11, d2, d0 @ y ABS1
+ vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
+ vabal.u8 q12, d4, d0 @ xy ABS1
+ vabal.u8 q13, d6, d0 @ xy left ABS1
+
+ bne loop_sub_pel_16x16
+
+ vabal.u8 q10, d11, d1 @ x left ABS2
+ vabal.u8 q11, d3, d1 @ y ABS2
+ vabal.u8 q12, d5, d1 @ xy ABS2
+ vabal.u8 q13, d7, d1 @ xy left ABS2
+
+ vadd.i16 d0, d18, d19 @ x
+ vadd.i16 d3, d12, d13 @ y top
+ vadd.i16 d6, d14, d15 @ xy top
+ vadd.i16 d5, d26, d27 @ xy left
+ vadd.i16 d1, d20, d21 @ x left
+ vadd.i16 d2, d22, d23 @ y
+ vadd.i16 d4, d24, d25 @ xy
+ vadd.i16 d7, d16, d17 @ xy top left
+
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d2, d2, d3
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d6, d6, d7
+
+ vpaddl.u16 d0, d0
+ vpaddl.u16 d2, d2
+ ldr r11, [sp, #44]
+ vpaddl.u16 d4, d4
+ vpaddl.u16 d6, d6
+
+ vst1.32 {d0}, [r11]!
+ vst1.32 {d2}, [r11]!
+ vst1.32 {d4}, [r11]!
+ vst1.32 {d6}, [r11]!
+
+ ldmfd sp!, {r4-r11, pc} @Restoring registers from stack
+
+
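+@ For reference, the eight candidate positions evaluated above can be expressed
+@ in C roughly as below; pu1_hpel_x/y/xy stand for the three half-pel planes
+@ passed in r1-r3 and ref_strd for the half-pel stride (illustrative names, not
+@ taken from this file):
+@
+@     UWORD8 *cand[8];
+@     cand[0] = pu1_hpel_x;                      /* half x                     */
+@     cand[1] = pu1_hpel_x  - 1;                 /* half x - 1                 */
+@     cand[2] = pu1_hpel_y;                      /* half y                     */
+@     cand[3] = pu1_hpel_y  - ref_strd;          /* half y - 1 (row above)     */
+@     cand[4] = pu1_hpel_xy;                     /* half xy                    */
+@     cand[5] = pu1_hpel_xy - 1;                 /* half xy - 1                */
+@     cand[6] = pu1_hpel_xy - ref_strd;          /* half xy - strd             */
+@     cand[7] = pu1_hpel_xy - 1 - ref_strd;      /* half xy - 1 - strd         */
+@     /* the 16x16 SAD of the source against cand[i] is written to pi4_sad[i]  */
+@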
+
+@/**
+@******************************************************************************
+@*
+@* @brief computes distortion (SAD) between 2 16x16 blocks
+@*
+@* @par Description
+@*  This function computes SAD between two 16x16 blocks. There is a provision
+@*  for early exit if the SAD computed so far exceeds the maximum allowed SAD.
+@*  To compute the distortion of the entire block, set i4_max_sad to USHRT_MAX.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] pu1_dst
+@*  UWORD8 pointer to the reference buffer
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] i4_max_sad
+@* integer maximum allowed distortion
+@*
+@* @param[out] pi4_mb_distortion
+@*  pointer to the computed SAD
+@*
+@* @remarks
+@*
+@******************************************************************************
+@*/
+
+.text
+.p2align 2
+
+ .global ime_compute_sad_16x16_a9q
+
+ime_compute_sad_16x16_a9q:
+
+
+ @STMFD sp!,{r12,lr}
+ stmfd sp!, {r12, r14} @store register values to stack
+
+ @for bringing buffer2 into cache..., dummy load instructions
+ @ LDR r12,[r1]
+ @ LDR r12,[sp,#12]
+
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+
+ mov r12, #14
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d4, d6
+ vld1.8 {d10, d11}, [r1], r3
+ vabdl.u8 q1, d5, d7
+
+loop_sad_16x16:
+
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d8, d10
+ vld1.8 {d6, d7}, [r1], r3
+ vabal.u8 q1, d9, d11
+
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d4, d6
+ subs r12, #2
+ vld1.8 {d10, d11}, [r1], r3
+ vabal.u8 q1, d5, d7
+
+ bne loop_sad_16x16
+
+ vabal.u8 q0, d8, d10
+ vabal.u8 q1, d9, d11
+
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
+ ldr r12, [sp, #12]
+
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vst1.32 {d0[0]}, [r12]
+
+ ldmfd sp!, {r12, pc} @Restoring registers from stack
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : ime_calculate_sad4_prog_a9q()
+@//
+@// Detail Description : This function finds the SAD of the source MB against
+@//                      its 4 neighbouring progressive reference MBs (left,
+@//                      right, top and bottom) in one shot
+@//
+@// Platform : CortexA8/NEON .
+@//
+@//-----------------------------------------------------------------------------
+@*/
+
+ .global ime_calculate_sad4_prog_a9q
+
+ime_calculate_sad4_prog_a9q:
+ @ r0 = temp_frame <UWORD8 *>
+ @ r1 = buffer_ptr <UWORD8 *>
+ @ r2 = RefBufferWidth <UWORD32>
+ @ r3 = CurBufferWidth <UWORD32>
+ @ stack = psad <UWORD32 *> {at 0x34}
+
+ stmfd sp!, {r4-r7, lr}
+
+ @UWORD8 *left_ptr = temp_frame - 1;
+ @UWORD8 *right_ptr = temp_frame + 1;
+ @UWORD8 *top_ptr = temp_frame - RefBufferWidth;
+ @UWORD8 *bot_ptr = temp_frame + RefBufferWidth;
+
+ mov r7, #14
+ sub r4, r0, #0x01 @r4 = left_ptr
+ add r5, r0, #0x1 @r5 = right_ptr
+ sub r6, r0, r2 @r6 = top_ptr
+ add r0, r0, r2 @r0 = bot_ptr
+ @r1 = buffer_ptr
+
+ @D0:D1 : buffer
+ @D2:D3 : top
+ @D4:D5 : left
+ @D6:D7 : right
+ @D8:D9 : bottom
+
+ @Row 1
+ vld1.8 {d0, d1}, [r1], r3 @ load src Row 1
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 1
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 1
+
+ vabdl.u8 q5, d2, d0
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 1
+ vabdl.u8 q6, d3, d1
+
+ vabdl.u8 q7, d0, d4
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1
+ vabdl.u8 q8, d1, d5
+
+ @Row 2
+ vabdl.u8 q9, d0, d6
+ vld1.8 {d26, d27}, [r1], r3 @ load src Row 2
+ vabdl.u8 q10, d1, d7
+
+ vabdl.u8 q11, d0, d8
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 2
+ vabdl.u8 q12, d1, d9
+
+loop_sad4_prog:
+
+ vabal.u8 q5, d26, d2
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 2
+ vabal.u8 q6, d27, d3
+
+ vabal.u8 q7, d26, d4
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 2
+ vabal.u8 q8, d27, d5
+
+ vabal.u8 q9, d26, d6
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2
+ vabal.u8 q10, d27, d7
+
+ @Row 1
+ vabal.u8 q11, d26, d8
+ vld1.8 {d0, d1}, [r1], r3 @ load src Row 1
+ vabal.u8 q12, d27, d9
+
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 1
+ subs r7, #2
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 1
+
+ vabal.u8 q5, d0, d2
+
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 1
+ vabal.u8 q6, d1, d3
+
+ vabal.u8 q7, d0, d4
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1
+ vabal.u8 q8, d1, d5
+
+ @Row 2
+ vabal.u8 q9, d0, d6
+ vld1.8 {d26, d27}, [r1], r3 @ load src Row 2
+ vabal.u8 q10, d1, d7
+
+ vabal.u8 q11, d0, d8
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 2
+ vabal.u8 q12, d1, d9
+
+ bne loop_sad4_prog
+
+ vabal.u8 q5, d26, d2
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 2
+ vabal.u8 q6, d27, d3
+
+ vabal.u8 q7, d26, d4
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 2
+ vabal.u8 q8, d27, d5
+
+ vabal.u8 q9, d26, d6
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2
+ vabal.u8 q10, d27, d7
+
+ vabal.u8 q11, d26, d8
+ vabal.u8 q12, d27, d9
+
+ @;Q5:Q6 : sad_top
+ @;Q7:Q8 : sad_left
+ @;Q9:Q10 : sad_right
+ @;Q11:Q12 : sad_bot
+
+ vadd.u16 q5, q5, q6
+ vadd.u16 q7, q7, q8
+ vadd.u16 q9, q9, q10
+ vadd.u16 q11, q11, q12
+
+ @; Free :-
+ @; Q6,Q8,Q10,Q12
+
+ @;Q5 -> D10:D11
+ @;Q7 -> D14:D15
+ @;Q9 -> D18:D19
+ @;Q11 -> D22:D23
+
+ vadd.u16 d10, d10, d11
+ vadd.u16 d14, d14, d15
+ vadd.u16 d18, d18, d19
+ vadd.u16 d22, d22, d23
+
+ @;D10 : sad_top
+ @;D14 : sad_left
+ @;D18 : sad_right
+ @;D22 : sad_bot
+
+
+ vpaddl.u16 d11, d10
+ vpaddl.u16 d15, d14
+ vpaddl.u16 d19, d18
+ vpaddl.u16 d23, d22
+
+ @;D11 : sad_top
+ @;D15 : sad_left
+ @;D19 : sad_right
+ @;D23 : sad_bot
+
+ vpaddl.u32 d10, d11
+ vpaddl.u32 d22, d23
+ vpaddl.u32 d14, d15
+ vpaddl.u32 d18, d19
+
+ @;D10 : sad_top
+ @;D14 : sad_left
+ @;D18 : sad_right
+ @;D22 : sad_bot
+
+ ldr r4, [sp, #20] @;Can be rearranged
+
+ vsli.64 d10, d22, #32
+ vsli.64 d14, d18, #32
+
+ vst1.64 {d14}, [r4]!
+ vst1.64 {d10}, [r4]!
+
+ ldmfd sp!, {r4-r7, pc}
+
+
+
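+@ For reference, a rough C equivalent of the above; buffer_ptr is the source MB
+@ and the four references are the left/right/top/bottom neighbours of
+@ temp_frame (the stores above suggest psad is filled in the order left, right,
+@ top, bottom); the loop variables and sad_* names are illustrative:
+@
+@     UWORD8 *left = temp_frame - 1,         *right = temp_frame + 1;
+@     UWORD8 *top  = temp_frame - ref_strd,  *bot   = temp_frame + ref_strd;
+@     for(i = 0; i < 16; i++)
+@         for(j = 0; j < 16; j++)
+@         {
+@             UWORD8 src = buffer_ptr[i * cur_strd + j];
+@             sad_left  += ABS(src - left [i * ref_strd + j]);
+@             sad_right += ABS(src - right[i * ref_strd + j]);
+@             sad_top   += ABS(src - top  [i * ref_strd + j]);
+@             sad_bot   += ABS(src - bot  [i * ref_strd + j]);
+@         }
+@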
+
+@*****************************************************************************
+@*
+@* Function Name     : ime_compute_satqd_16x16_lumainter_a9q
+@* Description       : This function computes SAD for a 16x16 block.
+@                     : It also computes whether any 4x4 block will have a nonzero coefficient after transform and quantization
+@
+@ Arguments : R0 :pointer to src buffer
+@ R1 :pointer to est buffer
+@ R2 :source stride
+@ R3 :est stride
+@                       STACK : Threshold, distortion, is_nonzero
+@*
+@* Values Returned : NONE
+@*
+@* Register Usage : R0-R11
+@* Stack Usage :
+@* Cycles : Around
+@* Interruptibility  : Interruptible
+@*
+@* Known Limitations
+@* \Assumptions :
+@*
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 14 04 2014 Harinarayanan K K First version
+@*
+@*****************************************************************************
+ .global ime_compute_satqd_16x16_lumainter_a9q
+ime_compute_satqd_16x16_lumainter_a9q:
+ @R0 :pointer to src buffer
+ @R1 :pointer to est buffer
+ @R2 :Source stride
+ @R3 :Pred stride
+ @R4 :Threshold pointer
+ @R5 :Distortion,ie SAD
+ @R6 :is nonzero
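+    @Overview: the 16x16 block is handled as eight 4x8 strips (counted in r8).
+    @For each pair of 4x4 sub-blocks, sums derived from the SAD are compared
+    @against the threshold table at r4; if any comparison fails, r7 is made
+    @non-zero and the remaining strips are handled by the plain SAD path
+    @(compute_sad_only). The accumulated SAD is returned through the stack args.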
+
+ push {r4-r12, lr} @push all the variables first
+    @ADD SP,SP,#40 ;decrement stack pointer, to accommodate two variables
+ ldr r4, [sp, #40] @load the threshold address
+
+ mov r8, #8 @Number of 4x8 blocks to be processed
+ mov r10, #0 @Sad
+ mov r7, #0 @Nonzero info
+ @----------------------------------------------------
+
+ vld1.u8 d30, [r0], r2 @I load 8 pix src row 1
+
+ vld1.u8 d31, [r1], r3 @I load 8 pix pred row 1
+
+ vld1.u8 d28, [r0], r2 @I load 8 pix src row 2
+
+ vld1.u8 d29, [r1], r3 @I load 8 pix pred row 2
+
+ vld1.u8 d26, [r0], r2 @I load 8 pix src row 3
+ vabdl.u8 q0, d30, d31 @I Abs diff r1 blk 12
+
+ vld1.u8 d27, [r1], r3 @I load 8 pix pred row 3
+
+ vld1.u8 d24, [r0], r2 @I load 8 pix src row 4
+
+ vld1.u8 d25, [r1], r3 @I load 8 pix pred row 4
+ vabdl.u8 q1, d28, d29 @I Abs diff r1 blk 12
+
+    vld1.u16 {q11}, [r4]           @I load the threshold
+ vabdl.u8 q2, d26, d27 @I Abs diff r1 blk 12
+
+ vabdl.u8 q3, d24, d25 @I Abs diff r1 blk 12
+
+
+
+core_loop:
+ @S1 S2 S3 S4 A1 A2 A3 A4
+ @S5 S6 S7 S8 A5 A6 A7 A8
+ @S9 S10 S11 S12 A9 A10 A11 A12
+ @S13 S14 S15 S16 A13 A14 A15 A16
+ ands r11, r8, #1 @II See if we are at even or odd block
+ vadd.u16 q4 , q0, q3 @I Add r1 r4
+ lsl r11, r2, #2 @II Move back src 4 rows
+
+ subeq r0, r0, r11 @II Move back src 4 rows if we are at even block
+ vadd.u16 q5 , q1, q2 @I Add r2 r3
+ addeq r0, r0, #8 @II Move src 8 cols forward if we are at even block
+
+ lsl r11, r3, #2 @II Move back pred 4 rows
+    vtrn.16 d8 , d10               @I transpose 1
+ subeq r1, r1, r11 @II Move back pred 4 rows if we are at even block
+
+ addeq r1, r1, #8 @II Move pred 8 cols forward if we are at even block
+    vtrn.16 d9 , d11               @I transpose 2
+ subne r0, r0, #8 @II Src 8clos back for odd rows
+
+ subne r1, r1, #8 @II Pred 8 cols back for odd rows
+    vtrn.32 d10, d11               @I transpose 4
+
+
+    vtrn.32 d8 , d9                @I transpose 3
+ vswp d10, d11 @I rearrange so that the q4 and q5 add properly
+ @D8 S1 S4 A1 A4
+ @D9 S2 S3 A2 A3
+ @D11 S1 S4 A1 A4
+ @D10 S2 S3 A2 A3
+
+ vadd.s16 q6, q4, q5 @I Get s1 s4
+ vld1.u8 d30, [r0], r2 @II load first 8 pix src row 1
+
+ vtrn.s16 d12, d13 @I Get s2 s3
+ @D12 S1 S4 A1 A4
+ @D13 S2 S3 A2 A3
+
+ vshl.s16 q7, q6 , #1 @I si = si<<1
+ vld1.u8 d31, [r1], r3 @II load first 8 pix pred row 1
+
+ vpadd.s16 d16, d12, d13 @I (s1 + s4) (s2 + s3)
+ vld1.u8 d28, [r0], r2 @II load first 8 pix src row 2
+ @ D16 S14 A14 S23 A23
+ vrev32.16 d0, d16 @I
+ vuzp.s16 d16, d0 @I
+ @D16 S14 S23 A14 A23
+ vadd.s16 d17, d12, d13 @I (s1 + s2) (s3 + s4)
+ vld1.u8 d29, [r1], r3 @II load first 8 pix pred row 2
+ @D17 S12 S34 A12 A34
+
+ vrev32.16 q9, q7 @I Rearrange si's
+ @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
+
+ @D12 S1 S4 A1 A4
+ @D19 Z3 Z2 Y3 Y2
+ vsub.s16 d8, d12, d19 @I (s1 - (s3<<1)) (s4 - (s2<<1))
+ vld1.u8 d26, [r0], r2 @II load first 8 pix src row 3
+ @D13 S2 S3 A2 A3
+ @D18 Z4 Z1 Y4 Y1
+ vsub.s16 d9, d13, d18 @I (s2 - (s4<<1)) (s3 - (s1<<1))
+ vld1.u8 d27, [r1], r3 @II load first 8 pix pred row 3
+ @Q10 S8 S5 A8 A5 S7 S4 A7 A4
+
+ @D16 S14 S23 A14 A23
+ vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
+ vld1.u8 d24, [r0], r2 @II load first 8 pix src row 4
+ @D22 SAD1 SAD2 junk junk
+
+
+ @Q8 S2 S1 A2 A1 S6 S3 A6 A3
+ @Q10 S8 S5 A8 A5 S7 S4 A7 A4
+    vtrn.32 q8, q4                 @I Rearrange to keep the LS terms of each block together
+ @Q8 S2 S1 S8 S5 S6 S3 S7 S4
+ @Q10 A2 A1 A8 A5 A6 A3 A7 A4
+
+
+ ldrh r11, [r4, #16] @I Load the threshold for DC val blk 1
+ vdup.s16 q6, d10[0] @I Get the sad blk 1
+ vabdl.u8 q0, d30, d31 @II Abs diff r1 blk 12
+
+ vshl.s16 q7, q6, #1 @I sad_2 = sad_1<<1
+ vmov.s16 r9, d10[0] @I Get the sad for block 1
+
+    vsub.s16 q9, q7, q8            @I Subtract the LS terms from 2*sad (blk 1)
+    vmov.s16 r5, d10[1]            @I Get the sad for block 2
+
+    vcle.s16 q7, q11, q9           @I Compare the thresholds against (2*sad - ls) (blk 1)
+ vld1.u8 d25, [r1], r3 @II load first 8 pix pred row 4
+
+    vdup.s16 q15, d10[1]           @I Get the sad blk 2
+ vabdl.u8 q1, d28, d29 @II Abs diff r1 blk 12
+
+
+ vshl.s16 q14, q15, #1 @I sad_2 = sad_1<<1
+    vsub.s16 q3, q14, q4           @I Subtract the LS terms from 2*sad (blk 2)
+    vcle.s16 q15, q11, q3          @I Compare the thresholds against (2*sad - ls) (blk 2)
+
+ ADD R10, R10, R9 @I Add to the global sad blk 1
+ vtrn.u8 q15, q7 @I get all comparison bits to one reg
+ vabdl.u8 q2, d26, d27 @II Abs diff r1 blk 12
+
+ ADD R10, R10, R5 @I Add to the global sad blk 2
+ vshr.u8 q14, q15, #7 @I Shift the bits so that no overflow occurs
+    cmp r11, r9                    @I Compare with threshold blk 1
+
+    movle r7, #0xf                 @I If not met, mark it by moving a non-zero value to R7 blk 1
+ vadd.u8 d28, d28, d29 @I Add the bits
+ cmp r11, r5 @I Compare with threshold blk 2
+
+    movle r7, #0xf                 @I If not met, mark it by moving a non-zero value to R7 blk 2
+ vpadd.u8 d28, d28, d29 @I Add the bits
+
+    vmov.u32 r11, d28[0]           @I Since a set bit now represents an unsatisfied condition, store it in r11
+ vabdl.u8 q3, d24, d25 @II Abs diff r1 blk 12
+
+    orr r7, r7, r11                @I Accumulate the comparison bits into r7
+
+
+    sub r8, r8, #1                 @I Decrement block count
+
+    cmp r7, #0                     @I If we have at least one non-zero block
+    bne compute_sad_only           @I If a non-zero block is there, from now on compute SAD only
+
+    cmp r8, #1                     @I See if we are at the last block
+    bne core_loop                  @I If the blocks are zero, let's continue the SATQD
+
+
+    @EPILOGUE for the core loop
+ @S1 S2 S3 S4 A1 A2 A3 A4
+ @S5 S6 S7 S8 A5 A6 A7 A8
+ @S9 S10 S11 S12 A9 A10 A11 A12
+ @S13 S14 S15 S16 A13 A14 A15 A16
+ vadd.u16 q4 , q0, q3 @Add r1 r4
+ vadd.u16 q5 , q1, q2 @Add r2 r3
+ @D8 S1 S2 S2 S1
+ @D10 S4 S3 S3 S4
+ @D9 A1 A2 A2 A1
+ @D11 A4 A3 A3 A4
+    vtrn.16 d8 , d10               @I transpose 1
+    vtrn.16 d9 , d11               @I transpose 2
+    vtrn.32 d8 , d9                @I transpose 3
+    vtrn.32 d10, d11               @I transpose 4
+
+ vswp d10, d11 @I rearrange so that the q4 and q5 add properly
+ @D8 S1 S4 A1 A4
+ @D9 S2 S3 A2 A3
+ @D11 S1 S4 A1 A4
+ @D10 S2 S3 A2 A3
+ vadd.s16 q6, q4, q5 @Get s1 s4
+ vtrn.s16 d12, d13 @Get s2 s3
+ @D12 S1 S4 A1 A4
+ @D13 S2 S3 A2 A3
+
+ vshl.s16 q7, q6 , #1 @si = si<<1
+ vmov.s16 r9, d10[0] @Get the sad for block 1
+
+ vpadd.s16 d16, d12, d13 @(s1 + s4) (s2 + s3)
+ vmov.s16 r5, d10[1] @Get the sad for block 2
+ @D16 S14 A14 S23 A23
+ vrev32.16 d30, d16 @
+ vuzp.s16 d16, d30 @
+ @D16 S14 S23 A14 A23
+ vadd.s16 d17, d12, d13 @(s1 + s2) (s3 + s4)
+ @D17 S12 S34 A12 A34
+
+ vrev32.16 q9, q7 @Rearrange si's
+ @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
+
+ @D12 S1 S4 A1 A4
+ @D19 Z3 Z2 Y3 Y2
+ vsub.s16 d8, d12, d19 @(s1 - (s3<<1)) (s4 - (s2<<1))
+ @D13 S2 S3 A2 A3
+ @D18 Z4 Z1 Y4 Y1
+ vsub.s16 d9, d13, d18 @(s2 - (s4<<1)) (s3 - (s1<<1))
+ @Q10 S8 S5 A8 A5 S7 S4 A7 A4
+
+ @D16 S14 S23 A14 A23
+ vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
+ @D22 SAD1 SAD2 junk junk
+ vmov.u16 r9, d10[0] @Get the sad for block 1
+ vmov.u16 r5, d10[1] @Get the sad for block 2
+
+ @Q8 S2 S1 A2 A1 S6 S3 A6 A3
+ @Q10 S8 S5 A8 A5 S7 S4 A7 A4
+ ldrh r11, [r4, #16] @Load the threshold for DC val blk 1
+    vtrn.32 q8, q4                 @Rearrange to keep the LS terms of each block together
+ ADD R10, R10, R9 @Add to the global sad blk 1
+
+ @Q8 S2 S1 S8 S5 S6 S3 S7 S4
+ @Q10 A2 A1 A8 A5 A6 A3 A7 A4
+
+    vld1.u16 {q11}, [r4]           @load the threshold
+ ADD R10, R10, R5 @Add to the global sad blk 2
+
+ vdup.u16 q6, d10[0] @Get the sad blk 1
+
+ cmp r11, r9 @Compare with threshold blk 1
+ vshl.u16 q7, q6, #1 @sad_2 = sad_1<<1
+
+    vsub.s16 q9, q7, q8            @Subtract the LS terms from 2*sad (blk 1)
+
+    vcle.s16 q15, q11, q9          @Compare the thresholds against (2*sad - ls) (blk 1)
+    movle r7, #0xf                 @If not met, mark it by moving a non-zero value to R7 blk 1
+
+ cmp r11, r5 @Compare with threshold blk 2
+    vdup.u16 q14, d10[1]           @Get the sad blk 2
+
+ vshl.u16 q13, q14, #1 @sad_2 = sad_1<<1
+    vsub.s16 q12, q13, q4          @Subtract the LS terms from 2*sad (blk 2)
+    vcle.s16 q14, q11, q12         @Compare the thresholds against (2*sad - ls) (blk 2)
+    movle r7, #0xf                 @If not met, mark it by moving a non-zero value to R7 blk 2
+
+ vtrn.u8 q14, q15 @get all comparison bits to one reg
+ vshr.u8 q14, q14, #7 @Shift the bits so that no overflow occurs
+ vadd.u8 d28, d28, d29 @Add the bits
+ vpadd.u8 d28, d28, d29 @Add the bits
+    vmov.u32 r11, d28[0]           @Since a set bit now represents an unsatisfied condition, store it in r11
+    orr r7, r7, r11                @Accumulate the comparison bits into r7
+
+    b funcend_sad_16x16            @Since all blocks are processed now, go to the end
+
+compute_sad_only: @This block computes SAD only, so will be lighter
+                                   @It will start processing at an odd block
+                                   @It will compute SAD for the odd block,
+                                   @and then for two blocks at a time
+                                   @The counter is r8, hence r8 blocks will be processed
+
+ and r11, r8, #1 @Get the last bit of counter
+ cmp r11, #0 @See if we are at even or odd block
+    @If the block is even we just have to set the pointer to the
+ @start of current row
+
+ lsleq r11, r2, #2 @I Move back src 4 rows
+ subeq r0, r0, r11 @I Move back src 4 rows if we are at even block
+
+ lsleq r11, r3, #2 @I Move back pred 4 rows
+ subeq r1, r1, r11 @I Move back pred 4 rows if we are at even block
+ @ADDEQ R8,R8,#2 ;Inc counter
+    beq skip_odd_blk               @If the block count is even, skip the odd-block SAD below
+
+
+ vadd.u16 q4, q0, q1 @Add SAD of row1 and row2
+ vadd.u16 q5, q2, q3 @Add SAD of row3 and row4
+ vadd.u16 q6, q4, q5 @Add SAD of row 1-4
+ vadd.u16 d14, d12, d13 @Add Blk1 and blk2
+ vpadd.u16 d16, d14, d15 @Add col 1-2 and 3-4
+ vpadd.u16 d18, d16, d17 @Add col 12-34
+
+ vmov.u16 r9, d18[0] @Move sad to arm
+ ADD R10, R10, R9 @Add to the global sad
+
+    sub r8, r8, #1                 @Dec counter
+    cmp r8, #0                     @See if we processed the last block
+    beq funcend_sad_16x16          @If we processed the last block go to the end of the function
+
+    sub r0, r0, #8                 @Since we processed an odd block, move src back by 8 cols
+    sub r1, r1, #8                 @Since we processed an odd block, move pred back by 8 cols
+
+skip_odd_blk:
+
+ vmov.s16 q0, #0 @Initialize the accumulator
+ vmov.s16 q1, #0 @Initialize the accumulator
+
+ vld1.u8 {q15}, [r0], r2 @load src r1
+ vld1.u8 {q14}, [r1], r3 @load pred r1
+
+ vld1.u8 {q13}, [r0], r2 @load src r2
+ vld1.u8 {q12}, [r1], r3 @load pred r2
+
+ vld1.u8 {q11}, [r0], r2 @load src r3
+    vld1.u8 {q10}, [r1], r3        @load pred r3
+
+ vld1.u8 {q9}, [r0], r2 @load src r4
+ vld1.u8 {q8}, [r1], r3 @load pred r4
+
+ cmp r8, #2
+ beq sad_epilouge
+
+sad_loop:
+
+ vabal.u8 q0, d30, d28 @I accumulate Abs diff R1
+ vabal.u8 q1, d31, d29 @I accumulate Abs diff R1
+
+ vld1.u8 {q15}, [r0], r2 @II load r1 src
+ vabal.u8 q0, d26, d24 @I accumulate Abs diff R2
+
+ vld1.u8 {q14}, [r1], r3 @II load r1 pred
+ vabal.u8 q1, d27, d25 @I accumulate Abs diff R2
+
+    vld1.u8 {q13}, [r0], r2        @II load r2 src
+ vabal.u8 q0, d22, d20 @I accumulate Abs diff R3
+
+ vld1.u8 {q12}, [r1], r3 @II load r2 pred
+ vabal.u8 q1, d23, d21 @I accumulate Abs diff R3
+
+ vld1.u8 {q11}, [r0], r2 @II load r3 src
+ vabal.u8 q0, d18, d16 @I accumulate Abs diff R4
+
+
+    sub r8, r8, #2                 @Since we process two 4x8 blocks at a time, dec by 2
+ vld1.u8 {q10}, [r1], r3 @II load r3 pred
+ vabal.u8 q1, d19, d17 @I accumulate Abs diff R4
+
+ cmp r8, #2 @Check if last loop
+ vld1.u8 {q9}, [r0], r2 @II load r4 src
+ vld1.u8 {q8}, [r1], r3 @II load r4 pred
+
+ bne sad_loop @Go back to SAD computation
+
+sad_epilouge:
+ vabal.u8 q0, d30, d28 @Accumulate Abs diff R1
+ vabal.u8 q1, d31, d29 @Accumulate Abs diff R1
+
+ vabal.u8 q0, d26, d24 @Accumulate Abs diff R2
+ vabal.u8 q1, d27, d25 @Accumulate Abs diff R2
+
+ vabal.u8 q0, d22, d20 @Accumulate Abs diff R3
+    vabal.u8 q1, d23, d21          @Accumulate Abs diff R3
+
+ vabal.u8 q0, d18, d16 @Accumulate Abs diff R4
+ vabal.u8 q1, d19, d17 @Accumulate Abs diff R4
+
+ vadd.u16 q2, q0, q1 @ADD two accumulators
+ vadd.u16 d6, d4, d5 @Add two blk sad
+ vpadd.u16 d8, d6, d7 @Add col 1-2 and 3-4 sad
+ vpadd.u16 d10, d8, d9 @Add col 12-34 sad
+
+ vmov.u16 r9, d10[0] @move SAD to ARM
+ ADD R10, R10, R9 @Add to the global sad
+
+funcend_sad_16x16:                 @End of function
+ ldr r5, [sp, #44]
+ ldr r6, [sp, #48]
+
+    str r7, [r6]                   @Store the is_nonzero flag
+ str r10, [r5] @Store sad
+
+ @SUB SP,SP,#40
+ pop {r4-r12, pc}
+
+
diff --git a/encoder/arm/ime_platform_macros.h b/encoder/arm/ime_platform_macros.h
new file mode 100755
index 0000000..0f5b2f2
--- /dev/null
+++ b/encoder/arm/ime_platform_macros.h
@@ -0,0 +1,51 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ime_platform_macros.h
+*
+* @brief
+* Platform specific Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IME_PLATFORM_MACROS_H_
+#define _IME_PLATFORM_MACROS_H_
+
+/*****************************************************************************/
+/* Function macro definitions */
+/*****************************************************************************/
+
+#define USADA8(src,est,sad) \
+ sad += ABS(src[0]-est[0]) + \
+ ABS(src[1]-est[1]) + \
+ ABS(src[2]-est[2]) + \
+ ABS(src[3]-est[3])
+
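+/*
+ * Usage sketch (illustrative): accumulate the SAD over a run of 4 pixels, in
+ * the spirit of the ARM USADA8 instruction this macro is named after; ABS is
+ * the codec's absolute-value macro and the pointer names are hypothetical:
+ *
+ *     UWORD32 u4_sad = 0;
+ *     USADA8(pu1_src, pu1_est, u4_sad);    // u4_sad += |src[k] - est[k]|, k = 0..3
+ */
+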
+
+#endif /* _IME_PLATFORM_MACROS_H_ */