diff options
author | Hamsalekha S <hamsalekha.s@ittiam.com> | 2015-03-13 21:24:58 +0530 |
---|---|---|
committer | Hamsalekha S <hamsalekha.s@ittiam.com> | 2015-04-02 15:59:02 +0530 |
commit | 8d3d303c7942ced6a987a52db8977d768dc3605f (patch) | |
tree | cc806c96794356996b13ba9970941d0aed74a97e /common/arm/ih264_inter_pred_filters_luma_horz_a9q.s | |
parent | 3956d913d37327dcb340f836e604b04bd478b158 (diff) | |
download | android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.gz android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.bz2 android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.zip |
Initial version
Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
Diffstat (limited to 'common/arm/ih264_inter_pred_filters_luma_horz_a9q.s')
-rwxr-xr-x | common/arm/ih264_inter_pred_filters_luma_horz_a9q.s | 245 |
1 files changed, 245 insertions, 0 deletions
diff --git a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s new file mode 100755 index 0000000..ea6bba0 --- /dev/null +++ b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s @@ -0,0 +1,245 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_horz_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_horz_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* Interprediction luma filter for horizontal input +@* +@* @par Description: +@* Applies a 6 tap horizontal filter .The output is clipped to 8 bits +@* sec 8.4.2.2.1 titled "Luma sample interpolation process" +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@ @remarks +@* None +@* +@******************************************************************************* +@*/ + +@void ih264_inter_pred_luma_horz ( +@ UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd ) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r5 => ht +@ r6 => wd + +.text +.p2align 2 + + + .global ih264_inter_pred_luma_horz_a9q + +ih264_inter_pred_luma_horz_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r5, [sp, #104] @Loads ht + sub r0, r0, #2 @pu1_src-2 + ldr r6, [sp, #108] @Loads wd + vmov.i8 d0, #5 @filter coeff + subs r12, r6, #8 @if wd=8 branch to loop_8 + vmov.i8 d1, #20 @filter coeff + beq loop_8 + + subs r12, r6, #4 @if wd=4 branch to loop_4 + beq loop_4 + +loop_16: @when wd=16 + @// Processing row0 and row1 + vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0 ;for checking loop + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1 + vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0) + vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0) + vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1) + vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0) + vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0) + vext.8 d27, d6, d7, #2 @//extract a[2] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1) + vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1) + vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0) + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1) + vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0) + vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vext.8 d27, d6, d7, #1 @//extract a[1] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0) + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1) + vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0) + vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1) + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row2) + vst1.8 {d20, d21}, [r1], r3 @//Store dest row0 + vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row2) + vqrshrun.s16 d24, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1) + vst1.8 {d23, d24}, [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + + beq end_func + b loop_16 @ loop if height == 8 or 16 + +loop_8: +@// Processing row0 and row1 + vld1.8 {d5, d6}, [r0], r2 @// Load row1 + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vld1.8 {d2, d3}, [r0], r2 @// Load row0 + vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1) + vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1) + vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1) + vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0) + vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0) + vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vst1.8 {d23}, [r1], r3 @//Store dest row0 + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vst1.8 {d20}, [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + + beq end_func @ Branch if height==4 + + b loop_8 @looping if height =8 or 16 + +loop_4: + vld1.8 {d5, d6}, [r0], r2 @// Load row1 + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vld1.8 {d2, d3}, [r0], r2 @// Load row0 + vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1) + vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1) + vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1) + vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1) + vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0) + vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0) + vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0) + vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vst1.32 d23[0], [r1], r3 @//Store dest row0 + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vst1.32 d20[0], [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + beq end_func + + b loop_4 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + |