Diffstat (limited to 'common/arm/ihevc_inter_pred_luma_copy_w16out.s')
 common/arm/ihevc_inter_pred_luma_copy_w16out.s | 249 +++++++++++++++++++++++
 1 file changed, 249 insertions(+), 0 deletions(-)
diff --git a/common/arm/ihevc_inter_pred_luma_copy_w16out.s b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
new file mode 100644
index 0000000..771bcb3
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
@@ -0,0 +1,249 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*  inter prediction luma copy function with 16-bit output
+@*
+@* @par description:
+@*  copies the array of width 'wd' and height 'ht' from the location pointed
+@*  to by 'src' to the location pointed to by 'dst'; each sample is widened
+@*  to 16 bits and upshifted by 6 on the way out
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@*  word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*  none
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_copy_w16out (
+@ uword8 *pu1_src,
+@ word16 *pi2_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd )
+
+@**************variables vs registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pi2_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r7 => ht
+@ r12 => wd
+
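+@ A minimal C reference for this routine (illustrative sketch, not part of
+@ the decoder; the UWORD8/WORD16/WORD32 typedefs follow the conventions of
+@ the doc comment above, and the unused pi1_coeff argument is omitted).
+@ Every 8-bit source pixel is widened to 16 bits and left-shifted by 6,
+@ matching the vmovl.u8 / vshl #6 pairs below:
+@
+@ void ihevc_inter_pred_luma_copy_w16out_ref(UWORD8 *pu1_src,
+@                                            WORD16 *pi2_dst,
+@                                            WORD32 src_strd,
+@                                            WORD32 dst_strd,
+@                                            WORD32 ht,
+@                                            WORD32 wd)
+@ {
+@     WORD32 row, col;
+@     for(row = 0; row < ht; row++)
+@         for(col = 0; col < wd; col++)
+@             pi2_dst[row * dst_strd + col] =
+@                 (WORD16)(pu1_src[row * src_strd + col] << 6);
+@ }
+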
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_copy_w16out_a9q
+
+.type ihevc_inter_pred_luma_copy_w16out_a9q, %function
+
+ihevc_inter_pred_luma_copy_w16out_a9q:
+
+ stmfd sp!, {r4-r12, r14} @push callee-saved registers and lr (40 bytes)
+ ldr r12,[sp,#48] @loads wd (7th argument)
+ ldr r7,[sp,#44] @loads ht (6th argument)
+ cmp r7,#0 @check ht
+ ble end_loops @return if ht <= 0
+ tst r12,#7 @is wd a multiple of 8?
+ beq core_loop_wd_8
+ sub r11,r12,#4 @r11 = wd - 4
+ lsls r6,r3,#1 @r6 = dst_strd * 2 (destination samples are 16-bit)
+
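+@ wd is a multiple of 4 but not of 8, so the block is walked in 4x4 tiles:
+@ each pass of the inner loop below loads four 4-pixel rows, widens them to
+@ 16 bits, shifts left by 6 and stores 64 bits (four WORD16 samples) per row.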
+outer_loop_wd_4:
+ subs r4,r12,#0 @r4 = wd (column counter for this strip)
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+ vld1.8 {d0},[r0] @vld1_u8(pu1_src_tmp)
+ add r5,r0,r2 @pu1_src +src_strd
+ vmovl.u8 q0,d0 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ add r10,r1,r6
+ subs r4,r4,#4 @wd - 4
+ vshl.i64 q0,q0,#6 @vshlq_n_s64(temp, 6)
+ vld1.8 {d22},[r5],r2 @vld1_u8(pu1_src_tmp)
+ add r0,r0,#4 @pu1_src += 4
+ vst1.64 {d0},[r1] @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ add r1,r1,#8 @pi2_dst += 4 samples (8 bytes)
+ vmovl.u8 q11,d22 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ vld1.8 {d24},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vshl.i64 q11,q11,#6 @vshlq_n_s64(temp, 6)
+ vmovl.u8 q12,d24 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ vst1.64 {d22},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ vshl.i64 q12,q12,#6 @vshlq_n_s64(temp, 6)
+ vld1.8 {d26},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.64 {d24},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ vmovl.u8 q13,d26 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ vshl.i64 q13,q13,#6 @vshlq_n_s64(temp, 6)
+ vst1.64 {d26},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs r7,r7,#4 @ht - 4
+ sub r0,r5,r11 @pu1_src = start of the next 4-row block
+ sub r1,r10,r11,lsl #1 @pi2_dst = start of the next 4-row block
+ bgt outer_loop_wd_4
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @restore registers and return
+
+
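+@ One 8-pixel step of the wd >= 8 path, written as NEON intrinsics
+@ (illustrative sketch only; pu1_src_tmp and pi2_dst_tmp are hypothetical
+@ local pointers):
+@
+@     uint8x8_t src  = vld1_u8(pu1_src_tmp);                  /* 8 pixels */
+@     int16x8_t wide = vreinterpretq_s16_u16(vmovl_u8(src));  /* widen    */
+@     int16x8_t out  = vshlq_n_s16(wide, 6);                  /* << 6     */
+@     vst1q_s16(pi2_dst_tmp, out);
+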
+core_loop_wd_8:
+ @sub r11,r12,#8
+ lsls r5,r3,#1 @r5 = dst_strd * 2 (destination samples are 16-bit)
+ rsb r11,r12,r3, lsl #2 @r11 = (dst_strd * 4) - wd
+ rsb r8,r12,r2,lsl #2 @r8 = (src_strd * 4) - wd
+ mov r4,r12, lsr #3 @r4 = wd / 8
+ mul r7, r4 @r7 = ht * (wd / 8)
+ sub r4,r12,#0 @r4 = wd (sets flags)
+ sub r7,r7,#4 @reserve one 4-row group for the epilog
+
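+@ The wd >= 8 path is software-pipelined over 4-row-by-8-column groups:
+@ the prolog below issues the loads and widens for the first group,
+@ outer_loop_wd_8 overlaps the stores of one group with the loads of the
+@ next, and the epilog drains the last group. r7 = ht * (wd / 8) counts
+@ groups in steps of 4, and the "- 4" above reserves one for the epilog.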
+prolog:
+ add r6,r0,r2 @pu1_src_tmp += src_strd
+ add r10,r1,r5
+ vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
+ vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ subs r4,r4,#8 @wd decrements by 8
+ vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
+ addle r0,r0,r8 @if the strip is done, advance src to the next 4-row block
+ add r6,r0,r2 @pu1_src_tmp += src_strd
+ vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
+ vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
+
+ vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
+ addle r1,r1,r11,lsl #1 @if the strip is done, advance dst to the next 4-row block
+ suble r4,r12,#0 @reload r4 = wd for the next strip
+
+ subs r7,r7,#4 @one 4-row group consumed
+
+ blt epilog_end @jumps to epilog_end
+ beq epilog @jumps to epilog
+
+
+
+outer_loop_wd_8:
+
+ vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ subs r4,r4,#8 @wd decrements by 8
+ addle r0,r0,r8 @if the strip is done, advance src to the next 4-row block
+
+ add r6,r0,r2 @pu1_src_tmp += src_strd
+
+ vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
+ vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
+
+ vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
+
+ vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
+
+ vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
+ add r10,r1,r5
+
+ vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
+
+ vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
+
+ addle r1,r1,r11,lsl #1 @if the strip is done, advance dst to the next 4-row block
+ suble r4,r12,#0 @reload r4 = wd for the next strip
+
+ subs r7,r7,#4 @one 4-row group consumed
+ bgt outer_loop_wd_8
+
+epilog:
+ vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ @add r6,r0,r2 @pu1_src_tmp += src_strd
+
+ vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
+ add r10,r1,r5
+ vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
+
+ vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
+epilog_end:
+ vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+
+
+ ldmfd sp!,{r4-r12,r15} @restore registers and return
+
+
+
+