Diffstat (limited to 'common/arm/ihevc_inter_pred_luma_copy_w16out.s')
-rw-r--r-- | common/arm/ihevc_inter_pred_luma_copy_w16out.s | 249 |
1 file changed, 249 insertions, 0 deletions
diff --git a/common/arm/ihevc_inter_pred_luma_copy_w16out.s b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
new file mode 100644
index 0000000..771bcb3
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
@@ -0,0 +1,249 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*     inter prediction luma function for copy
+@*
+@* @par description:
+@*     copies the array of width 'wd' and height 'ht' from the location pointed
+@*     to by 'src' to the location pointed to by 'dst'
+@*
+@* @param[in] pu1_src
+@*     uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@*     word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@*     integer source stride
+@*
+@* @param[in] dst_strd
+@*     integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*     word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*     integer height of the array
+@*
+@* @param[in] wd
+@*     integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*     none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_copy_w16out (
+@    uword8 *pu1_src,
+@    word16 *pi2_dst,
+@    word32 src_strd,
+@    word32 dst_strd,
+@    word8 *pi1_coeff,
+@    word32 ht,
+@    word32 wd )
+
+@**************variables vs registers*****************************************
+@    r0  => *pu1_src
+@    r1  => *pi2_dst
+@    r2  => src_strd
+@    r3  => dst_strd
+@    r7  => ht
+@    r12 => wd
+
+.text
+.align 4
+
+.globl ihevc_inter_pred_luma_copy_w16out_a9q
+
+.type ihevc_inter_pred_luma_copy_w16out_a9q, %function
+
+ihevc_inter_pred_luma_copy_w16out_a9q:
+
+    stmfd sp!, {r4-r12, r14}        @stack stores the values of the arguments
+    ldr r12,[sp,#48]                @loads wd
+    ldr r7,[sp,#44]                 @loads ht
+    cmp r7,#0                       @ht condition (exit if ht <= 0)
+    ble end_loops
+    tst r12,#7                      @check whether wd is a multiple of 8
+    beq core_loop_wd_8
+    sub r11,r12,#4
+    lsls r6,r3,#1                   @dst stride in bytes (dst_strd * 2)
+
+outer_loop_wd_4:
+    subs r4,r12,#0                  @r4 = wd, set flags
+    ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+    vld1.8 {d0},[r0]                @vld1_u8(pu1_src_tmp)
+    add r5,r0,r2                    @pu1_src + src_strd
+    vmovl.u8 q0,d0                  @vmovl_u8(vld1_u8(pu1_src_tmp))
+    add r10,r1,r6
+    subs r4,r4,#4                   @wd - 4
+    vshl.i64 q0,q0,#6               @vshlq_n_s64(temp, 6)
+    vld1.8 {d22},[r5],r2            @vld1_u8(pu1_src_tmp)
+    add r0,r0,#4                    @pu1_src += 4
+    vst1.64 {d0},[r1]               @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    add r1,r1,#8
+    vmovl.u8 q11,d22                @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vld1.8 {d24},[r5],r2            @vld1_u8(pu1_src_tmp)
+    vshl.i64 q11,q11,#6             @vshlq_n_s64(temp, 6)
+    vmovl.u8 q12,d24                @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vst1.64 {d22},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    vshl.i64 q12,q12,#6             @vshlq_n_s64(temp, 6)
+    vld1.8 {d26},[r5],r2            @vld1_u8(pu1_src_tmp)
+    vst1.64 {d24},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    vmovl.u8 q13,d26                @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vshl.i64 q13,q13,#6             @vshlq_n_s64(temp, 6)
+    vst1.64 {d26},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+    subs r7,r7,#4                   @ht - 4
+    sub r0,r5,r11
+    sub r1,r10,r11,lsl #1
+    bgt outer_loop_wd_4
+
+end_loops:
+    ldmfd sp!,{r4-r12,r15}          @reload the registers from sp
+
+core_loop_wd_8:
+    @sub r11,r12,#8
+    lsls r5,r3,#1                   @dst stride in bytes (dst_strd * 2)
+    rsb r11,r12,r3, lsl #2          @r11 = (dst_strd * 4) - wd
+    rsb r8,r12,r2,lsl #2            @r8 = (src_strd * 4) - wd
+    mov r4,r12, lsr #3              @wd / 8
+    mul r7, r4                      @r7 = ht * (wd / 8)
+    sub r4,r12,#0                   @r4 = wd, set flags
+    sub r7,r7,#4                    @subtract one iteration for the epilog
+
+prolog:
+    add r6,r0,r2                    @pu1_src_tmp += src_strd
+    add r10,r1,r5
+    vld1.8 {d8},[r0]!               @vld1_u8(pu1_src_tmp)
+    vld1.8 {d10},[r6],r2            @vld1_u8(pu1_src_tmp)
+    vld1.8 {d12},[r6],r2            @vld1_u8(pu1_src_tmp)
+    vld1.8 {d14},[r6],r2            @vld1_u8(pu1_src_tmp)
+    vmovl.u8 q8,d8                  @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vmovl.u8 q9,d10                 @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vmovl.u8 q10,d12                @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vmovl.u8 q11,d14                @vmovl_u8(vld1_u8(pu1_src_tmp))
+    subs r4,r4,#8                   @wd decrements by 8
+    vshl.i16 q0,q8,#6               @vshlq_n_s16(tmp, 6)
+    vshl.i16 q1,q9,#6               @vshlq_n_s16(tmp, 6)
+    vshl.i16 q2,q10,#6              @vshlq_n_s16(tmp, 6)
+    vshl.i16 q3,q11,#6              @vshlq_n_s16(tmp, 6)
+    addle r0,r0,r8                  @row done: advance src to the next 4 rows
+    add r6,r0,r2                    @pu1_src_tmp += src_strd
+    vld1.8 {d8},[r0]!               @vld1_u8(pu1_src_tmp)
+    vld1.8 {d10},[r6],r2            @vld1_u8(pu1_src_tmp)
+    vld1.8 {d12},[r6],r2            @vld1_u8(pu1_src_tmp)
+    vld1.8 {d14},[r6],r2            @vld1_u8(pu1_src_tmp)
+
+    vst1.16 {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp)
+    addle r1,r1,r11,lsl #1          @row done: advance dst to the next 4 rows
+    suble r4,r12,#0                 @row done: reset r4 = wd
+
+    subs r7,r7,#4                   @loop counter - 4
+
+    blt epilog_end                  @jumps to epilog_end
+    beq epilog                      @jumps to epilog
+
+outer_loop_wd_8:
+    vst1.16 {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8 q8,d8                  @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vst1.16 {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8 q9,d10                 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vst1.16 {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8 q10,d12                @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vmovl.u8 q11,d14                @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    subs r4,r4,#8                   @wd decrements by 8
+    addle r0,r0,r8                  @row done: advance src to the next 4 rows
+
+    add r6,r0,r2                    @pu1_src_tmp += src_strd
+
+    vld1.8 {d8},[r0]!               @vld1_u8(pu1_src_tmp)
+    vshl.i16 q0,q8,#6               @vshlq_n_s16(tmp, 6)
+
+    vld1.8 {d10},[r6],r2            @vld1_u8(pu1_src_tmp)
+    vshl.i16 q1,q9,#6               @vshlq_n_s16(tmp, 6)
+
+    vld1.8 {d12},[r6],r2            @vld1_u8(pu1_src_tmp)
+    vshl.i16 q2,q10,#6              @vshlq_n_s16(tmp, 6)
+
+    vld1.8 {d14},[r6],r2            @vld1_u8(pu1_src_tmp)
+    add r10,r1,r5
+
+    vshl.i16 q3,q11,#6              @vshlq_n_s16(tmp, 6)
+
+    vst1.16 {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp)
+
+    addle r1,r1,r11,lsl #1          @row done: advance dst to the next 4 rows
+    suble r4,r12,#0                 @row done: reset r4 = wd
+
+    subs r7,r7,#4                   @loop counter - 4
+    bgt outer_loop_wd_8
+
+epilog:
+    vst1.16 {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8 q8,d8                  @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vst1.16 {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8 q9,d10                 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vst1.16 {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8 q10,d12                @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vmovl.u8 q11,d14                @vmovl_u8(vld1_u8(pu1_src_tmp))
+    @add r6,r0,r2                   @pu1_src_tmp += src_strd
+
+    vshl.i16 q0,q8,#6               @vshlq_n_s16(tmp, 6)
+    vshl.i16 q1,q9,#6               @vshlq_n_s16(tmp, 6)
+    vshl.i16 q2,q10,#6              @vshlq_n_s16(tmp, 6)
+    add r10,r1,r5
+    vshl.i16 q3,q11,#6              @vshlq_n_s16(tmp, 6)
+
+    vst1.16 {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp)
+
+epilog_end:
+    vst1.16 {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
+    vst1.16 {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
+    vst1.16 {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
+
+    ldmfd sp!,{r4-r12,r15}          @reload the registers from sp
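For readers following the NEON code above: the whole routine is a copy that widens each 8-bit luma pixel to 16 bits and scales it by 64 (left shift by 6, HEVC's intermediate-precision headroom), which is what the vmovl.u8 + vshl pairs implement. Below is a minimal scalar C sketch of that behavior. The stdint typedefs stand in for the libhevc UWORD8/WORD16/WORD32 names, and the _ref suffix is hypothetical; this illustrates what the assembly computes and is not the project's reference implementation. Note that, as in the assembly, src_strd counts bytes while dst_strd counts 16-bit elements, and pi1_coeff is unused in the copy path.

    #include <stdint.h>

    /* Assumed stand-ins for the libhevc typedefs. */
    typedef uint8_t UWORD8;
    typedef int8_t  WORD8;
    typedef int16_t WORD16;
    typedef int32_t WORD32;

    /* Scalar sketch: widen each pixel to 16 bits and shift left by 6,
     * mirroring the vmovl.u8 + vshl sequences in the assembly. */
    void ihevc_inter_pred_luma_copy_w16out_ref(UWORD8 *pu1_src, WORD16 *pi2_dst,
                                               WORD32 src_strd, WORD32 dst_strd,
                                               WORD8 *pi1_coeff, WORD32 ht,
                                               WORD32 wd)
    {
        (void)pi1_coeff;                  /* unused in the copy path */
        for (WORD32 row = 0; row < ht; row++)
        {
            for (WORD32 col = 0; col < wd; col++)
                pi2_dst[col] = (WORD16)(pu1_src[col] << 6);
            pu1_src += src_strd;          /* byte stride */
            pi2_dst += dst_strd;          /* 16-bit element stride */
        }
    }

The wd-multiple-of-8 split in the assembly exists because the 8-pixel path can issue full 128-bit vst1.16 stores four rows at a time, while the 4-pixel fallback must store a single 64-bit lane per row.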