1 files changed, 356 insertions, 0 deletions
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
new file mode 100644
index 0000000..55e7f54
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
@@ -0,0 +1,356 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//*  ihevc_inter_pred_chroma_vert_w16inp.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction  interpolation.
+//*  functions are coded using neon intrinsics and can be compiled using
+//*  rvct
+//*
+//*
+//* //author
+//*  yogeswaran rs / parthiban
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*       chroma interprediction filter for 16bit vertical input.
+//*
+//* //par description:
+//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//*    the elements pointed to by 'pi2_src' and writes to the location pointed to
+//*    by 'pu1_dst'. input is 16 bits; the filter output is downshifted by 12 and
+//*    clipped to lie between 0 and 255. assumptions: the function is optimized
+//*    considering the fact that width and height are multiples of 2.
+//*
+//* //param[in] pi2_src
+//*  word16 pointer to the source
+//*
+//* //param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
+//                                          uword8 *pu1_dst,
+//                                          word32 src_strd,
+//                                          word32 dst_strd,
+//                                          word8 *pi1_coeff,
+//                                          word32 ht,
+//                                          word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pi2_src
+//x1 => *pu1_dst
+//x2 =>  src_strd
+//x3 =>  dst_strd
+//x4 => *pi1_coeff, x5 => ht, x6 => wd
+.text                                       // emit what follows into the code section
+.align 4                                    // power-of-two operand under GNU as for Arm targets, i.e. 16-byte alignment
+
+.include "ihevc_neon_macros.s"              // expected to supply push_v_regs / pop_v_regs used by the function below
+
+.globl ihevc_inter_pred_chroma_vert_w16inp_av8                 // export the entry point to the linker
+
+.type ihevc_inter_pred_chroma_vert_w16inp_av8, %function       // mark the symbol as a function in the ELF symbol table
+
+ihevc_inter_pred_chroma_vert_w16inp_av8:
+    // in: x0=pi2_src  x1=pu1_dst  w2=src_strd  w3=dst_strd  x4=pi1_coeff  w5=ht  w6=wd
+    // 4-tap vertical filter on 16-bit rows; every sum is shifted right by 12 and saturated to u8
+    push_v_regs                             //macro from ihevc_neon_macros.s; presumably saves v8-v15 (v8, v12-v15 are written below)
+    stp         x19, x20,[sp,#-16]!         //x20 is scratch below; x19 only fills the 16-byte pair
+    // AAPCS64 leaves bits 63:32 of a 32-bit argument unspecified, so widen before any 64-bit use
+    sxtw        x2,w2                       //src_strd
+    sxtw        x3,w3                       //dst_strd
+    sxtw        x5,w5                       //ht
+    sxtw        x6,w6                       //wd
+
+    lsl         x2,x2,#1                    //src_strd in bytes (source samples are 16 bit)
+    // x4 carries pi1_coeff only as far as the ld1 below; after that it is the source row
+    // pointer, starting one row above pi2_src where the first filter tap is applied
+    ld1         {v0.8b},[x4]                //reads 8 bytes, only 4 are used; NOTE(review): confirm the table is readable that far
+    sub         x4,x0,x2                    //pi2_src - src_strd
+    sxtl        v0.8h, v0.8b                //widen the coefficients to 16 bit
+
+    tst         x6,#3                       //Z is set when wd is a multiple of 4
+    dup         v12.4h, v0.h[0]             //coeff_0
+    dup         v13.4h, v0.h[1]             //coeff_1
+    dup         v14.4h, v0.h[2]             //coeff_2
+    dup         v15.4h, v0.h[3]             //coeff_3
+
+    bne         core_loop_ht_2              //wd % 4 != 0: use the two-row loop
+
+    tst         x5,#3                       //Z is set when ht is a multiple of 4
+    beq         core_loop_ht_4              //wd and ht both multiples of 4: pipelined four-row loop
+
+core_loop_ht_2:
+    lsl         x7,x2,#1                    //two source rows, in bytes
+    lsl         x12,x3,#1                   //2*dst_strd
+    lsl         x9,x6,#2                    //4*wd = source bytes per row
+    sub         x6,x12,x6,lsl #1            //2*dst_strd - 2*wd: dst step to the next row pair
+    sub         x8,x7,x9                    //two source rows - 4*wd: src step to the next row pair
+    mov         x12,x9                      //x12 = source bytes still to filter in this row pair
+
+inner_loop_ht_2:
+    add         x0,x4,x2                    //x0 = row after the one at x4
+    ld1         {v0.4h},[x4],#8             //row 0 (4 samples); x4 advances 8 bytes
+    smull       v0.4s, v0.4h, v12.4h        //out0  = row0 * coeff_0
+    subs        x12,x12,#8                  //8 source bytes consumed; flags are used by the bgt below
+    ld1         {v2.4h},[x0],x2             //row 1
+    smull       v8.4s, v2.4h, v12.4h        //out1  = row1 * coeff_0
+    ld1         {v3.4h},[x0],x2             //row 2
+    smlal       v0.4s, v2.4h, v13.4h        //out0 += row1 * coeff_1
+    ld1         {v6.4h},[x0],x2             //row 3
+    smlal       v8.4s, v3.4h, v13.4h        //out1 += row2 * coeff_1
+    ld1         {v2.4h},[x0]                //row 4 (row 1 in v2 is no longer needed)
+    add         x7,x1,x3                    //pu1_dst + dst_strd
+    smlal       v0.4s, v3.4h, v14.4h        //out0 += row2 * coeff_2
+    smlal       v8.4s, v6.4h, v14.4h        //out1 += row3 * coeff_2
+    smlal       v0.4s, v6.4h, v15.4h        //out0 += row3 * coeff_3
+    smlal       v8.4s, v2.4h, v15.4h        //out1 += row4 * coeff_3
+    sqshrn      v0.4h, v0.4s,#6             //out0: >> 6 with saturating narrow to 16 bit
+    sqshrn      v30.4h, v8.4s,#6            //out1: same, result moves to v30
+    sqrshrun    v0.8b, v0.8h,#6             //out0: rounding >> 6, saturate to unsigned 8 bit
+    sqrshrun    v30.8b, v30.8h,#6           //out1: same
+    st1         {v0.s}[0],[x1],#4           //4 output bytes of row 0; x1 advances
+    st1         {v30.s}[0],[x7]             //4 output bytes of row 1
+    bgt         inner_loop_ht_2             //more columns in this row pair
+
+    //row pair finished
+    subs        x5,x5,#2                    //two rows done
+    add         x1,x1,x6                    //pu1_dst += 2*dst_strd - 2*wd
+    mov         x12,x9                      //reload the per-row byte counter
+    add         x4,x4,x8                    //source pointer += two rows - 4*wd
+    bgt         inner_loop_ht_2             //rows remain
+
+    b           end_loops                   //skip the four-row path
+
+core_loop_ht_4:
+    lsl         x7,x2,#2                    //four source rows, in bytes
+    lsl         x12,x3,#2                   //4*dst_strd
+    lsr         x11, x6, #1                 //wd/2
+    sub         x14,x12,x6,lsl #1           //4*dst_strd - 2*wd: dst step to the next four rows
+    sub         x8,x7,x6,lsl #2             //four source rows - 4*wd: src step to the next four rows
+
+    mul         x12, x5 , x11               //ht*wd/2; every 4x4 output tile takes 4 off this count
+    sub         x12, x12,#4                 //the last tile is finished in the epilog
+    lsl         x11, x6, #1                 //2*wd = output bytes per row, used as the column counter
+
+prolog:
+    add         x0,x4,x2                    //x0 = row after the one at x4
+    ld1         {v0.4h},[x4],#8             //row 0; x4 advances 8 bytes
+    ld1         {v1.4h},[x0],x2             //row 1
+    subs        x11,x11,#4                  //4 output bytes taken; 'le' in the csels below = row group done
+    ld1         {v2.4h},[x0],x2             //row 2
+    smull       v30.4s, v0.4h, v12.4h       //out0  = row0 * coeff_0
+    ld1         {v3.4h},[x0],x2             //row 3
+    smlal       v30.4s, v1.4h, v13.4h       //out0 += row1 * coeff_1
+    smlal       v30.4s, v2.4h, v14.4h       //out0 += row2 * coeff_2
+    add         x9,x1,x3                    //pu1_dst + dst_strd
+    smlal       v30.4s, v3.4h, v15.4h       //out0 += row3 * coeff_3
+
+    ld1         {v4.4h},[x0],x2             //row 4
+    smull       v28.4s, v1.4h, v12.4h       //out1  = row1 * coeff_0
+    add         x20,x4,x8                   //candidate: start of the next four source rows
+    csel        x4, x20, x4,le              //taken only when the row group is done
+    smlal       v28.4s, v2.4h, v13.4h       //out1 += row2 * coeff_1
+    ld1         {v5.4h},[x0],x2             //row 5
+    smlal       v28.4s, v3.4h, v14.4h       //out1 += row3 * coeff_2
+    ld1         {v6.4h},[x0],x2             //row 6
+    smlal       v28.4s, v4.4h, v15.4h       //out1 += row4 * coeff_3
+    lsl         x20,x6,#1                   //2*wd
+    csel        x11, x20, x11,le            //row group done: reload the column counter
+
+    sqshrn      v30.4h, v30.4s,#6           //out0: >> 6, saturating narrow to 16 bit
+
+    smull       v26.4s, v2.4h, v12.4h       //out2  = row2 * coeff_0
+    add         x0,x4,x2                    //row pointer for the next tile
+    smlal       v26.4s, v3.4h, v13.4h       //out2 += row3 * coeff_1
+    smlal       v26.4s, v4.4h, v14.4h       //out2 += row4 * coeff_2
+    ld1         {v0.4h},[x4],#8             //next tile, row 0
+    smlal       v26.4s, v5.4h, v15.4h       //out2 += row5 * coeff_3
+
+    sqrshrun    v30.8b, v30.8h,#6           //out0: rounding >> 6, saturate to unsigned 8 bit
+    sqshrn      v28.4h, v28.4s,#6           //out1: >> 6, saturating narrow to 16 bit
+
+    ld1         {v1.4h},[x0],x2             //next tile, row 1
+    smull       v24.4s, v3.4h, v12.4h       //out3  = row3 * coeff_0
+    st1         {v30.s}[0],[x1],#4          //store out0 (4 bytes); x1 advances
+    smlal       v24.4s, v4.4h, v13.4h       //out3 += row4 * coeff_1
+    ld1         {v2.4h},[x0],x2             //next tile, row 2
+    smlal       v24.4s, v5.4h, v14.4h       //out3 += row5 * coeff_2
+    ld1         {v3.4h},[x0],x2             //next tile, row 3
+    smlal       v24.4s, v6.4h, v15.4h       //out3 += row6 * coeff_3
+    add         x20,x1,x14                  //candidate: start of the next four output rows
+    csel        x1, x20, x1,le              //taken only when the row group is done
+
+    sqshrn      v26.4h, v26.4s,#6           //out2: >> 6, saturating narrow to 16 bit
+    subs        x12,x12,#4                  //one tile accounted for
+    sqrshrun    v28.8b, v28.8h,#6           //out1: rounding >> 6, saturate to unsigned 8 bit
+
+    beq         epilog                      //only the final tile is left
+
+kernel_4:
+    smull       v30.4s, v0.4h, v12.4h       //out0  = row0 * coeff_0
+    subs        x11,x11,#4                  //4 output bytes taken; 'le' in the csels below = row group done
+    smlal       v30.4s, v1.4h, v13.4h       //out0 += row1 * coeff_1
+    st1         {v28.s}[0],[x9],x3          //previous tile: store out1, step to the next output row
+    smlal       v30.4s, v2.4h, v14.4h       //out0 += row2 * coeff_2
+    smlal       v30.4s, v3.4h, v15.4h       //out0 += row3 * coeff_3
+
+    sqshrn      v24.4h, v24.4s,#6           //previous out3: >> 6, saturating narrow to 16 bit
+    sqrshrun    v26.8b, v26.8h,#6           //previous out2: rounding >> 6, saturate to unsigned 8 bit
+
+    ld1         {v4.4h},[x0],x2             //row 4
+    smull       v28.4s, v1.4h, v12.4h       //out1  = row1 * coeff_0
+    smlal       v28.4s, v2.4h, v13.4h       //out1 += row2 * coeff_1
+    smlal       v28.4s, v3.4h, v14.4h       //out1 += row3 * coeff_2
+    smlal       v28.4s, v4.4h, v15.4h       //out1 += row4 * coeff_3
+    st1         {v26.s}[0],[x9],x3          //previous tile: store out2
+    add         x20,x4,x8                   //candidate: start of the next four source rows
+    csel        x4, x20, x4,le              //taken only when the row group is done
+    lsl         x20,x6,#1                   //2*wd
+    csel        x11, x20, x11,le            //row group done: reload the column counter
+
+    sqshrn      v30.4h, v30.4s,#6           //out0: >> 6, saturating narrow to 16 bit
+    sqrshrun    v24.8b, v24.8h,#6           //previous out3: rounding >> 6, saturate to unsigned 8 bit
+
+    ld1         {v5.4h},[x0],x2             //row 5
+    smull       v26.4s, v2.4h, v12.4h       //out2  = row2 * coeff_0
+    ld1         {v6.4h},[x0],x2             //row 6
+    smlal       v26.4s, v3.4h, v13.4h       //out2 += row3 * coeff_1
+    st1         {v24.s}[0],[x9]             //previous tile: store out3
+    add         x0,x4,x2                    //row pointer for the next tile
+    smlal       v26.4s, v4.4h, v14.4h       //out2 += row4 * coeff_2
+    ld1         {v0.4h},[x4],#8             //next tile, row 0
+    smlal       v26.4s, v5.4h, v15.4h       //out2 += row5 * coeff_3
+
+    sqshrn      v28.4h, v28.4s,#6           //out1: >> 6, saturating narrow to 16 bit
+    sqrshrun    v30.8b, v30.8h,#6           //out0: rounding >> 6, saturate to unsigned 8 bit
+
+    ld1         {v1.4h},[x0],x2             //next tile, row 1
+    smull       v24.4s, v3.4h, v12.4h       //out3  = row3 * coeff_0
+    add         x9,x1,x3                    //pu1_dst + dst_strd for this tile
+    ld1         {v2.4h},[x0],x2             //next tile, row 2
+    smlal       v24.4s, v4.4h, v13.4h       //out3 += row4 * coeff_1
+    ld1         {v3.4h},[x0],x2             //next tile, row 3
+    smlal       v24.4s, v5.4h, v14.4h       //out3 += row5 * coeff_2
+
+    st1         {v30.s}[0],[x1],#4          //store out0 (4 bytes); x1 advances
+    smlal       v24.4s, v6.4h, v15.4h       //out3 += row6 * coeff_3
+
+    sqshrn      v26.4h, v26.4s,#6           //out2: >> 6, saturating narrow to 16 bit
+    sqrshrun    v28.8b, v28.8h,#6           //out1: rounding >> 6, saturate to unsigned 8 bit
+    add         x20,x1,x14                  //candidate: start of the next four output rows
+    csel        x1, x20, x1,le              //taken only when the row group is done
+
+    subs        x12,x12,#4                  //one tile accounted for
+
+    bgt         kernel_4                    //tiles remain before the final one
+
+epilog:
+    smull       v30.4s, v0.4h, v12.4h       //out0  = row0 * coeff_0
+    st1         {v28.s}[0],[x9],x3          //previous tile: store out1, step to the next output row
+    smlal       v30.4s, v1.4h, v13.4h       //out0 += row1 * coeff_1
+    smlal       v30.4s, v2.4h, v14.4h       //out0 += row2 * coeff_2
+    smlal       v30.4s, v3.4h, v15.4h       //out0 += row3 * coeff_3
+
+    sqshrn      v24.4h, v24.4s,#6           //previous out3: >> 6, saturating narrow to 16 bit
+    sqrshrun    v26.8b, v26.8h,#6           //previous out2: rounding >> 6, saturate to unsigned 8 bit
+
+    smull       v28.4s, v1.4h, v12.4h       //out1  = row1 * coeff_0
+    ld1         {v4.4h},[x0],x2             //row 4
+    smlal       v28.4s, v2.4h, v13.4h       //out1 += row2 * coeff_1
+    st1         {v26.s}[0],[x9],x3          //previous tile: store out2
+    smlal       v28.4s, v3.4h, v14.4h       //out1 += row3 * coeff_2
+    smlal       v28.4s, v4.4h, v15.4h       //out1 += row4 * coeff_3
+
+    sqshrn      v30.4h, v30.4s,#6           //out0: >> 6, saturating narrow to 16 bit
+    sqrshrun    v24.8b, v24.8h,#6           //previous out3: rounding >> 6, saturate to unsigned 8 bit
+
+    smull       v26.4s, v2.4h, v12.4h       //out2  = row2 * coeff_0
+    ld1         {v5.4h},[x0],x2             //row 5
+    smlal       v26.4s, v3.4h, v13.4h       //out2 += row3 * coeff_1
+    smlal       v26.4s, v4.4h, v14.4h       //out2 += row4 * coeff_2
+    smlal       v26.4s, v5.4h, v15.4h       //out2 += row5 * coeff_3
+
+    sqshrn      v28.4h, v28.4s,#6           //out1: >> 6, saturating narrow to 16 bit
+    sqrshrun    v30.8b, v30.8h,#6           //out0: rounding >> 6, saturate to unsigned 8 bit
+
+    st1         {v24.s}[0],[x9]             //previous tile: store out3
+    smull       v24.4s, v3.4h, v12.4h       //out3  = row3 * coeff_0
+    smlal       v24.4s, v4.4h, v13.4h       //out3 += row4 * coeff_1
+    add         x9,x1,x3                    //pu1_dst + dst_strd for the final tile
+    ld1         {v6.4h},[x0],x2             //row 6
+    smlal       v24.4s, v5.4h, v14.4h       //out3 += row5 * coeff_2
+    smlal       v24.4s, v6.4h, v15.4h       //out3 += row6 * coeff_3
+    st1         {v30.s}[0],[x1],#4          //final tile: store out0
+
+    sqrshrun    v28.8b, v28.8h,#6           //out1: rounding >> 6, saturate to unsigned 8 bit
+    sqshrn      v26.4h, v26.4s,#6           //out2: >> 6, saturating narrow to 16 bit
+
+    st1         {v28.s}[0],[x9],x3          //final tile: store out1
+    sqrshrun    v26.8b, v26.8h,#6           //out2: rounding >> 6, saturate to unsigned 8 bit
+
+    sqshrn      v24.4h, v24.4s,#6           //out3: >> 6, saturating narrow to 16 bit
+    st1         {v26.s}[0],[x9],x3          //final tile: store out2
+    sqrshrun    v24.8b, v24.8h,#6           //out3: rounding >> 6, saturate to unsigned 8 bit
+
+    st1         {v24.s}[0],[x9]             //final tile: store out3
+
+end_loops:
+    // common exit: undo the two pushes from the entry sequence, in reverse order
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+