diff options
Diffstat (limited to 'common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s')
-rw-r--r-- | common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s | 356 |
1 files changed, 356 insertions, 0 deletions
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s new file mode 100644 index 0000000..55e7f54 --- /dev/null +++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s @@ -0,0 +1,356 @@ +///***************************************************************************** +//* +//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//*****************************************************************************/ +///** +//******************************************************************************* +//* //file +//* ihevc_inter_pred_chroma_vert_neon_w16inp_neon.s +//* +//* //brief +//* contains function definitions for inter prediction interpolation. +//* functions are coded using neon intrinsics and can be compiled using + +//* rvct +//* +//* //author +//* yogeswaran rs / parthiban +//* +//* //par list of functions: +//* +//* +//* //remarks +//* none +//* +//******************************************************************************* +//*/ +///** +///** +//******************************************************************************* +//* +//* //brief +//* chroma interprediction filter for 16bit vertical input. 
//*
//* //par description:
//*    applies a vertical 4-tap filter with coefficients pointed to by
//*    'pi1_coeff' to the 16-bit elements pointed to by 'pi2_src' and writes
//*    8-bit results to the location pointed to by 'pu1_dst'.
//*    the filter output is downshifted by 12 in total (a saturating shift by 6
//*    followed by a rounding, unsigned-saturating shift by 6), which clips the
//*    result to lie between 0 and 255.
//*    assumptions : the function is optimized considering the fact width and
//*    height are multiple of 2.
//*
//* //param[in] pi2_src
//*    word16 pointer to the source
//*
//* //param[out] pu1_dst
//*    uword8 pointer to the destination
//*
//* //param[in] src_strd
//*    integer source stride (in 16-bit elements; doubled to bytes on entry)
//*
//* //param[in] dst_strd
//*    integer destination stride
//*
//* //param[in] pi1_coeff
//*    word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*    integer height of the array
//*
//* //param[in] wd
//*    integer width of the array
//*
//* //returns
//*
//* //remarks
//*    none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
//                                         uword8 *pu1_dst,
//                                         word32 src_strd,
//                                         word32 dst_strd,
//                                         word8 *pi1_coeff,
//                                         word32 ht,
//                                         word32 wd)
//**************variables vs registers*****************************************
//x0  => pi2_src on entry; afterwards the running row pointer for the taps
//x1  => pu1_dst
//x2  => src_strd (sign-extended, then converted to bytes)
//x3  => dst_strd (sign-extended)
//x4  => pi1_coeff on entry; afterwards pi2_src - src_strd (first tap row)
//x5  => ht (sign-extended)
//x6  => wd (sign-extended)
//v12-v15 => coeff_0 .. coeff_3, each broadcast across four 16-bit lanes
//x20 => scratch for the conditional pointer/counter updates (saved/restored)

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_w16inp_av8

.type ihevc_inter_pred_chroma_vert_w16inp_av8, %function

ihevc_inter_pred_chroma_vert_w16inp_av8:

    // NOTE(review): push_v_regs / pop_v_regs come from ihevc_neon_macros.s;
    // presumably they save/restore the callee-saved vector registers
    // (v8 and v12-v15 are written below) - confirm against the macro file.
    push_v_regs
    stp         x19, x20, [sp, #-16]!       //x20 is used as scratch below

    // The four word32 arguments arrive in w2/w3/w5/w6. AAPCS64 leaves the
    // upper 32 bits of those registers unspecified, so widen them before
    // any 64-bit address or counter arithmetic.
    sxtw        x2, w2                      //src_strd
    sxtw        x3, w3                      //dst_strd
    sxtw        x5, w5                      //ht
    sxtw        x6, w6                      //wd

    lsl         x2, x2, #1                  //src_strd in bytes (16-bit input)
    ld1         {v0.8b}, [x4]               //loads pi1_coeff
    sub         x4, x0, x2                  //pi2_src - src_strd
    sxtl        v0.8h, v0.8b                //widen the coefficients to 16 bit

    tst         x6, #3                      //is wd a multiple of 4 ?
    dup         v12.4h, v0.h[0]             //coeff_0
    dup         v13.4h, v0.h[1]             //coeff_1
    dup         v14.4h, v0.h[2]             //coeff_2
    dup         v15.4h, v0.h[3]             //coeff_3

    // flags are still those of the tst above (dup does not touch them);
    // with a mask of 3 the result is never negative, so gt == "not zero"
    bgt         core_loop_ht_2              //wd not a multiple of 4

    tst         x5, #3                      //is ht a multiple of 4 ?
    beq         core_loop_ht_4              //yes: 4-row software-pipelined path

//-----------------------------------------------------------------------------
// generic path: two output rows per pass, four output bytes per row per
// inner iteration
//-----------------------------------------------------------------------------
core_loop_ht_2:
    lsl         x7, x2, #1                  //2 source rows, in bytes
    lsl         x12, x3, #1                 //2*dst_strd
    lsl         x9, x6, #2                  //4*wd = source bytes consumed per row
    sub         x6, x12, x6, lsl #1         //2*dst_strd - 2*wd
    sub         x8, x7, x9                  //2 source rows (bytes) - 4*wd
    mov         x12, x9                     //inner counter = 4*wd

inner_loop_ht_2:
    add         x0, x4, x2                  //row pointer for the second tap
    ld1         {v0.4h}, [x4], #8           //tap row 0, advance column pointer
    smull       v0.4s, v0.4h, v12.4h        //vmull_s16(src_tmp1, coeff_0)
    subs        x12, x12, #8                //8 source bytes consumed; sets flags
    ld1         {v2.4h}, [x0], x2           //tap row 1
    smull       v8.4s, v2.4h, v12.4h        //vmull_s16(src_tmp2, coeff_0)
    ld1         {v3.4h}, [x0], x2           //tap row 2
    smlal       v0.4s, v2.4h, v13.4h
    ld1         {v6.4h}, [x0], x2           //tap row 3
    smlal       v8.4s, v3.4h, v13.4h
    ld1         {v2.4h}, [x0]               //tap row 4 (last tap of 2nd output row)
    add         x7, x1, x3                  //pu1_dst + dst_strd
    smlal       v0.4s, v3.4h, v14.4h
    smlal       v8.4s, v6.4h, v14.4h
    smlal       v0.4s, v6.4h, v15.4h
    smlal       v8.4s, v2.4h, v15.4h
    sqshrn      v0.4h, v0.4s, #6            //saturating right shift by 6
    sqshrn      v30.4h, v8.4s, #6           //saturating right shift by 6
    sqrshrun    v0.8b, v0.8h, #6            //rounding shift by 6, clip to u8
    sqrshrun    v30.8b, v30.8h, #6          //rounding shift by 6, clip to u8
    st1         {v0.s}[0], [x1], #4         //store 4 bytes of output row 0
    st1         {v30.s}[0], [x7]            //store 4 bytes of output row 1
    bgt         inner_loop_ht_2             //flags from subs x12 above

    //inner loop ends
    subs        x5, x5, #2                  //two rows done
    add         x1, x1, x6                  //pu1_dst += 2*dst_strd - 2*wd
    mov         x12, x9                     //reload inner counter (4*wd)
    add         x4, x4, x8                  //source += 2 rows (bytes) - 4*wd
    bgt         inner_loop_ht_2             //loop again

    b           end_loops                   //jumps to end

//-----------------------------------------------------------------------------
// fast path (wd and ht multiples of 4): four output rows x four output bytes
// per iteration, arranged as prolog / kernel_4 / epilog.
// the csel ..,le instructions consume the flags of the preceding
// "subs x11" (end of a row band), so the instruction order must be kept.
//-----------------------------------------------------------------------------
core_loop_ht_4:
    lsl         x7, x2, #2                  //4 source rows, in bytes
    lsl         x12, x3, #2                 //4*dst_strd
    lsr         x11, x6, #1                 //wd / 2
    sub         x14, x12, x6, lsl #1        //4*dst_strd - 2*wd
    sub         x8, x7, x6, lsl #2          //4 source rows (bytes) - 4*wd

    mul         x12, x5, x11                //ht * (wd / 2): total work counter
    sub         x12, x12, #4                //one iteration is left for the epilog
    lsl         x11, x6, #1                 //2*wd = output bytes per row

prolog:
    add         x0, x4, x2                  //row pointer for the second tap
    ld1         {v0.4h}, [x4], #8           //tap row 0, advance column pointer
    ld1         {v1.4h}, [x0], x2           //tap row 1
    subs        x11, x11, #4                //4 output bytes of this row band done
    ld1         {v2.4h}, [x0], x2           //tap row 2
    smull       v30.4s, v0.4h, v12.4h       //vmull_s16(src_tmp1, coeff_0)
    ld1         {v3.4h}, [x0], x2           //tap row 3
    smlal       v30.4s, v1.4h, v13.4h
    smlal       v30.4s, v2.4h, v14.4h
    add         x9, x1, x3                  //pu1_dst + dst_strd
    smlal       v30.4s, v3.4h, v15.4h

    ld1         {v4.4h}, [x0], x2           //tap row 4
    smull       v28.4s, v1.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
    add         x20, x4, x8
    csel        x4, x20, x4, le             //band finished: step source to next band
    smlal       v28.4s, v2.4h, v13.4h
    ld1         {v5.4h}, [x0], x2           //tap row 5
    smlal       v28.4s, v3.4h, v14.4h
    ld1         {v6.4h}, [x0], x2           //tap row 6
    smlal       v28.4s, v4.4h, v15.4h
    lsl         x20, x6, #1
    csel        x11, x20, x11, le           //band finished: reload 2*wd

    sqshrn      v30.4h, v30.4s, #6          //saturating right shift by 6

    smull       v26.4s, v2.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
    add         x0, x4, x2                  //row pointer for the next column group
    smlal       v26.4s, v3.4h, v13.4h
    smlal       v26.4s, v4.4h, v14.4h
    ld1         {v0.4h}, [x4], #8           //next group: tap row 0
    smlal       v26.4s, v5.4h, v15.4h

    sqrshrun    v30.8b, v30.8h, #6          //rounding shift by 6, clip to u8
    sqshrn      v28.4h, v28.4s, #6          //saturating right shift by 6

    ld1         {v1.4h}, [x0], x2           //next group: tap row 1
    smull       v24.4s, v3.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
    st1         {v30.s}[0], [x1], #4        //store output row 0
    smlal       v24.4s, v4.4h, v13.4h
    ld1         {v2.4h}, [x0], x2           //next group: tap row 2
    smlal       v24.4s, v5.4h, v14.4h
    ld1         {v3.4h}, [x0], x2           //next group: tap row 3
    smlal       v24.4s, v6.4h, v15.4h
    add         x20, x1, x14
    csel        x1, x20, x1, le             //band finished: step dst to next band

    sqshrn      v26.4h, v26.4s, #6          //saturating right shift by 6
    subs        x12, x12, #4                //total work counter
    sqrshrun    v28.8b, v28.8h, #6          //rounding shift by 6, clip to u8

    beq         epilog                      //nothing left for the kernel

kernel_4:
    smull       v30.4s, v0.4h, v12.4h       //vmull_s16(src_tmp1, coeff_0)
    subs        x11, x11, #4                //4 output bytes of this row band done
    smlal       v30.4s, v1.4h, v13.4h
    st1         {v28.s}[0], [x9], x3        //store output row 1 (previous group)
    smlal       v30.4s, v2.4h, v14.4h
    smlal       v30.4s, v3.4h, v15.4h

    sqshrn      v24.4h, v24.4s, #6          //saturating right shift by 6
    sqrshrun    v26.8b, v26.8h, #6          //rounding shift by 6, clip to u8

    ld1         {v4.4h}, [x0], x2           //tap row 4
    smull       v28.4s, v1.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
    smlal       v28.4s, v2.4h, v13.4h
    smlal       v28.4s, v3.4h, v14.4h
    smlal       v28.4s, v4.4h, v15.4h
    st1         {v26.s}[0], [x9], x3        //store output row 2 (previous group)
    add         x20, x4, x8
    csel        x4, x20, x4, le             //band finished: step source to next band
    lsl         x20, x6, #1
    csel        x11, x20, x11, le           //band finished: reload 2*wd

    sqshrn      v30.4h, v30.4s, #6          //saturating right shift by 6
    sqrshrun    v24.8b, v24.8h, #6          //rounding shift by 6, clip to u8

    ld1         {v5.4h}, [x0], x2           //tap row 5
    smull       v26.4s, v2.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
    ld1         {v6.4h}, [x0], x2           //tap row 6
    smlal       v26.4s, v3.4h, v13.4h
    st1         {v24.s}[0], [x9]            //store output row 3 (previous group)
    add         x0, x4, x2                  //row pointer for the next column group
    smlal       v26.4s, v4.4h, v14.4h
    ld1         {v0.4h}, [x4], #8           //next group: tap row 0
    smlal       v26.4s, v5.4h, v15.4h

    sqshrn      v28.4h, v28.4s, #6          //saturating right shift by 6
    sqrshrun    v30.8b, v30.8h, #6          //rounding shift by 6, clip to u8

    ld1         {v1.4h}, [x0], x2           //next group: tap row 1
    smull       v24.4s, v3.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
    add         x9, x1, x3                  //pu1_dst + dst_strd
    ld1         {v2.4h}, [x0], x2           //next group: tap row 2
    smlal       v24.4s, v4.4h, v13.4h
    ld1         {v3.4h}, [x0], x2           //next group: tap row 3
    smlal       v24.4s, v5.4h, v14.4h

    st1         {v30.s}[0], [x1], #4        //store output row 0
    smlal       v24.4s, v6.4h, v15.4h

    sqshrn      v26.4h, v26.4s, #6          //saturating right shift by 6
    sqrshrun    v28.8b, v28.8h, #6          //rounding shift by 6, clip to u8
    add         x20, x1, x14
    csel        x1, x20, x1, le             //band finished: step dst to next band

    subs        x12, x12, #4                //total work counter

    bgt         kernel_4                    //jumps to kernel_4

epilog:
    smull       v30.4s, v0.4h, v12.4h       //vmull_s16(src_tmp1, coeff_0)
    st1         {v28.s}[0], [x9], x3        //store output row 1 (previous group)
    smlal       v30.4s, v1.4h, v13.4h
    smlal       v30.4s, v2.4h, v14.4h
    smlal       v30.4s, v3.4h, v15.4h

    sqshrn      v24.4h, v24.4s, #6          //saturating right shift by 6
    sqrshrun    v26.8b, v26.8h, #6          //rounding shift by 6, clip to u8

    smull       v28.4s, v1.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
    ld1         {v4.4h}, [x0], x2           //tap row 4
    smlal       v28.4s, v2.4h, v13.4h
    st1         {v26.s}[0], [x9], x3        //store output row 2 (previous group)
    smlal       v28.4s, v3.4h, v14.4h
    smlal       v28.4s, v4.4h, v15.4h

    sqshrn      v30.4h, v30.4s, #6          //saturating right shift by 6
    sqrshrun    v24.8b, v24.8h, #6          //rounding shift by 6, clip to u8

    smull       v26.4s, v2.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
    ld1         {v5.4h}, [x0], x2           //tap row 5
    smlal       v26.4s, v3.4h, v13.4h
    smlal       v26.4s, v4.4h, v14.4h
    smlal       v26.4s, v5.4h, v15.4h

    sqshrn      v28.4h, v28.4s, #6          //saturating right shift by 6
    sqrshrun    v30.8b, v30.8h, #6          //rounding shift by 6, clip to u8

    st1         {v24.s}[0], [x9]            //store output row 3 (previous group)
    smull       v24.4s, v3.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
    smlal       v24.4s, v4.4h, v13.4h
    add         x9, x1, x3                  //pu1_dst + dst_strd
    ld1         {v6.4h}, [x0], x2           //tap row 6
    smlal       v24.4s, v5.4h, v14.4h
    smlal       v24.4s, v6.4h, v15.4h
    st1         {v30.s}[0], [x1], #4        //store output row 0

    sqrshrun    v28.8b, v28.8h, #6          //rounding shift by 6, clip to u8
    sqshrn      v26.4h, v26.4s, #6          //saturating right shift by 6

    st1         {v28.s}[0], [x9], x3        //store output row 1
    sqrshrun    v26.8b, v26.8h, #6          //rounding shift by 6, clip to u8

    sqshrn      v24.4h, v24.4s, #6          //saturating right shift by 6
    st1         {v26.s}[0], [x9], x3        //store output row 2
    sqrshrun    v24.8b, v24.8h, #6          //rounding shift by 6, clip to u8

    st1         {v24.s}[0], [x9]            //store output row 3

end_loops:
    ldp         x19, x20, [sp], #16         //restore in reverse order of the prologue
    pop_v_regs
    ret