diff options
Diffstat (limited to 'common/arm64/ihevc_inter_pred_chroma_copy_w16out.s')
-rw-r--r-- | common/arm64/ihevc_inter_pred_chroma_copy_w16out.s | 348 |
1 files changed, 348 insertions, 0 deletions
diff --git a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s new file mode 100644 index 0000000..e479651 --- /dev/null +++ b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s @@ -0,0 +1,348 @@ +///***************************************************************************** +//* +//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//*****************************************************************************/ +///** +//******************************************************************************* +//* //file +//* ihevc_inter_pred_chroma_copy_w16out_neon.s +//* +//* //brief +//* contains function definitions for inter prediction interpolation. 
//*  functions are hand-coded in armv8 (aarch64) neon assembly, written in
//*  gnu assembler syntax
//*
//* //author
//*  yogeswaran rs
//*
//* //par list of functions:
//*  - ihevc_inter_pred_chroma_copy_w16out_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  chroma inter prediction "copy" filter with 16-bit output
//*
//* //par description:
//*  reads 'ht' rows of 2 * 'wd' unsigned 8-bit samples from 'pu1_src',
//*  widens every sample to 16 bits, shifts it left by 6 and stores the
//*  result to 'pi2_dst'.
//*  (the code doubles 'wd'; presumably because cb/cr samples are
//*  interleaved - confirm against the caller.)
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride, in bytes
//*
//* //param[in] dst_strd
//*  integer destination stride, in 16-bit elements (the code doubles it to
//*  obtain the byte stride)
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (not used by this routine)
//*
//* //param[in] ht
//*  integer height of the array; nothing is written when ht <= 0
//*
//* //param[in] wd
//*  integer width of the array; nothing is written when wd <= 0
//*
//* //returns
//*  none
//*
//* //remarks
//*  - the 32-bit integer arguments are sign-extended on entry because the
//*    upper halves of x2/x3/x5/x6 are not guaranteed by the procedure call
//*    standard.
//*  - 8-wide path: taken when (ht & 6) != 6 and 2 * wd is a multiple of 8.
//*    it processes ht rounded down to a multiple of 4 rows, or exactly two
//*    rows when that rounded value is 0.
//*    NOTE(review): rows left over after the multiple-of-4 part (e.g.
//*    ht == 10) are not written on this path - confirm callers never pass
//*    such heights.
//*  - 4-wide path: processes ht rounded down to a multiple of 4 rows, then
//*    exactly two more rows when ht % 4 != 0. every load fetches 8 source
//*    bytes although only 4 are consumed.
//*  - the 8-wide path is software pipelined and always pre-loads the next
//*    8x4 block, including once after the final block.
//*    NOTE(review): both over-reads rely on readable memory around the
//*    source block - confirm the reference buffers are padded.
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
//                                          word16 *pi2_dst,
//                                          word32 src_strd,
//                                          word32 dst_strd,
//                                          word8 *pi1_coeff,
//                                          word32 ht,
//                                          word32 wd)
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pi2_dst
//w2 => src_strd    (sign-extended into x2 on entry)
//w3 => dst_strd    (sign-extended into x3 on entry)
//x4 => *pi1_coeff  (unused; x4 is recycled as the column counter)
//w5 => ht          (sign-extended into x7 on entry)
//w6 => wd          (sign-extended and doubled into x12 on entry)
//
//**************working registers**********************************************
//x7  => ht, later the remaining-work counter of the 8-wide path
//x8  => ht % 4 (4-wide path) / source rewind 4*src_strd - 2*wd (8-wide path)
//x9  => ht rounded down to a multiple of 4
//x11 => 2*wd - 4 (4-wide path) / 4*dst_strd - 2*wd in elements (8-wide path)
//x12 => 2*wd, samples per row
//x5/x6/x10 => row pointers and the destination byte stride
//x20 => scratch for conditional pointer updates (callee-saved, spilled)

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_copy_w16out_av8

.type ihevc_inter_pred_chroma_copy_w16out_av8, %function

ihevc_inter_pred_chroma_copy_w16out_av8:

    // push_v_regs / pop_v_regs are defined in ihevc_neon_macros.s (not
    // visible here); presumably they spill/restore the callee-saved simd
    // registers (v8-v15 are used below) - confirm against the macro file.
    push_v_regs
    stp         x19, x20, [sp, #-16]!       // x20 is used as scratch below

    // word32 arguments: the upper 32 bits of their x registers are
    // unspecified on entry, so extend them before any 64-bit arithmetic.
    sxtw        x2, w2                      // src_strd
    sxtw        x3, w3                      // dst_strd
    sxtw        x7, w5                      // ht
    sxtw        x12, w6                     // wd
    lsl         x12, x12, #1                // x12 = 2 * wd samples per row

    cmp         x7, #0
    ble         end_loops                   // ht <= 0 : nothing to do
    cmp         x12, #0
    ble         end_loops                   // wd <= 0 : nothing to do

    and         x8, x7, #3                  // x8 = ht % 4
    sub         x9, x7, x8                  // x9 = ht rounded down to 4
    and         x11, x7, #6
    cmp         x11, #6
    beq         loop_ht_6                   // (ht & 6) == 6 : 4-wide path
    tst         x12, #7
    beq         core_loop_wd_8              // 2*wd multiple of 8 : 8-wide path

// ---------------------------------------------------------------------------
// 4-wide path: 4 samples x 4 rows per inner iteration
// ---------------------------------------------------------------------------
loop_ht_6:
    sub         x11, x12, #4                // x11 = 2*wd - 4 (row rewind)
    lsl         x6, x3, #1                  // x6 = dst row stride in bytes
    cbz         x9, outer_loop_wd_4_ht_2    // fewer than 4 rows in total

outer_loop_wd_4:
    mov         x4, x12                     // x4 = samples left in this row

inner_loop_wd_4:
    // The shifts below use 64-bit lanes on widened data: every halfword is
    // < 256 after uxtl, so "<< 6" cannot carry into the next halfword.
    ld1         {v0.8b}, [x0]               // row 0 (8 bytes read, 4 used)
    add         x5, x0, x2                  // x5 -> source row 1
    uxtl        v0.8h, v0.8b                // u8 -> u16
    add         x10, x1, x6                 // x10 -> destination row 1
    subs        x4, x4, #4                  // flags feed the bgt below
    shl         v0.2d, v0.2d, #6
    ld1         {v22.8b}, [x5], x2          // row 1
    add         x0, x0, #4                  // next 4 source samples
    st1         {v0.1d}, [x1]               // 4 x 16-bit results, row 0
    add         x1, x1, #8                  // next 4 destination elements
    uxtl        v22.8h, v22.8b
    ld1         {v24.8b}, [x5], x2          // row 2
    shl         v22.2d, v22.2d, #6
    uxtl        v24.8h, v24.8b
    st1         {v22.1d}, [x10], x6         // row 1
    shl         v24.2d, v24.2d, #6
    ld1         {v26.8b}, [x5], x2          // row 3, x5 -> row 4 afterwards
    st1         {v24.1d}, [x10], x6         // row 2
    uxtl        v26.8h, v26.8b
    shl         v26.2d, v26.2d, #6
    st1         {v26.1d}, [x10], x6         // row 3, x10 -> row 4 afterwards
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        x9, x9, #4                  // four more rows done
    sub         x0, x5, x11                 // start of the next 4 source rows
    sub         x1, x10, x11, lsl #1        // start of the next 4 dst rows
    bgt         outer_loop_wd_4
    cbnz        x8, outer_loop_wd_4_ht_2    // ht % 4 != 0 : two-row tail

end_loops:
    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret

// 4-wide path, two-row tail: 4 samples x 2 rows per iteration
outer_loop_wd_4_ht_2:
    mov         x4, x12                     // x4 = samples left in this row

inner_loop_wd_4_ht_2:
    ld1         {v0.8b}, [x0]               // row 0 (8 bytes read, 4 used)
    add         x5, x0, x2                  // x5 -> source row 1
    uxtl        v0.8h, v0.8b
    add         x10, x1, x6                 // x10 -> destination row 1
    subs        x4, x4, #4                  // flags feed the bgt below
    shl         v0.2d, v0.2d, #6
    ld1         {v22.8b}, [x5]              // row 1
    add         x0, x0, #4
    st1         {v0.1d}, [x1]               // row 0
    add         x1, x1, #8
    uxtl        v22.8h, v22.8b
    shl         v22.2d, v22.2d, #6
    st1         {v22.1d}, [x10]             // row 1
    bgt         inner_loop_wd_4_ht_2
    b           end_loops

// ---------------------------------------------------------------------------
// 8-wide path: 8 samples x 4 rows per block, software pipelined
// ---------------------------------------------------------------------------
core_loop_wd_8:
    lsl         x5, x3, #1                  // x5 = dst row stride in bytes
    lsl         x11, x3, #2
    sub         x11, x11, x12               // x11 = 4*dst_strd - 2*wd (elements)
    lsl         x8, x2, #2
    sub         x8, x8, x12                 // x8 = 4*src_strd - 2*wd (bytes)
    lsr         x4, x12, #3                 // 8-sample blocks per row
    mul         x7, x9, x4                  // x7 = 4 * number of 8x4 blocks
    mov         x4, x12                     // x4 = samples left in this row
    sub         x7, x7, #4                  // one block is retired by the epilog
    cbz         x9, core_loop_wd_8_ht_2     // fewer than 4 rows in total

prolog:
    add         x6, x0, x2                  // x6 -> source row 1
    add         x10, x1, x5                 // x10 -> destination row 1
    ld1         {v8.8b}, [x0], #8           // block 0, rows 0..3
    ld1         {v10.8b}, [x6], x2
    ld1         {v12.8b}, [x6], x2
    ld1         {v14.8b}, [x6], x2
    uxtl        v16.8h, v8.8b               // u8 -> u16
    uxtl        v18.8h, v10.8b
    uxtl        v20.8h, v12.8b
    uxtl        v22.8h, v14.8b
    subs        x4, x4, #8                  // "le" == end of row; the flags
                                            // stay live for the three csel
                                            // instructions below
    shl         v0.8h, v16.8h, #6
    shl         v2.8h, v18.8h, #6
    shl         v4.8h, v20.8h, #6
    shl         v6.8h, v22.8h, #6
    add         x20, x0, x8
    csel        x0, x20, x0, le             // end of row: step 4 source rows
    add         x6, x0, x2
    ld1         {v8.8b}, [x0], #8           // pre-load the next block
    ld1         {v10.8b}, [x6], x2
    ld1         {v12.8b}, [x6], x2
    ld1         {v14.8b}, [x6], x2

    st1         {v0.8h}, [x1], #16          // block 0, row 0
    add         x20, x1, x11, lsl #1
    csel        x1, x20, x1, le             // end of row: step 4 dst rows
    csel        x4, x12, x4, le             // end of row: reload column count

    subs        x7, x7, #4

    blt         epilog_end                  // only one block in total
    beq         epilog                      // exactly two blocks in total

outer_loop_wd_8:
    st1         {v2.8h}, [x10], x5          // previous block, rows 1..3
    uxtl        v16.8h, v8.8b               // widen the pre-loaded block

    st1         {v4.8h}, [x10], x5
    uxtl        v18.8h, v10.8b

    st1         {v6.8h}, [x10], x5
    uxtl        v20.8h, v12.8b

    uxtl        v22.8h, v14.8b

    subs        x4, x4, #8                  // flags live until the csel on x4
    add         x20, x0, x8
    csel        x0, x20, x0, le             // end of row: step 4 source rows

    add         x6, x0, x2

    ld1         {v8.8b}, [x0], #8           // pre-load the following block
    shl         v0.8h, v16.8h, #6

    ld1         {v10.8b}, [x6], x2
    shl         v2.8h, v18.8h, #6

    ld1         {v12.8b}, [x6], x2
    shl         v4.8h, v20.8h, #6

    ld1         {v14.8b}, [x6], x2
    add         x10, x1, x5                 // x10 -> row 1 of the current block

    shl         v6.8h, v22.8h, #6

    st1         {v0.8h}, [x1], #16          // current block, row 0

    add         x20, x1, x11, lsl #1
    csel        x1, x20, x1, le             // end of row: step 4 dst rows
    csel        x4, x12, x4, le             // end of row: reload column count

    subs        x7, x7, #4
    bgt         outer_loop_wd_8

epilog:
    st1         {v2.8h}, [x10], x5          // previous block, rows 1..3
    uxtl        v16.8h, v8.8b               // widen the final block

    st1         {v4.8h}, [x10], x5
    uxtl        v18.8h, v10.8b

    st1         {v6.8h}, [x10], x5
    uxtl        v20.8h, v12.8b

    uxtl        v22.8h, v14.8b

    shl         v0.8h, v16.8h, #6
    shl         v2.8h, v18.8h, #6
    shl         v4.8h, v20.8h, #6
    add         x10, x1, x5                 // x10 -> row 1 of the final block
    shl         v6.8h, v22.8h, #6

    st1         {v0.8h}, [x1], #16          // final block, row 0
epilog_end:
    st1         {v2.8h}, [x10], x5          // final block, rows 1..3
    st1         {v4.8h}, [x10], x5
    st1         {v6.8h}, [x10], x5
    b           end_loops

// 8-wide path, fewer than 4 rows: 8 samples x 2 rows per iteration
core_loop_wd_8_ht_2:
    add         x6, x0, x2                  // x6 -> source row 1
    add         x10, x1, x5                 // x10 -> destination row 1
    ld1         {v8.8b}, [x0], #8           // row 0
    ld1         {v10.8b}, [x6], x2          // row 1
    uxtl        v16.8h, v8.8b
    uxtl        v18.8h, v10.8b
    subs        x12, x12, #8                // samples left; feeds the bgt below
    shl         v0.8h, v16.8h, #6
    shl         v2.8h, v18.8h, #6
    st1         {v0.8h}, [x1], #16          // row 0
    st1         {v2.8h}, [x10], x5          // row 1
    bgt         core_loop_wd_8_ht_2

    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret