///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_w16out_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* //author
//*  yogeswaran rs/ pathiban
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
///**
//*******************************************************************************
//*
//* //brief
//*  inter prediction chroma filter to store vertical 16-bit output
//*
//* //par description:
//*  applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//*  the elements pointed to by 'pu1_src' and writes to the location pointed to
//*  by 'pi2_dst'. no downshifting or clipping is done and the output is used
//*  as an input for weighted prediction.
//*  assumptions : the function is optimized considering the fact that width
//*  is a multiple of 2, 4 or 8, and also considering that height should be a
//*  multiple of 2.
//*  width 4,8 is optimized further
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*****************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert_w16out(uword8 *pu1_src,
//                                         word16 *pi2_dst,
//                                         word32 src_strd,
//                                         word32 dst_strd,
//                                         word8 *pi1_coeff,
//                                         word32 ht,
//                                         word32 wd)
//
//------------------------------------------------------------------------------
// Syntax : GNU as, AArch64.   ABI : AAPCS64.
//
// Arguments on entry
//   x0 = pu1_src     x1 = pi2_dst     w2 = src_strd    w3 = dst_strd
//   x4 = pi1_coeff   w5 = ht          w6 = wd
//
// The four word32 arguments are sign-extended to 64 bits on entry because
// AAPCS64 leaves the upper half of a register holding a 32-bit argument
// unspecified, and every use below is a 64-bit add/compare/multiply.
//
// Working registers after setup
//   x2  = src_strd (bytes between source rows as used by the loads)
//   x4  = ht (row counter in the 2-row paths)
//   x10 = 2*wd  (source bytes consumed per row; 4*wd bytes are written per row)
//   v0..v3 = |coeff[0]| .. |coeff[3]| replicated across 8 byte lanes
//
// Each output is  r1*c1 - r0*c0 + r2*c2 - r3*c3  on four vertically adjacent
// source bytes r0..r3 with ci = |coeff[i]|, accumulated in 16 bits and stored
// without shifting or clipping.
//
// Path selection
//   (wd & 3) != 0            -> outer_loop_wd_2  (4 source bytes x 2 rows per step)
//   (wd & 3) == 0, ht & 7    -> inner_loop_ht_2  (8 source bytes x 2 rows per step)
//   (wd & 3) == 0, !(ht & 7) -> core_loop_ht_8   (8 source bytes x 4 rows per step,
//                                                 software pipelined)
//
// Clobbers: x0-x12, x14, v0-v7, v16-v18, v24, v26, v28, v30, NZCV.
//           x19/x20 are saved and restored (x20 is used as scratch).
//------------------------------------------------------------------------------

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_w16out_av8

.type ihevc_inter_pred_chroma_vert_w16out_av8, %function

ihevc_inter_pred_chroma_vert_w16out_av8:

    stp         x19, x20, [sp, #-16]!       // x20 is callee-saved and used as scratch

    sxtw        x2, w2                      // src_strd : word32 -> 64-bit
    sxtw        x3, w3                      // dst_strd : word32 -> 64-bit
    mov         x12, x4                     // x12 = pi1_coeff
    sxtw        x4, w5                      // x4  = ht
    sxtw        x6, w6                      // x6  = wd

    cmp         x4, #0                      // flags consumed by "ble end_loops"
    sub         x0, x0, x2                  // start one row above: pu1_src - src_strd
    ld1         {v0.8b}, [x12]              // load the filter coefficients
    ble         end_loops                   // ht <= 0 : nothing to do

    tst         x6, #3                      // flags consumed by "bgt outer_loop_wd_2"
    abs         v3.8b, v0.8b                // |coeff|; sign is applied via umlal/umlsl
    lsl         x10, x6, #1                 // x10 = 2*wd
    dup         v0.8b, v3.b[0]              // coeffabs_0
    dup         v1.8b, v3.b[1]              // coeffabs_1
    dup         v2.8b, v3.b[2]              // coeffabs_2
    dup         v3.8b, v3.b[3]              // coeffabs_3
    bgt         outer_loop_wd_2             // (wd & 3) != 0 : narrow path

    tst         x4, #7
    beq         core_loop_ht_8              // ht is a multiple of 8

    lsl         x7, x3, #2                  // 4*dst_strd
    sub         x9, x7, x10, lsl #1         // x9 = 4*dst_strd - 4*wd (dst rewind + 2 rows)
    lsl         x12, x2, #1                 // 2*src_strd
    sub         x8, x12, x10                // x8 = 2*src_strd - 2*wd (src rewind + 2 rows)
    lsl         x3, x3, #1                  // x3 = 2*dst_strd (offset to next dst row)
    mov         x5, x10                     // x5 = source bytes left in this row pair

inner_loop_ht_2:                            // 8 source bytes wide, 2 output rows per pass
    add         x6, x0, x2                  // x6 -> row 1
    ld1         {v17.8b}, [x6], x2          // row 1
    subs        x5, x5, #8                  // flags consumed by first "bgt" below
    ld1         {v5.8b}, [x0], #8           // row 0, advance src by 8
    umull       v6.8h, v17.8b, v1.8b        // out0  = row1 * c1
    ld1         {v4.8b}, [x6], x2           // row 2
    umlsl       v6.8h, v5.8b, v0.8b         // out0 -= row0 * c0
    ld1         {v16.8b}, [x6], x2          // row 3
    umlal       v6.8h, v4.8b, v2.8b         // out0 += row2 * c2
    umull       v4.8h, v4.8b, v1.8b         // out1  = row2 * c1
    ld1         {v18.8b}, [x6]              // row 4
    umlsl       v6.8h, v16.8b, v3.8b        // out0 -= row3 * c3
    umlsl       v4.8h, v17.8b, v0.8b        // out1 -= row1 * c0
    umlal       v4.8h, v16.8b, v2.8b        // out1 += row3 * c2
    umlsl       v4.8h, v18.8b, v3.8b        // out1 -= row4 * c3
    add         x6, x1, x3                  // x6 -> second output row
    st1         {v6.8h}, [x1], #16          // store out0, advance dst by 16 bytes
    st1         {v4.8h}, [x6]               // store out1
    bgt         inner_loop_ht_2             // more columns in this row pair

    subs        x4, x4, #2                  // two rows done
    add         x1, x1, x9                  // dst: back to column 0, two rows down
    mov         x5, x10                     // reload column counter
    add         x0, x0, x8                  // src: back to column 0, two rows down
    bgt         inner_loop_ht_2
    b           end_loops

outer_loop_wd_2:                            // taken when (wd & 3) != 0
    lsl         x5, x3, #2                  // 4*dst_strd
    mov         x12, x10                    // x12 = source bytes left in this row pair
    sub         x9, x5, x10, lsl #1         // x9 = 4*dst_strd - 4*wd
    lsl         x7, x2, #1                  // 2*src_strd
    sub         x8, x7, x10                 // x8 = 2*src_strd - 2*wd

inner_loop_wd_2:                            // 4 source bytes wide, 2 output rows per pass
    add         x6, x0, x2                  // x6 -> row 1
    ld1         {v6.s}[0], [x0]             // v6 = {row0, -}
    subs        x12, x12, #4                // flags consumed by first "bgt" below
    add         x0, x0, #4                  // advance src by 4
    ld1         {v6.s}[1], [x6], x2         // v6 = {row0, row1}
    dup         v7.2s, v6.s[1]
    ld1         {v7.s}[1], [x6], x2         // v7 = {row1, row2}
    umull       v4.8h, v7.8b, v1.8b         // {out0,out1}  = {row1,row2} * c1
    dup         v7.2s, v7.s[1]
    ld1         {v7.s}[1], [x6], x2         // v7 = {row2, row3}
    umlsl       v4.8h, v6.8b, v0.8b         // {out0,out1} -= {row0,row1} * c0
    umlal       v4.8h, v7.8b, v2.8b         // {out0,out1} += {row2,row3} * c2
    dup         v7.2s, v7.s[1]
    ld1         {v7.s}[1], [x6]             // v7 = {row3, row4}
    add         x6, x1, x3, lsl #1          // x6 -> second output row (dst + 2*dst_strd)
    umlsl       v4.8h, v7.8b, v3.8b         // {out0,out1} -= {row3,row4} * c3
    st1         {v4.d}[0], [x1]             // store out0 (4 halfwords)
    add         x1, x1, #8                  // advance dst by 8 bytes
    st1         {v4.d}[1], [x6]             // store out1 (4 halfwords)
    bgt         inner_loop_wd_2             // more columns in this row pair

    subs        x4, x4, #2                  // two rows done
    add         x1, x1, x9                  // dst: back to column 0, two rows down
    mov         x12, x10                    // reload column counter
    add         x0, x0, x8                  // src: back to column 0, two rows down
    bgt         inner_loop_wd_2
    b           end_loops

core_loop_ht_8:                             // (wd & 3) == 0 and (ht & 7) == 0
    lsl         x12, x3, #3                 // 8*dst_strd
    sub         x8, x12, x10, lsl #1        // x8 = 8*dst_strd - 4*wd (dst rewind + 4 rows)
    lsl         x12, x2, #2                 // 4*src_strd
    sub         x9, x12, x10                // x9 = 4*src_strd - 2*wd (src rewind + 4 rows)
    bic         x5, x10, #7                 // x5 = (2*wd) & ~7, column byte counter
    lsr         x14, x10, #3                // (2*wd) / 8
    mul         x12, x4, x14                // ht * (2*wd / 8)
    sub         x12, x12, #4                // x12 drops by 4 per 4-row x 8-byte tile;
                                            // the last tile is finished by the epilog
    lsl         x3, x3, #1                  // x3 = 2*dst_strd (offset to next dst row)

prolog:                                     // first tile: compute all four rows,
                                            // store two, preload the next tile
    add         x6, x0, x2                  // x6 -> row 1
    ld1         {v5.8b}, [x6], x2           // row 1
    subs        x5, x5, #8                  // "le" below == this row of tiles is finished
    ld1         {v4.8b}, [x0], #8           // row 0, advance src by 8
    ld1         {v6.8b}, [x6], x2           // row 2
    umull       v30.8h, v5.8b, v1.8b        // out0  = row1 * c1
    ld1         {v7.8b}, [x6], x2           // row 3
    umlsl       v30.8h, v4.8b, v0.8b        // out0 -= row0 * c0
    add         x7, x1, x3                  // x7 -> output row 1
    umlal       v30.8h, v6.8b, v2.8b        // out0 += row2 * c2
    umlsl       v30.8h, v7.8b, v3.8b        // out0 -= row3 * c3
    ld1         {v16.8b}, [x6], x2          // row 4
    umull       v28.8h, v6.8b, v1.8b        // out1  = row2 * c1
    add         x20, x0, x9
    csel        x0, x20, x0, le             // row finished: src to column 0, 4 rows down
    umlsl       v28.8h, v5.8b, v0.8b        // out1 -= row1 * c0
    bic         x20, x10, #7
    csel        x5, x20, x5, le             // row finished: reload column counter
    umlal       v28.8h, v7.8b, v2.8b        // out1 += row3 * c2
    ld1         {v17.8b}, [x6], x2          // row 5
    umlsl       v28.8h, v16.8b, v3.8b       // out1 -= row4 * c3
    ld1         {v18.8b}, [x6], x2          // row 6
    umull       v26.8h, v7.8b, v1.8b        // out2  = row3 * c1
    add         x6, x0, x2                  // x6 -> row 1 of the next tile
    umlsl       v26.8h, v6.8b, v0.8b        // out2 -= row2 * c0
    st1         {v30.16b}, [x1], #16        // store out0, advance dst by 16 bytes
    umlal       v26.8h, v16.8b, v2.8b       // out2 += row4 * c2
    ld1         {v4.8b}, [x0], #8           // next tile: row 0
    umlsl       v26.8h, v17.8b, v3.8b       // out2 -= row5 * c3
    add         x20, x1, x8
    csel        x1, x20, x1, le             // row finished: dst to column 0, 4 rows down
    umull       v24.8h, v16.8b, v1.8b       // out3  = row4 * c1
    ld1         {v5.8b}, [x6], x2           // next tile: row 1
    umlsl       v24.8h, v7.8b, v0.8b        // out3 -= row3 * c0
    subs        x12, x12, #4                // flags consumed by "ble epilog"
    ld1         {v6.8b}, [x6], x2           // next tile: row 2
    umlal       v24.8h, v17.8b, v2.8b       // out3 += row5 * c2
    ld1         {v7.8b}, [x6], x2           // next tile: row 3
    umlsl       v24.8h, v18.8b, v3.8b       // out3 -= row6 * c3
    sub         x20, x2, x2, lsl #3         // -7*src_strd
    neg         x11, x20                    // x11 = 7*src_strd, prefetch offset
    add         x14, x2, x2, lsl #1         // 3*src_strd
    add         x14, x14, x11               // x14 = 10*src_strd, prefetch offset limit
    st1         {v28.16b}, [x7], x3         // store out1, step to output row 2
    ble         epilog                      // only one tile pending: skip the kernel

kernel_8:                                   // steady state: finish the previous tile's
                                            // stores while computing the current tile
    umull       v30.8h, v5.8b, v1.8b        // out0  = row1 * c1
    subs        x5, x5, #8                  // "le" below == this row of tiles is finished
    umlsl       v30.8h, v4.8b, v0.8b        // out0 -= row0 * c0
    add         x20, x0, x9
    csel        x0, x20, x0, le             // row finished: src to column 0, 4 rows down
    umlal       v30.8h, v6.8b, v2.8b        // out0 += row2 * c2
    lsl         x20, x2, #3
    sub         x20, x20, x2                // 7*src_strd
    csel        x11, x20, x11, le           // row finished: restart prefetch offset
    umlsl       v30.8h, v7.8b, v3.8b        // out0 -= row3 * c3
    st1         {v26.16b}, [x7], x3         // previous tile: store out2
    ld1         {v16.8b}, [x6], x2          // row 4
    umull       v28.8h, v6.8b, v1.8b        // out1  = row2 * c1
    bic         x20, x10, #7
    csel        x5, x20, x5, le             // row finished: reload column counter
    umlsl       v28.8h, v5.8b, v0.8b        // out1 -= row1 * c0
    st1         {v24.16b}, [x7], x3         // previous tile: store out3
    umlal       v28.8h, v7.8b, v2.8b        // out1 += row3 * c2
    ld1         {v17.8b}, [x6], x2          // row 5
    umlsl       v28.8h, v16.8b, v3.8b       // out1 -= row4 * c3
    ld1         {v18.8b}, [x6], x2          // row 6
    add         x7, x1, x3                  // x7 -> output row 1 of the current tile
    umull       v26.8h, v7.8b, v1.8b        // out2  = row3 * c1
    add         x6, x0, x2                  // x6 -> row 1 of the next tile
    add         x20, x0, x11
    prfm        PLDL1KEEP, [x20]            // prefetch source at src + x11
    umlsl       v26.8h, v6.8b, v0.8b        // out2 -= row2 * c0
    ld1         {v4.8b}, [x0], #8           // next tile: row 0
    add         x11, x11, x2                // move prefetch offset one row further
    umlal       v26.8h, v16.8b, v2.8b       // out2 += row4 * c2
    st1         {v30.16b}, [x1], #16        // store out0, advance dst by 16 bytes
    umlsl       v26.8h, v17.8b, v3.8b       // out2 -= row5 * c3
    ld1         {v5.8b}, [x6], x2           // next tile: row 1
    umull       v24.8h, v16.8b, v1.8b       // out3  = row4 * c1
    ld1         {v6.8b}, [x6], x2           // next tile: row 2
    add         x20, x1, x8
    csel        x1, x20, x1, le             // row finished (flags still from subs x5)
    cmp         x11, x14
    lsl         x20, x2, #3
    sub         x20, x20, x2                // 7*src_strd
    csel        x11, x20, x11, gt           // prefetch offset past limit: restart it
    umlsl       v24.8h, v7.8b, v0.8b        // out3 -= row3 * c0
    subs        x12, x12, #4                // flags consumed by "bgt kernel_8"
    umlal       v24.8h, v17.8b, v2.8b       // out3 += row5 * c2
    ld1         {v7.8b}, [x6], x2           // next tile: row 3
    umlsl       v24.8h, v18.8b, v3.8b       // out3 -= row6 * c3
    st1         {v28.16b}, [x7], x3         // store out1, step to output row 2
    bgt         kernel_8

epilog:                                     // drain: finish the previous tile's stores
                                            // and compute/store the last preloaded tile
    umull       v30.8h, v5.8b, v1.8b        // out0  = row1 * c1
    umlsl       v30.8h, v4.8b, v0.8b        // out0 -= row0 * c0
    umlal       v30.8h, v6.8b, v2.8b        // out0 += row2 * c2
    umlsl       v30.8h, v7.8b, v3.8b        // out0 -= row3 * c3
    st1         {v26.16b}, [x7], x3         // previous tile: store out2
    ld1         {v16.8b}, [x6], x2          // row 4
    umull       v28.8h, v6.8b, v1.8b        // out1  = row2 * c1
    umlsl       v28.8h, v5.8b, v0.8b        // out1 -= row1 * c0
    umlal       v28.8h, v7.8b, v2.8b        // out1 += row3 * c2
    umlsl       v28.8h, v16.8b, v3.8b       // out1 -= row4 * c3
    st1         {v24.16b}, [x7], x3         // previous tile: store out3
    ld1         {v17.8b}, [x6], x2          // row 5
    umull       v26.8h, v7.8b, v1.8b        // out2  = row3 * c1
    add         x7, x1, x3                  // x7 -> output row 1 of the last tile
    umlsl       v26.8h, v6.8b, v0.8b        // out2 -= row2 * c0
    st1         {v30.16b}, [x1], #16        // store out0
    umlal       v26.8h, v16.8b, v2.8b       // out2 += row4 * c2
    ld1         {v18.8b}, [x6], x2          // row 6
    umlsl       v26.8h, v17.8b, v3.8b       // out2 -= row5 * c3
    umull       v24.8h, v16.8b, v1.8b       // out3  = row4 * c1
    st1         {v28.16b}, [x7], x3         // store out1
    umlsl       v24.8h, v7.8b, v0.8b        // out3 -= row3 * c0
    umlal       v24.8h, v17.8b, v2.8b       // out3 += row5 * c2
    st1         {v26.16b}, [x7], x3         // store out2
    umlsl       v24.8h, v18.8b, v3.8b       // out3 -= row6 * c3
    st1         {v24.16b}, [x7], x3         // store out3

end_loops:
    ldp         x19, x20, [sp], #16         // restore callee-saved registers
    ret