//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//*
//* @author
//*  Mohit
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_luma_horz_hpel_vert_qpel_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
///* All the functions here are replicated from ih264_inter_pred_filters.c
//
///**
///**
///**
//*******************************************************************************
//*
//* @brief
//*   This function implements a two stage cascaded six tap filter. It
//*    applies the six tap filter in the horizontal direction on the
//*    predictor values, followed by applying the same filter in the
//*    vertical direction on the output of the first stage. It then averages
//*    the output of the 1st stage and the output of the 2nd stage to obtain
//*    the quarter pel values.
//*    The six tap filtering operation is described
//*    in sec 8.4.2.2.1 titled "Luma sample interpolation process".
//*
//* @par Description:
//*    This function is called to obtain pixels lying at the following
//*    location (1/2,1/4) or (1/2,3/4). The function interpolates
//*    the predictors first in the horizontal direction and then in the
//*    vertical direction to output the (1/2,1/2). It then averages
//*    the output of the 1st stage (the horizontal half pel row selected
//*    by the offset) and the (1/2,1/2) value to obtain (1/2,1/4)
//*    or (1/2,3/4) depending on the offset.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ht
//*  integer height of the array
//*  (rows are produced four at a time until the count is <= 0)
//*
//* @param[in] wd
//*  integer width of the array
//*  (4 and 8 have dedicated paths; any other value takes the 16 wide path)
//*
//* @param[in] pu1_tmp: temporary buffer
//*  (scratch; written and read back here in rows of 48 bytes holding the
//*   unrounded 16 bit results of the horizontal filter)
//*
//* @param[in] dydx: x and y reference offset for qpel calculations
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
//                                UWORD8 *pu1_dst,
//                                WORD32 src_strd,
//                                WORD32 dst_strd,
//                                WORD32 ht,
//                                WORD32 wd,
//                                UWORD8* pu1_tmp,
//                                UWORD32 dydx)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  ht
//    x5 =>  wd
//    x6 => *pu1_tmp on entry; holds the temp buffer row stride (48) afterwards
//    x7 =>  dydx on entry; temp buffer read pointer afterwards
//    x9 => *pu1_tmp (temp buffer write pointer)
//    16 wide path only:
//    x8/x14/x10/x11 => src/dst/temp-write/temp-read pointers of the high half
//    x12 => row counter of the high half

.text
.p2align 2
.include "ih264_neon_macros.s"

//------------------------------------------------------------------------------
// hpel_vqpel_prime_8wide src, tmp
//
// Runs the horizontal six tap filter a0 + a5 + 20*(a2 + a3) - 5*(a1 + a4) on
// five consecutive source rows (rows -2 .. 2), eight output pixels per row.
//   src : source row pointer, post-incremented by x2 after every row
//   tmp : temp buffer write pointer, post-incremented by x6 after every store
// Expects v22 = 20 and v24 = 5 in every 16 bit lane.
// On exit v6, v8, v10, v12, v14 hold the unrounded 16 bit results of rows
// -2, -1, 0, 1, 2; rows -2 .. 1 have been stored to the temp buffer (row 2 is
// stored by the first instruction group of hpel_vqpel_4rows_8wide).
// Clobbers v0-v5 and v16.
//------------------------------------------------------------------------------
.macro hpel_vqpel_prime_8wide src, tmp
    ld1       {v0.2s, v1.2s}, [\src], x2      // row -2 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v6.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v8.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v6.8h, v8.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v8.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [\src], x2      // row -1 load for horizontal filter
    mls       v6.8h, v8.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v8.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v10.8h, v2.8b, v3.8b
    st1       {v6.4s}, [\tmp], x6             // store temp buffer 0
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v8.8h, v10.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v10.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [\src], x2      // row 0 load for horizontal filter
    mls       v8.8h, v10.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v10.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v12.8h, v2.8b, v3.8b
    st1       {v8.4s}, [\tmp], x6             // store temp buffer 1
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v10.8h, v12.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v12.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [\src], x2      // row 1 load for horizontal filter
    mls       v10.8h, v12.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v12.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v14.8h, v2.8b, v3.8b
    st1       {v10.4s}, [\tmp], x6            // store temp buffer 2
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v12.8h, v14.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v14.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [\src], x2      // row 2 load for horizontal filter
    mls       v12.8h, v14.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v14.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v16.8h, v2.8b, v3.8b
    st1       {v12.4s}, [\tmp], x6            // store temp buffer 3
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v14.8h, v16.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v16.8h, v1.8b, v4.8b
    mls       v14.8h, v16.8h, v24.8h
.endm

//------------------------------------------------------------------------------
// hpel_vqpel_4rows_8wide src, dst, tmp_st, tmp_ld, cnt
//
// One pass of the 8 pixel wide loop: horizontally filters four more source
// rows, runs the vertical six tap filter over the 16 bit row results, rounds
// ((x + 512) >> 10, saturated to 8 bit) and averages each output row with the
// rounded horizontal result ((x + 16) >> 5) reloaded from the temp buffer.
//   src    : source row pointer (post-incremented by x2)
//   dst    : destination row pointer (post-incremented by x3), 8 bytes per row
//   tmp_st : temp buffer write pointer (stride x6)
//   tmp_ld : temp buffer read pointer (stride x6)
//   cnt    : remaining row count; decremented by 4, flags are left set for a
//            following conditional branch (nothing after the subs alters them)
// Expects v22 = 20, v24 = 5 and v6..v14 (even registers) holding the five
// newest horizontal results, oldest in v6; the same invariant holds on exit.
// The final temp store does not advance tmp_st: the same row is stored again,
// with post-increment, at the top of the next pass.
// NOTE(review): the moves into v7, v9, v11, v13 and v15 near the end are never
// read by the filter arithmetic; they look like leftovers of an ARMv7
// q-register port and are kept only to leave the instruction stream unchanged.
//------------------------------------------------------------------------------
.macro hpel_vqpel_4rows_8wide src, dst, tmp_st, tmp_ld, cnt
    ld1       {v0.2s, v1.2s}, [\src], x2      // row 3 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v16.8h, v0.8b, v5.8b
    st1       {v14.4s}, [\tmp_st], x6         // store temp buffer 4
    uaddl     v18.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v16.8h, v18.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    add       v28.8h, v8.8h, v14.8h           // vertical taps weighted by -5
    uaddl     v18.8h, v1.8b, v4.8b
    add       v30.8h, v10.8h, v12.8h          // vertical taps weighted by 20
    mls       v16.8h, v18.8h, v24.8h
    ld1       {v0.2s, v1.2s}, [\src], x2      // row 4 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v20.8h, v0.8b, v5.8b
    st1       {v16.4s}, [\tmp_st], x6         // store temp buffer 5
    saddl     v18.4s, v6.4h, v16.4h
    ld1       {v26.4s}, [\tmp_ld], x6         // load from temp buffer 0
    saddl2    v6.4s, v6.8h, v16.8h
    sqrshrun  v26.8b, v26.8h, #5              // rounded horizontal half pel row
    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v28.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v28.8h, v24.8h
    uaddl     v2.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v20.8h, v2.8h, v22.8h
    sqrshrun  v18.4h, v18.4s, #10
    ext       v1.8b, v0.8b, v1.8b, #1
    sqrshrun  v19.4h, v6.4s, #10
    add       v28.8h, v10.8h, v16.8h
    uaddl     v2.8h, v1.8b, v4.8b
    add       v30.8h, v12.8h, v14.8h
    mls       v20.8h, v2.8h, v24.8h
    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.2s[1], v19.2s[0]            // join both 4 pixel halves
    ld1       {v0.2s, v1.2s}, [\src], x2      // row 5 load for horizontal filter
    urhadd    v26.8b, v18.8b, v26.8b
    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    st1       {v20.4s}, [\tmp_st], x6         // store temp buffer 6
    saddl     v18.4s, v8.4h, v20.4h
    saddl2    v6.4s, v8.8h, v20.8h
    ld1       {v8.4s}, [\tmp_ld], x6          // load from temp buffer 1
    st1       {v26.2s}, [\dst], x3            // store row 0
    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v28.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v28.8h, v24.8h
    sqrshrun  v28.8b, v8.8h, #5
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v8.8h, v0.8b, v5.8b
    uaddl     v2.8h, v2.8b, v3.8b
    sqrshrun  v18.4h, v18.4s, #10
    ext       v4.8b, v0.8b, v1.8b, #4
    sqrshrun  v19.4h, v6.4s, #10
    mla       v8.8h, v2.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    add       v26.8h, v12.8h, v20.8h
    uaddl     v2.8h, v1.8b, v4.8b
    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.2s[1], v19.2s[0]
    add       v30.8h, v14.8h, v16.8h
    mls       v8.8h, v2.8h, v24.8h
    ld1       {v0.2s, v1.2s}, [\src], x2      // row 6 load for horizontal filter
    urhadd    v28.8b, v28.8b, v18.8b
    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    st1       {v28.2s}, [\dst], x3            // store row 1
    uaddl     v28.8h, v0.8b, v5.8b
    st1       {v8.4s}, [\tmp_st], x6          // store temp buffer 7
    saddl     v18.4s, v10.4h, v8.4h
    saddl2    v6.4s, v10.8h, v8.8h
    ld1       {v10.4s}, [\tmp_ld], x6         // load from temp buffer 2
    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v26.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v26.8h, v24.8h
    sqrshrun  v26.8b, v10.8h, #5
    uaddl     v2.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v28.8h, v2.8h, v22.8h
    sqrshrun  v18.4h, v18.4s, #10
    ext       v1.8b, v0.8b, v1.8b, #1
    sqrshrun  v19.4h, v6.4s, #10
    add       v10.8h, v14.8h, v8.8h
    uaddl     v2.8h, v1.8b, v4.8b
    add       v30.8h, v16.8h, v20.8h
    mls       v28.8h, v2.8h, v24.8h
    uqxtn     v27.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v27.2s[1], v19.2s[0]
    saddl     v18.4s, v12.4h, v28.4h
    saddl2    v6.4s, v12.8h, v28.8h
    urhadd    v26.8b, v26.8b, v27.8b
    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v10.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v10.8h, v24.8h
    st1       {v26.2s}, [\dst], x3            // store row 2
    st1       {v28.4s}, [\tmp_st]             // all eight results of row 6, pointer not advanced
    sqrshrun  v18.4h, v18.4s, #10
    mov       v10.16b, v20.16b                // rotate: row 4 becomes "row 0"
    mov       v11.16b, v21.16b
    ld1       {v30.4s}, [\tmp_ld], x6         // load from temp buffer 3
    sqrshrun  v19.4h, v6.4s, #10
    subs      \cnt, \cnt, #4                  // four rows done; flags consumed by the caller's bgt
    sqrshrun  v30.8b, v30.8h, #5
    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.2s[1], v19.2s[0]
    mov       v12.16b, v8.16b                 // row 5 becomes "row 1"
    mov       v13.16b, v9.16b
    mov       v6.16b, v14.16b                 // row 2 becomes "row -2"
    mov       v7.16b, v15.16b
    urhadd    v30.8b, v18.8b, v30.8b
    mov       v8.16b, v16.16b                 // row 3 becomes "row -1"
    mov       v9.16b, v17.16b
    mov       v14.16b, v28.16b                // row 6 becomes "row 2"
    mov       v15.16b, v29.16b
    st1       {v30.2s}, [\dst], x3            // store row 3
.endm

    .global ih264_inter_pred_luma_horz_hpel_vert_qpel_av8

ih264_inter_pred_luma_horz_hpel_vert_qpel_av8:

    // store register values to stack
    push_v_regs
    stp       x19, x20, [sp, #-16]!

    // src_strd, dst_strd, ht and wd are 32 bit C arguments. AAPCS64 leaves the
    // upper 32 bits of their registers unspecified, and they are used below as
    // 64 bit address strides and loop counters, so widen them first.
    sxtw      x2, w2
    sxtw      x3, w3
    sxtw      x4, w4
    sxtw      x5, w5

    sub       x0, x0, x2, lsl #1              // pu1_src-2*src_strd
    sub       x0, x0, #2                      // pu1_src-2

    mov       x9, x6                          // x9 = pu1_tmp (temp buffer write pointer)

    // dydx >> 3 yields the deciding bit (original note: dydx >> 2 followed by
    // dydx & 0x3 and dydx >> 1). The 32 bit form also zero-extends into x7, so
    // stale upper bits of the UWORD32 argument cannot leak into the address.
    lsr       w7, w7, #3
    add       x7, x7, #2
    mov       x6, #48                         // temp buffer row stride in bytes
    madd      x7, x7, x6, x9                  // x7 = pu1_tmp + ((dydx >> 3) + 2) * 48

    cmp       x5, #4                          // if wd=4 branch to loop_4
    beq       loop_4_start

    cmp       x5, #8                          // if wd=8 branch to loop_8
    beq       loop_8_start

    // when wd=16: two 8 pixel wide halves, processed one after the other
    movi      v22.8h, #20                     // Filter coeff 20 (0x14) into v22
    movi      v24.8h, #5                      // Filter coeff 5 into v24
    add       x8, x0, #8                      // high half source
    add       x14, x1, #8                     // high half destination
    add       x10, x9, #8                     // high half temp write pointer (+8 bytes)
    mov       x12, x4                         // high half row counter
    add       x11, x7, #8                     // high half temp read pointer (+8 bytes)

loop_16_lowhalf_start:
    hpel_vqpel_prime_8wide x0, x9

loop_16_lowhalf:
    hpel_vqpel_4rows_8wide x0, x1, x9, x7, x4
    bgt       loop_16_lowhalf                 // looping if height =16

loop_16_highhalf_start:
    hpel_vqpel_prime_8wide x8, x10

loop_16_highhalf:
    hpel_vqpel_4rows_8wide x8, x14, x10, x11, x12
    bgt       loop_16_highhalf                // looping if height = 8 or 16
    b         end_func

loop_8_start:
    movi      v22.8h, #0x14                   // Filter coeff 20 into v22
    movi      v24.8h, #5                      // Filter coeff 5 into v24
    hpel_vqpel_prime_8wide x0, x9

loop_8:
    hpel_vqpel_4rows_8wide x0, x1, x9, x7, x4
    bgt       loop_8                          // if height =8 or 16 loop
    b         end_func

    // wd=4: same scheme on four pixels per row. The 16 bit arithmetic uses
    // .4h lanes and every temp buffer row access moves 8 bytes.
loop_4_start:
    movi      v22.8h, #20                     // Filter coeff 20 into v22
    movi      v23.8h, #5                      // Filter coeff 5 into v23

    ld1       {v0.2s, v1.2s}, [x0], x2        // row -2 load
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v6.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v8.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v6.4h, v8.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v8.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2        // row -1 load
    mls       v6.4h, v8.4h, v23.4h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v8.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v10.8h, v2.8b, v3.8b
    st1       {v6.2s}, [x9], x6               // store temp buffer 0
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v8.4h, v10.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v10.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2        // row 0 load
    mls       v8.4h, v10.4h, v23.4h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v10.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v12.8h, v2.8b, v3.8b
    st1       {v8.2s}, [x9], x6               // store temp buffer 1
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v10.4h, v12.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v12.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2        // row 1 load
    mls       v10.4h, v12.4h, v23.4h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v12.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v14.8h, v2.8b, v3.8b
    st1       {v10.2s}, [x9], x6              // store temp buffer 2
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v12.4h, v14.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v14.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2        // row 2 load
    mls       v12.4h, v14.4h, v23.4h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v14.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v16.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v14.4h, v16.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v16.8h, v1.8b, v4.8b
    st1       {v12.2s}, [x9], x6              // store temp buffer 3
    mls       v14.4h, v16.4h, v23.4h

loop_4:
    ld1       {v0.2s, v1.2s}, [x0], x2        // row 3 load
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v16.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v18.8h, v2.8b, v3.8b
    st1       {v14.2s}, [x9], x6              // store temp buffer 4
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v16.4h, v18.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v18.8h, v1.8b, v4.8b
    add       v2.4h, v10.4h, v12.4h           // vertical taps weighted by 20
    mls       v16.4h, v18.4h, v23.4h
    add       v3.4h, v8.4h, v14.4h            // vertical taps weighted by -5
    ld1       {v18.2s, v19.2s}, [x0], x2      // row 4 load
    ext       v25.8b, v18.8b, v19.8b, #5
    uaddl     v26.8h, v18.8b, v25.8b
    ext       v20.8b, v18.8b, v19.8b, #2
    st1       {v16.2s}, [x9], x6              // store temp buffer 5
    saddl     v0.4s, v6.4h, v16.4h
    smlal     v0.4s, v2.4h, v22.4h
    ext       v21.8b, v18.8b, v19.8b, #3
    uaddl     v28.8h, v20.8b, v21.8b
    ext       v24.8b, v18.8b, v19.8b, #4
    smlsl     v0.4s, v3.4h, v23.4h
    mla       v26.4h, v28.4h, v22.4h
    ext       v19.8b, v18.8b, v19.8b, #1
    uaddl     v28.8h, v19.8b, v24.8b
    add       v2.4h, v12.4h, v14.4h
    mls       v26.4h, v28.4h, v23.4h
    sqrshrun  v0.4h, v0.4s, #0xa
    add       v3.4h, v10.4h, v16.4h
    ld1       {v18.2s, v19.2s}, [x0], x2      // row 5 load
    ext       v25.8b, v18.8b, v19.8b, #5
    uqxtn     v11.8b, v0.8h
    uaddl     v28.8h, v18.8b, v25.8b
    st1       {v26.2s}, [x9], x6              // store temp buffer 6

    // (original note: Q3 available here)
    ld1       {v6.2s}, [x7], x6               // load from temp buffer 0
    ld1       {v7.2s}, [x7], x6               // load from temp buffer 1

    sqrshrun  v9.8b, v6.8h, #5
    sqrshrun  v7.8b, v7.8h, #5
    mov       v9.2s[1], v7.2s[0]              // pack two 4 pixel rows into v9

    ext       v20.8b, v18.8b, v19.8b, #2
    saddl     v0.4s, v8.4h, v26.4h
    smlal     v0.4s, v2.4h, v22.4h
    ext       v21.8b, v18.8b, v19.8b, #3
    uaddl     v6.8h, v20.8b, v21.8b
    ext       v24.8b, v18.8b, v19.8b, #4
    smlsl     v0.4s, v3.4h, v23.4h
    mla       v28.4h, v6.4h, v22.4h
    ext       v19.8b, v18.8b, v19.8b, #1
    uaddl     v6.8h, v19.8b, v24.8b
    add       v2.4h, v14.4h, v16.4h
    mls       v28.4h, v6.4h, v23.4h
    sqrshrun  v0.4h, v0.4s, #0xa
    add       v3.4h, v12.4h, v26.4h
    ld1       {v18.2s, v19.2s}, [x0], x2      // row 6 load
    ext       v25.8b, v18.8b, v19.8b, #5
    uqxtn     v13.8b, v0.8h

    trn1      v11.2s, v11.2s, v13.2s          // v11 = vertical results of rows 0 and 1
    trn2      v13.2s, v11.2s, v13.2s
    saddl     v0.4s, v10.4h, v28.4h
    urhadd    v9.8b, v9.8b, v11.8b

    st1       {v28.2s}, [x9], x6              // store temp buffer 7
    smlal     v0.4s, v2.4h, v22.4h
    uaddl     v30.8h, v18.8b, v25.8b

    st1       {v9.s}[0], [x1], x3             // store row 0

    ext       v20.8b, v18.8b, v19.8b, #2

    st1       {v9.s}[1], [x1], x3             // store row 1

    ext       v21.8b, v18.8b, v19.8b, #3
    smlsl     v0.4s, v3.4h, v23.4h
    uaddl     v8.8h, v20.8b, v21.8b
    ext       v24.8b, v18.8b, v19.8b, #4
    mla       v30.4h, v8.4h, v22.4h
    ext       v19.8b, v18.8b, v19.8b, #1
    uaddl     v8.8h, v19.8b, v24.8b
    sqrshrun  v0.4h, v0.4s, #0xa
    add       v2.4h, v16.4h, v26.4h
    mls       v30.4h, v8.4h, v23.4h
    uqxtn     v4.8b, v0.8h

    add       v3.4h, v14.4h, v28.4h
    saddl     v0.4s, v12.4h, v30.4h
    st1       {v30.2s}, [x9]                  // row 6; x9 not advanced, stored again next pass
    smlal     v0.4s, v2.4h, v22.4h
    ld1       {v8.2s}, [x7], x6               // load from temp buffer 2
    ld1       {v9.2s}, [x7], x6               // load from temp buffer 3
    smlsl     v0.4s, v3.4h, v23.4h
    subs      x4, x4, #4                      // four rows done; flags consumed by bgt below

    sqrshrun  v10.8b, v8.8h, #5
    sqrshrun  v9.8b, v9.8h, #5
    mov       v10.2s[1], v9.2s[0]

    mov       v12.8b, v28.8b                  // row 5 becomes "row 1"

    sqrshrun  v0.4h, v0.4s, #0xa
    mov       v6.8b, v14.8b                   // row 2 becomes "row -2"
    mov       v8.8b, v16.8b                   // row 3 becomes "row -1"

    uqxtn     v5.8b, v0.8h

    trn1      v4.2s, v4.2s, v5.2s             // v4 = vertical results of rows 2 and 3
    trn2      v5.2s, v4.2s, v5.2s
    urhadd    v4.8b, v4.8b, v10.8b
    mov       v10.8b, v26.8b                  // row 4 becomes "row 0"
    mov       v14.8b, v30.8b                  // row 6 becomes "row 2"

    st1       {v4.s}[0], [x1], x3             // store row 2
    st1       {v4.s}[1], [x1], x3             // store row 3

    bgt       loop_4

end_func:
    // Restoring registers from stack
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret