//****************************************************************************** //* //* Copyright (C) 2015 The Android Open Source Project //* //* Licensed under the Apache License, Version 2.0 (the "License"); //* you may not use this file except in compliance with the License. //* You may obtain a copy of the License at: //* //* http://www.apache.org/licenses/LICENSE-2.0 //* //* Unless required by applicable law or agreed to in writing, software //* distributed under the License is distributed on an "AS IS" BASIS, //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //* See the License for the specific language governing permissions and //* limitations under the License. //* //***************************************************************************** //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore //*/ ///** //****************************************************************************** //* @file //* ih264_inter_pred_luma_horz_av8.s //* //* @brief //* Contains function definitions for inter prediction interpolation. //* //* @author //* Ittiam //* //* @par List of Functions: //* //* - ih264_inter_pred_luma_horz_av8() //* //* @remarks //* None //* //******************************************************************************* //*/ ///* All the functions here are replicated from ih264_inter_pred_filters.c // ///** ///** //******************************************************************************* //* //* @brief //* Interprediction luma filter for horizontal input //* //* @par Description: //* Applies a 6 tap horizontal filter .The output is clipped to 8 bits //* sec 8.4.2.2.1 titled "Luma sample interpolation process" //* //* @param[in] pu1_src //* UWORD8 pointer to the source //* //* @param[out] pu1_dst //* UWORD8 pointer to the destination //* //* @param[in] src_strd //* integer source stride //* //* @param[in] dst_strd //* integer destination stride //* //* @param[in] ht //* integer height of the array //* //* @param[in] wd //* integer width of the array //* //* @returns //* // @remarks //* None //* //******************************************************************************* //*/ //void ih264_inter_pred_luma_horz ( // UWORD8 *pu1_src, // UWORD8 *pu1_dst, // WORD32 src_strd, // WORD32 dst_strd, // WORD32 ht, // WORD32 wd ) //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst // x2 => src_strd // x3 => dst_strd // x4 => ht // x5 => wd .text .p2align 2 .include "ih264_neon_macros.s" .global ih264_inter_pred_luma_horz_av8 ih264_inter_pred_luma_horz_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! sub x0, x0, #2 //pu1_src-2 sub x14, x4, #16 movi v0.8b, #5 //filter coeff subs x12, x5, #8 //if wd=8 branch to loop_8 movi v1.8b, #20 //filter coeff beq loop_8 subs x12, x5, #4 //if wd=4 branch to loop_4 beq loop_4 loop_16: //when wd=16 //// Processing row0 and row1 ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0 add x14, x14, #1 //for checking loop ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row0) ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1 ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row0) uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0) ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row1) uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row0) ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row1) uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1) ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row0) uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row1) ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row0) umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row1) umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row1) umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1) ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row0) umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row1) ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row0) umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row1) umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row1) umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1) ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row0) umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row1) ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row0) umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row1) umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row1) umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row0) umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1) ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row0) umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row1) umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row1) umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row2 umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1) sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row3 sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2) st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row0 sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row2) sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1) //// Processing row2 and row3 ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3) uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2) st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row1 uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row2) ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row3) uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3) ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2) uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row3) ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row2) umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2) ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row3) umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row2) ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3) umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3) ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2) umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row3) ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row2) umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2) ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row3) umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row2) ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row3) umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3) ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2) umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row3) ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row2) umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2) ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3) umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row2) ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row3) umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3) ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2) umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row3) ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row2) umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2) ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3) umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row2) ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row3) umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3) ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row4 umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row3) sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2) ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row5 sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row2) ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row4) st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row2 sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3) ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row4) sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row3) //// Processing row4 and row5 ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row5) uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row4) st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row3 uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row4) ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row5) uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row5) ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row4) uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row5) ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row4) umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row4) ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row5) umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row4) ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row5) umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row5) ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row4) umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row5) ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row4) umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row4) ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row5) umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row4) ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row5) umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row5) ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row4) umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row5) ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row4) umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4) ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row5) umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row4) ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row5) umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4) ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row4) umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row5) ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row4) umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4) ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row5) umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row4) ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row5) umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5) ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row6 umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row5) sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4) ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row7 sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row4) ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row6) st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row2 sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5) ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row6) sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row5) //// Processing row6 and row7 ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row7) uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row6) st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row5 uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row6) ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row7) uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row7) ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row6) uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row7) ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row6) umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row6) ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row7) umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row6) ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row7) umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row7) ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row6) umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row7) ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row6) umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row6) ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row7) umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row6) ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row7) umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row7) ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row6) umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row7) ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row6) umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6) ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row7) umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row6) ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row7) umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6) ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row6) umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row7) ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row6) umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6) ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row7) umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row6) ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row6) sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6) umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7) sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row6) umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row7) sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7) st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row6 sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row7) subs x12, x14, #1 // if height==16 - looping st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row7 beq loop_16 b end_func loop_8: //// Processing row0 and row1 ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row1 add x14, x14, #1 //for checking loop ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row1) ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row0 ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row1) ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row0) ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row1) ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row1) ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row1) uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1) ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row0) umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1) umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1) umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row0) uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0) ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row0) ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row0) ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row2 umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3 sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) //// Processing row2 and row3 ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3) ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3) ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2) uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3) st1 {v23.8b}, [x1], x3 ////Store dest row0 ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row2) ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3) sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3) ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2) umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3) umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3) umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3) umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3) st1 {v20.8b}, [x1], x3 ////Store dest row1 uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2) ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2) ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2) ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2) ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row4 umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2) umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2) umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2) umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2) ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3 subs x9, x4, #4 sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3) ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row5) ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row5) ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row4) uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row5) ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row5) sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2) ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row5) ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row4) st1 {v20.8b}, [x1], x3 ////Store dest row2 ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row4) uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row4) st1 {v23.8b}, [x1], x3 ////Store dest row3 beq end_func // Branch if height==4 //// Processing row4 and row5 ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row5) umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row5) umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row5) umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row5) umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5) ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row4) ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row4) ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row6 umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row4) umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row4) umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4) umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4) sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5) ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row7 ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row6) ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row7) ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row7) uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row7) ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row7) ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row7) sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4) ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row6) ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row6) st1 {v20.8b}, [x1], x3 ////Store dest row4 ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row6) uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row6) ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row6) umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row6) umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row6) umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6) umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6) //// Processing row6 and row7 st1 {v23.8b}, [x1], x3 ////Store dest row5 ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row7) umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row7) umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row7) umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row7) umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7) sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6) subs x12, x14, #1 st1 {v20.8b}, [x1], x3 ////Store dest row6 sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7) st1 {v23.8b}, [x1], x3 ////Store dest row7 beq loop_8 //looping if height ==16 b end_func loop_4: ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row1 ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row1) ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row0 ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row1) ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row0) uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1) ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row1) ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row1) ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row1) ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row0) umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1) umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1) umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0) ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row0) ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row0) ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row0) ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row2 umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3 ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3) ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3) sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2) ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row2) st1 {v23.s}[0], [x1], x3 ////Store dest row0 ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3) ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3) ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2) sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2) ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2) //// Processing row2 and row3 st1 {v20.s}[0], [x1], x3 ////Store dest row1 uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3) ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2) umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3) umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3) umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3) umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3) uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2) umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2) umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2) umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2) umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2) sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3) sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2) st1 {v20.s}[0], [x1], x3 ////Store dest row2 subs x4, x4, #8 // Loop if height =8 st1 {v23.s}[0], [x1], x3 ////Store dest row3 beq loop_4 end_func: // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack ldp x19, x20, [sp], #16 pop_v_regs ret