Diffstat (limited to 'common/x86/ihevc_sao_ssse3_intr.c')
-rw-r--r-- | common/x86/ihevc_sao_ssse3_intr.c | 5653
1 file changed, 5653 insertions, 0 deletions
diff --git a/common/x86/ihevc_sao_ssse3_intr.c b/common/x86/ihevc_sao_ssse3_intr.c
new file mode 100644
index 0000000..cffd2a9
--- /dev/null
+++ b/common/x86/ihevc_sao_ssse3_intr.c
@@ -0,0 +1,5653 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_sao_ssse3_intr.c
+*
+* @brief
+* Contains function definitions for Sample Adaptive Offset (SAO) in-loop
+* filtering
+*
+* @author
+* 100592
+*
+* @par List of Functions:
+* - ihevc_sao_band_offset_luma_ssse3()
+* - ihevc_sao_band_offset_chroma_ssse3()
+* - ihevc_sao_edge_offset_class0_ssse3()
+* - ihevc_sao_edge_offset_class0_chroma_ssse3()
+* - ihevc_sao_edge_offset_class1_ssse3()
+* - ihevc_sao_edge_offset_class1_chroma_ssse3()
+* - ihevc_sao_edge_offset_class2_ssse3()
+* - ihevc_sao_edge_offset_class2_chroma_ssse3()
+* - ihevc_sao_edge_offset_class3_ssse3()
+* - ihevc_sao_edge_offset_class3_chroma_ssse3()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_tables_x86_intr.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_sao.h"
+
+#include <immintrin.h>
+
+#define NUM_BAND_TABLE 32
+/**
+*******************************************************************************
+*
+* @brief
+* Has two sets of functions, band offset and edge offset, for both luma and
+* chroma. Edge offset covers the horizontal, vertical, 135-degree and
+* 45-degree classes.
+*
+* @par Description:
+*
+*
+* @param[in-out] pu1_src
+* Pointer to the source
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in-out] pu1_src_left
+* Source left boundary
+*
+* @param[in-out] pu1_src_top
+* Source top boundary
+*
+* @param[in-out] pu1_src_top_left
+* Source top-left boundary
+*
+* @param[in] pu1_src_top_right
+* Source top-right boundary
+*
+* @param[in] pu1_src_bot_left
+* Source bottom-left boundary
+*
+* @param[in] pu1_avail
+* Boundary availability flags
+*
+* @param[in] pi1_sao_offset_u
+* Chroma U SAO offset values
+*
+* @param[in] pi1_sao_offset_v
+* Chroma V SAO offset values
+*
+* @param[in] pi1_sao_offset
+* Luma SAO offset values
+*
+* @param[in] wd
+* Width of the source
+*
+* @param[in] ht
+* Height of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
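Before reading the intrinsics, it helps to keep the scalar semantics in mind. The sketch below is an illustrative reference for the band-offset classification that the routines in this patch vectorize; it is not code from the patch. It assumes 8-bit samples, a 5-entry offset array with the usable offsets stored at indices 1..4 (index 0 unused), and a 4-band window that wraps modulo 32, which is what the switch cases for sao_band_pos 28..31 below appear to compensate for.

#include <stdint.h>
#include <stdio.h>

/* Illustrative scalar reference (not part of the patch): band offset for one
 * 8-bit sample. Each sample falls into one of 32 bands of width 8; only the
 * four consecutive bands starting at sao_band_pos receive an offset. */
static uint8_t sao_band_offset_pixel(uint8_t px, int sao_band_pos,
                                     const int8_t sao_offset[5])
{
    int band = px >> 3;                    /* band index, 0..31 */
    int rel  = (band - sao_band_pos) & 31; /* position inside the window, wrapping mod 32 */

    if(rel < 4)
    {
        int v = px + sao_offset[rel + 1];  /* offsets are stored at indices 1..4 */
        px = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    return px;
}

int main(void)
{
    static const int8_t offsets[5] = { 0, 1, -2, 3, -4 };    /* arbitrary demo values */
    printf("%d\n", sao_band_offset_pixel(130, 16, offsets)); /* band 16 -> +1 -> 131 */
    return 0;
}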
+void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
+                                      WORD32 src_strd,
+                                      UWORD8 *pu1_src_left,
+                                      UWORD8 *pu1_src_top,
+                                      UWORD8 *pu1_src_top_left,
+                                      WORD32 sao_band_pos,
+                                      WORD8 *pi1_sao_offset,
+                                      WORD32 wd,
+                                      WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_cpy;
+    WORD32 wd_rem;
+    WORD8 offset = 0;
+
+    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
+    __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
+    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
+    __m128i band_pos_16x8b;
+    __m128i sao_offset;
+    __m128i cmp_mask, cmp_store;
+
+    /* Updating left, top-left and top */
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
+    }
+    pu1_src_top_left[0] = pu1_src_top[wd - 1];
+    for(col = 0; col < wd; col += 8)
+    {
+        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
+        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
+        offset += 8;
+    }
+
+    //replicating sao_band_pos as 8 bit value 16 times
+    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
+    //value set for sao_offset extraction
+    tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
+    tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
+    tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
+    tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
+
+    //loaded sao offset values
+    sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+    //loading the 32 16-bit values of gu2_table_band_idx consecutively into 4 registers
+    band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+    band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+    band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+    band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+    //band_position addition
+    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
+    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
+    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
+    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
+    //sao_offset duplication
+    tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+    tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+    tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+    tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+    //setting for comparison
+    cmp_mask = _mm_set1_epi16(16);
+    cmp_store = _mm_set1_epi16(0x00ff);
+
+    //sao_offset addition
+    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
+    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
+    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
+    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
+    //masking upper 8 bits of each 16-bit band table value
+    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
+    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
+    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
+    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
+
+    switch(sao_band_pos)
+    {
+        case 0:
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
+            band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
+            break;
+        case 28:
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
+            band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
+            break;
+        case 29:
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
+            band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
+            band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
+            break;
+        case 30:
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
+            band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
+            band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
+            break;
+        case 31:
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
+            band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
+            band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
+            break;
+        default:
+            break;
+    }
+    //sao_offset is reused for zero cmp mask.
+    sao_offset = _mm_setzero_si128();
+    tmp_set_128i_1 = _mm_set1_epi8(1);
+    //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
+    cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
+
+    //masking upper 8 bits of each 16-bit band table value
+    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
+    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
+    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
+    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
+
+    //the four 8x16 band table registers are packed into two 16x8 registers: band_table0_8x16b and band_table2_8x16b
+    band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
+    band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);
+
+    band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
+    band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
+    band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31
+
+    cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
+    // band_pos_16x8b = _mm_or_si128(band_pos_16x8b,cmp_store);
+
+    for(col = wd; col >= 16; col -= 16)
+    {
+        pu1_src_cpy = pu1_src;
+        for(row = ht; row > 0; row -= 2)
+        {
+            //row = 0: load 16 pixel values from 15:0 pos. relative to cur. pos.
+            src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            // row = 1
+            src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+            //saturated subtract, 8 bit
+            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
+            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
+            //if the value is less than 0, put ff
+            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            //if the value is greater than 31, put ff
+            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
+            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+
+            //row 0
+            //if the value is >15, put ff (cmp_mask = dup16(15))
+            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+            //values 16 to 31 for row 0, but values <16 == 0
+            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
+            //values 0 to 15 for row 0
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
+            //values 16 to 31 for row 0, but values <16 masked to ff
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
+            //row 1
+            //if the value is >15, put ff (cmp_mask = dup16(15))
+            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+            //values 16 to 31 for row 1, but values <16 == 0
+            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
+            //values 0 to 15 for row 1
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
+            //values 16 to 31 for row 1, but values <16 masked to ff
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
+
+            //row 0
+            //to preserve pixel values in which no offset needs to be added.
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
+
+            //row 1
+            //to preserve pixel values in which no offset needs to be added.
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
+
+            //indexing 0 - 15 bandtable indexes
+            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
+            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
+            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
+            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
+            //combining all offset results
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            //combining results with the pixel values
+            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+
+            //row = 0: store 16 pixel values from 15:0 pos. relative to cur. pos.
+            _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
+            // row = 1
+            _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);
+
+            pu1_src_cpy += (src_strd << 1);
+        }
+        pu1_src += 16;
+    }
+    wd_rem = wd & 0xF;
+    if(wd_rem)
+    {
+        pu1_src_cpy = pu1_src;
+        for(row = ht; row > 0; row -= 4)
+        {
+            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+            src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+            // row = 1
+            src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+            // row = 2
+            src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+            // row = 3
+            src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+            //row0 and row1 packed, and row2 and row3 packed
+            src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
+            src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
+
+            //saturated subtract, 8 bit
+            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
+            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
+            //if the value is less than 0, put ff
+            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            //if the value is greater than 31, put ff
+            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
+            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+
+            //row 0 and row 1
+            //if the value is >15, put ff (cmp_mask = dup16(15))
+            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+            //values 16 to 31 for row 0 & 1, but values <16 == 0
+            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
+            //values 0 to 15 for row 0 & 1
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
+            //values 16 to 31 for row 0 & 1, but values <16 masked to ff
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
+            //row 2 and row 3
+            //if the value is >15, put ff (cmp_mask = dup16(15))
+            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+            //values 16 to 31 for row 2 & 3, but values <16 == 0
+            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
+            //values 0 to 15 for row 2 & 3
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
+            //values 16 to 31 for row 2 & 3, but values <16 masked to ff
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
+
+            //row 0 and row 1
+            //to preserve pixel values in which no offset needs to be added.
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
+
+            //row 2 and row 3
+            //to preserve pixel values in which no offset needs to be added.
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
+
+            //indexing 0 - 15 bandtable indexes
+            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
+            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
+            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
+            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
+            //combining all offset results
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            //combining results with the pixel values
+            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+
+            //Getting row 1 separately
+            src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
+            //Getting row 3 separately
+            src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+            _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
+            // row = 1
+            _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
+            // row = 2
+            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
+            // row = 3
+            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);
+
+            pu1_src_cpy += (src_strd << 2);
+        }
+        pu1_src += 8;
+    }
+}
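The heart of the routine above is a 32-entry byte lookup built from two 16-byte tables and _mm_shuffle_epi8, which returns 0 for any index byte with bit 7 set. Below is a self-contained sketch of that idiom (illustrative, not code from this patch; compile with -mssse3). It assumes index values below 128 so the signed byte compare is valid.

#include <stdio.h>
#include <tmmintrin.h>   /* SSSE3 */

int main(void)
{
    unsigned char lut[32], in[16], out[16];
    int i;
    for(i = 0; i < 32; i++) lut[i] = (unsigned char)(3 * i + 1); /* arbitrary 32-entry LUT */
    for(i = 0; i < 16; i++) in[i]  = (unsigned char)(2 * i);     /* indices 0..30 */

    __m128i tbl0 = _mm_loadu_si128((const __m128i *)lut);
    __m128i tbl1 = _mm_loadu_si128((const __m128i *)(lut + 16));
    __m128i idx  = _mm_loadu_si128((const __m128i *)in);

    /* Lanes with index > 15 become 0xFF (bit 7 set), so the low-table lookup
     * returns 0 in those lanes. */
    __m128i gt15   = _mm_cmpgt_epi8(idx, _mm_set1_epi8(15));
    __m128i idx_lo = _mm_or_si128(idx, gt15);
    /* idx - 16 is negative (bit 7 set) for lanes < 16, so the high-table
     * lookup returns 0 in those lanes. */
    __m128i idx_hi = _mm_sub_epi8(idx, _mm_set1_epi8(16));

    __m128i res = _mm_or_si128(_mm_shuffle_epi8(tbl0, idx_lo),
                               _mm_shuffle_epi8(tbl1, idx_hi));
    _mm_storeu_si128((__m128i *)out, res);

    for(i = 0; i < 16; i++)
        printf("%d ", out[i]);   /* expect lut[in[i]] in each lane */
    printf("\n");
    return 0;
}

The band-offset kernels use the same masks slightly differently (invalid lanes are forced to 0xFF and compared afterwards), but the underlying trick of splitting one 32-entry lookup into two pshufb lookups joined by OR is identical.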
+void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
+                                        WORD32 src_strd,
+                                        UWORD8 *pu1_src_left,
+                                        UWORD8 *pu1_src_top,
+                                        UWORD8 *pu1_src_top_left,
+                                        WORD32 sao_band_pos_u,
+                                        WORD32 sao_band_pos_v,
+                                        WORD8 *pi1_sao_offset_u,
+                                        WORD8 *pi1_sao_offset_v,
+                                        WORD32 wd,
+                                        WORD32 ht)
+{
+    WORD32 row, col;
+    WORD8 offset = 0;
+
+    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
+    __m128i cmp_msk2;
+    __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
+    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
+    __m128i band_pos_u_16x8b, band_pos_v_16x8b;
+    __m128i sao_offset;
+    __m128i cmp_mask;
+
+    /* Updating left, top and top-left */
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
+        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
+    }
+    pu1_src_top_left[0] = pu1_src_top[wd - 2];
+    pu1_src_top_left[1] = pu1_src_top[wd - 1];
+    for(col = 0; col < wd; col += 8)
+    {
+        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
+        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
+        offset += 8;
+    }
+
+    { //band table creation
+        __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
+        //band table for U component: band_table0_16x8b and band_table2_16x8b
+        //replicating sao_band_pos as 8 bit value 16 times
+        band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
+        //value set for sao_offset extraction
+        tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
+        tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
+        tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
+        tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
+
+        //loaded sao offset values
+        sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+
+        //loading the 32 16-bit values of gu2_table_band_idx consecutively into 4 registers
+        band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+        band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+        //band_position addition
+        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
+        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
+        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
+        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
+        //sao_offset duplication
+        temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+        temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+        temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+        temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+
+        //sao_offset addition
+        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
+        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
+        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
+        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
+        //reused for clipping
+        temp1_8x16b = _mm_set1_epi16(0x00ff);
+        //setting for comparison
+        cmp_mask = _mm_set1_epi16(16);
+
+        //masking upper 8 bits of each 16-bit band table value
+        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
+        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
+        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+
+        //temp1_8x16b reused for compare storage
+        switch(sao_band_pos_u)
+        {
+            case 0:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
+                band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
+                break;
+            case 28:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
+                break;
+            case 29:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
+                band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
+                break;
+            case 30:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
+                band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
+                break;
+            case 31:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
+                band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
+                break;
+            default:
+                break;
+        }
+        //masking upper 8 bits of each 16-bit band table value
+        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
+        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
+        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+        //the four 8x16 band table registers are packed into two 16x8 registers: band_table0_16x8b and band_table2_16x8b
+        band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
+        band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
+        //band table for U component over
+
+        //band table for V component: band_table1_16x8b and band_table3_16x8b
+        //replicating sao_band_pos as 8 bit value 16 times
+        band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));
+
+        //loaded sao offset values
+        sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+
+        //loading the 32 16-bit values of gu2_table_band_idx consecutively into 4 registers
+        temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+        temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+        //band_position addition
+        temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
+        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
+        temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
+        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
+        //sao_offset duplication
+        tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+        tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+        tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+        tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+
+        //sao_offset addition
+        temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
+        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
+        temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
+        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);
+
+        //masking upper 8 bits of each 16-bit band table value
+        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
+        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
+        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+
+        //temp1_8x16b reused for compare storage
+        switch(sao_band_pos_v)
+        {
+            case 0:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
+                temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
+                break;
+            case 28:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
+                break;
+            case 29:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
+                temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
+                break;
+            case 30:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
+                temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
+                break;
+            case 31:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
+                temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
+                break;
+            default:
+                break;
+        }
+        //masking upper 8 bits of each 16-bit band table value
+        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
+        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
+        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+        //the four 8x16 band table registers are packed into two 16x8 registers: band_table1_16x8b and band_table3_16x8b
+        band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
+        band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
+        //band tables for U and V created
+    }
+    {
+        UWORD8 *pu1_src_cpy;
+        WORD32 wd_rem;
+
+        //sao_offset is reused for zero cmp mask.
+        sao_offset = _mm_setzero_si128();
+        tmp_set_128i_1 = _mm_set1_epi8(1);
+        //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
+        cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
+        //so that 0xffff saturates to 0xff instead of 0
+        cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
+        band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
+        band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
+        cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31
+
+        cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
+
+        for(col = wd; col >= 16; col -= 16)
+        {
+            pu1_src_cpy = pu1_src;
+            for(row = ht; row > 0; row -= 2)
+            {
+                //row = 0: load 16 pixel values from 15:0 pos. relative to cur. pos.
+                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+                // row = 1
+                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+                //odd values
+                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+                //even values
+                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
+                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
+                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+                //combining odd values
+                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                //combining even values
+                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
+
+                //saturated subtract, 8 bit
+                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
+                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
+                //if the value is less than 0, put ff
+                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+                //if the value is greater than 31, put ff
+                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
+                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+                //registers reused to increase performance
+                //if the value is >15, put ff (cmp_mask = dup16(15)), U samples
+                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+                //if the value is >15, put ff (cmp_mask = dup16(15)), V samples
+                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+
+                //values 16 to 31 for U, but values <16 == 0
+                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
+                //values 0 to 15 for U
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
+                //values 16 to 31 for V, but values <16 == 0
+                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
+                //values 0 to 15 for V
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
+
+                //values 16 to 31 for U, but values <16 masked to ff
+                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+                //values 16 to 31 for V, but values <16 masked to ff
+                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
+                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
+
+                //to choose which pixel values to preserve for U
+                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+                //to choose which pixel values to preserve for V
+                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+                //values to which no offset needs to be added are preserved.
+                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
+                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
+
+                //indexing 0 - 15 bandtable indexes
+                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
+                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
+                //indexing 16 - 31 bandtable indexes
+                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
+                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
+                //combining all offset results
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
+                //combining results with the pixel values
+                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+                //reorganising even and odd values
+                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
+                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
+
+                //row = 0: store 16 pixel values from 15:0 pos. relative to cur. pos.
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
+                // row = 1
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);
+
+                pu1_src_cpy += (src_strd << 1);
+            }
+            pu1_src += 16;
+        }
+
+        wd_rem = wd & 0xF;
+        if(wd_rem)
+        {
+            pu1_src_cpy = pu1_src;
+            for(row = ht; row > 0; row -= 4)
+            {
+                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+                // row = 1
+                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+                // row = 3
+                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+                //row0 and row1 packed, and row2 and row3 packed
+                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
+                src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
+                //odd values
+                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+                //even values
+                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
+                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
+                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+                //combining odd values
+                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                //combining even values
+                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
+
+                //saturated subtract, 8 bit
+                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
+                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
+                //if the value is less than 0, put ff
+                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+                //if the value is greater than 31, put ff
+                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
+                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+                //registers reused to increase performance
+                //if the value is >15, put ff (cmp_mask = dup16(15)), U samples
+                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+                //if the value is >15, put ff (cmp_mask = dup16(15)), V samples
+                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+
+                //values 16 to 31 for U, but values <16 == 0
+                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
+                //values 0 to 15 for U
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
+                //values 16 to 31 for V, but values <16 == 0
+                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
+                //values 0 to 15 for V
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
+
+                //values 16 to 31 for U, but values <16 masked to ff
+                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+                //values 16 to 31 for V, but values <16 masked to ff
+                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
+                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
+
+                //to choose which pixel values to preserve for U
+                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+                //to choose which pixel values to preserve for V
+                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+                //values to which no offset needs to be added are preserved.
+                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
+                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
+
+                //indexing 0 - 15 bandtable indexes
+                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
+                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
+                //indexing 16 - 31 bandtable indexes
+                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
+                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
+                //combining all offset results
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
+                //combining results with the pixel values
+                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+                //reorganising even and odd values
+                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
+                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
+                //Getting row 1 separately
+                src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                //Getting row 3 separately
+                src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
+                // row = 2
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
+                // row = 3
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);
+
+                pu1_src_cpy += (src_strd << 2);
+            }
+            pu1_src += 16;
+        }
+    }
+}
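The edge-offset kernels that follow are easier to read against the per-pixel decision they vectorize. The sketch below is an illustrative scalar reference for class 0 (horizontal), not code from this patch; it assumes gi1_table_edge_idx holds the usual HEVC remapping {1, 2, 0, 3, 4} and that offsets sit at indices 1..4 of a 5-entry array.

#include <stdint.h>
#include <stdio.h>

/* sign(a - b) in {-1, 0, +1}, branchless */
static int sign3(int a, int b) { return (a > b) - (a < b); }

/* Illustrative scalar reference: class 0 edge offset for one interior sample.
 * px points at the current sample; px[-1] and px[1] are its horizontal
 * neighbours. */
static uint8_t sao_edge_class0_pixel(const uint8_t *px, const int8_t sao_offset[5])
{
    static const int8_t edge_idx_tbl[5] = { 1, 2, 0, 3, 4 }; /* assumed remapping */
    int idx = 2 + sign3(px[0], px[-1]) + sign3(px[0], px[1]); /* 0..4 */
    int v   = px[0] + sao_offset[edge_idx_tbl[idx]];
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

int main(void)
{
    static const int8_t offsets[5] = { 0, 2, 1, -1, -2 };      /* arbitrary demo values */
    uint8_t row[3] = { 60, 50, 60 };                           /* local minimum at the centre */
    /* idx = 2 + (-1) + (-1) = 0 -> edge_idx_tbl[0] = 1 -> 50 + offsets[1] = 52 */
    printf("%d\n", sao_edge_class0_pixel(&row[1], offsets));
    return 0;
}

The SIMD versions compute the two sign terms for 16 samples at once, add 2, and resolve edge_idx_tbl and the offset array with _mm_shuffle_epi8; the au1_mask logic zeroes the index (forcing offset 0) at unavailable CTB boundaries.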
+void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
+                                        WORD32 src_strd,
+                                        UWORD8 *pu1_src_left,
+                                        UWORD8 *pu1_src_top,
+                                        UWORD8 *pu1_src_top_left,
+                                        UWORD8 *pu1_src_top_right,
+                                        UWORD8 *pu1_src_bot_left,
+                                        UWORD8 *pu1_avail,
+                                        WORD8 *pi1_sao_offset,
+                                        WORD32 wd,
+                                        WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
+    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
+    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
+    UWORD8 u1_avail0, u1_avail1;
+    WORD32 wd_rem;
+    WORD32 offset = 0;
+    __m128i src_temp0_16x8b, src_temp1_16x8b;
+    __m128i left0_16x8b, left1_16x8b;
+    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
+    __m128i edge0_16x8b, edge1_16x8b;
+    __m128i au1_mask8x16b;
+    __m128i edge_idx_8x16b, sao_offset_8x16b;
+    __m128i const2_16x8b, const0_16x8b;
+    __m128i left_store_16x8b;
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+
+    au1_mask8x16b = _mm_set1_epi8(0xff);
+
+    /* Update top and top-left arrays */
+    *pu1_src_top_left = pu1_src_top[wd - 1];
+
+    for(col = wd; col >= 16; col -= 16)
+    {
+        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
+        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
+        offset += 16;
+    }
+
+    //setting availability mask to ff, size MAX_CTB_SIZE
+    for(col = 0; col < MAX_CTB_SIZE; col += 16)
+        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+    for(row = 0; row < ht; row++)
+    {
+        au1_src_left_tmp[row] = pu1_src_left[row];
+    }
+    edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+    //availability mask creation
+    u1_avail0 = pu1_avail[0];
+    u1_avail1 = pu1_avail[1];
+    au1_mask[0] = u1_avail0;
+    au1_mask[wd - 1] = u1_avail1;
+
+    const2_16x8b = _mm_set1_epi8(2);
+    const0_16x8b = _mm_setzero_si128();
+    pu1_src_left_cpy = au1_src_left_tmp;
+    pu1_src_left_str = au1_src_left_tmp1;
+    {
+        au1_mask_cpy = au1_mask;
+        for(col = wd; col >= 16; col -= 16)
+        {
+            pu1_src_cpy = pu1_src;
+            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+            //pu1_src_left_cpy = au1_src_left_tmp;
+            for(row = ht; row > 0; row -= 2)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+                //row = 0: load 16 pixel values from 15:0 pos. relative to cur. pos.
+                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+                // row = 1
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
+                //row 1 left
+                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
+                //row 0 left
+                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+                //row = 0 right
+                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
+                // row = 1 right
+                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+                //combining sign_left and sign_right
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, then add, then saturated pack
+                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0: store 16 pixel values from 15:0 pos. relative to cur. pos.
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+                pu1_src_cpy += (src_strd << 1);
+                pu1_src_left_cpy += 2;
+                pu1_src_left_str += 2;
+            }
+            au1_mask_cpy += 16;
+            pu1_src += 16;
+            pu1_src_left_cpy -= ht;
+            pu1_src_left_str -= ht;
+
+            pu1_left_tmp = pu1_src_left_cpy;
+            pu1_src_left_cpy = pu1_src_left_str;
+            pu1_src_left_str = pu1_left_tmp;
+        }
+
+        wd_rem = wd & 0xF;
+        if(wd_rem)
+        {
+            cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
+            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);
+
+            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
+            pu1_src_cpy = pu1_src;
+            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+            //pu1_src_left_cpy = au1_src_left_tmp;
+            for(row = ht; row > 0; row -= 4)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+                // row = 1
+                cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+                // row = 3
+                cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
+                //row 3 left
+                edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
+                cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+                //row 2 left
+                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+                //row 1 left
+                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
+                cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+                //row 0 left
+                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+
+                //packing rows together for 16 SIMD operations
+                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
+                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
+                //packing rows together for 16 SIMD operations
+                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
+                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);
+
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+                //row = 0 right
+                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
+                // row = 1 right
+                cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
+                // row = 2 right
+                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
+                // row = 3 right
+                cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
+                //packing rows together for 16 SIMD operations
+                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
+                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);
+
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+                //combining sign_left and sign_right
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, then add, then saturated pack
+                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+                //separating row 1 and row 3
+                cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+                cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
+                // row = 2
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
+                // row = 3
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);
+
+                pu1_src_cpy += (src_strd << 2);
+                pu1_src_left_cpy += 4;
+                pu1_src_left_str += 4;
+            }
+            pu1_src += wd;
+            pu1_src_left_cpy -= ht;
+            pu1_src_left_str -= ht;
+
+            pu1_left_tmp = pu1_src_left_cpy;
+            pu1_src_left_cpy = pu1_src_left_str;
+            pu1_src_left_str = pu1_left_tmp;
+        }
+        for(row = 0; row < ht; row++)
+        {
+            pu1_src_left[row] = pu1_src_left_cpy[row];
+        }
+    }
+}
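All the edge-offset kernels build sign(a - b) for unsigned bytes without any signed subtract, using the saturating-subtract and compare pattern seen above. A standalone sketch of just that idiom (illustrative, SSE2 is sufficient):

#include <stdio.h>
#include <emmintrin.h>   /* SSE2 */

int main(void)
{
    unsigned char a[16] = { 10, 20, 30, 7, 7, 0, 255, 1, 9, 9, 100, 99, 5, 6, 200, 200 };
    unsigned char b[16] = { 20, 10, 30, 8, 6, 1, 0, 255, 9, 8, 99, 100, 5, 7, 201, 199 };
    signed char out[16];
    int i;

    __m128i va   = _mm_loadu_si128((const __m128i *)a);
    __m128i vb   = _mm_loadu_si128((const __m128i *)b);
    __m128i zero = _mm_setzero_si128();

    /* Saturating differences: nonzero only on the side that is larger. */
    __m128i gt = _mm_subs_epu8(va, vb);
    __m128i lt = _mm_subs_epu8(vb, va);
    /* 0xFF (-1) where the saturating difference is zero. */
    gt = _mm_cmpeq_epi8(gt, zero);   /* -1 where a <= b */
    lt = _mm_cmpeq_epi8(lt, zero);   /* -1 where a >= b */
    /* (a<=b mask) - (a>=b mask): equal lanes cancel to 0, giving sign(a - b). */
    __m128i sign = _mm_sub_epi8(gt, lt);

    _mm_storeu_si128((__m128i *)out, sign);
    for(i = 0; i < 16; i++)
        printf("%d ", out[i]);   /* expect sign(a[i] - b[i]) per lane */
    printf("\n");
    return 0;
}

The kernels then add the left and right (or top and bottom) signs plus the constant 2 to form the edge index in 0..4, which feeds the pshufb lookups.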
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); + // row = 1 + src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); + + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4); + //row 1 left + left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14); + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14); + //row 0 left + left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14); + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); + + + //separating +ve and and -ve values.row 0 left + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + //separating +ve and and -ve values.row 1 left + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + + //row = 0 right + edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2)); + // row = 1 right + edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2)); + //separating +ve and and -ve values.row 0 right + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + //separating +ve and and -ve values.row 1 right + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + //combining sign-left and sign_right + edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b); + //adding constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); + //shuffle to get sao index + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); + //using availability mask + edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); + edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); + //adding chroma offset to access U and V + edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); + + //shuffle to get sao offset + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); + //cnvert to 16 bit then add and then saturated pack + left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b); + src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); + edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); + + left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b); + src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); + edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b); + src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); + + _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); + //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. + _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + // row = 1 + _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); + + pu1_src_cpy += (src_strd << 1); + pu1_src_left_cpy += 4; + pu1_src_left_str += 4; + } + au1_mask_cpy += 16; + pu1_src += 16; + pu1_src_left_cpy -= 2 * ht; + pu1_src_left_str -= 2 * ht; + + pu1_left_tmp = pu1_src_left_cpy; + pu1_src_left_cpy = pu1_src_left_str; + pu1_src_left_str = pu1_left_tmp; + } + + wd_rem = wd & 0xF; + if(wd_rem) + { + + cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd)); + _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b); + + au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); + pu1_src_cpy = pu1_src; + au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b); + + for(row = ht; row > 0; row -= 4) + { + left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy)); + //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos. 
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); + // row = 1 + cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); + // row = 2 + src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); + // row = 3 + cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd)); + + + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8); + //row 3 left + edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8); + left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14); + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); + //row 2 left + edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8); + left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14); + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); + + + // packing rows together for 16 SIMD operations + src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b); + left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b); + + //row 1 left + edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8); + edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14); + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); + //row 0 left + edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); + left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14); + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); + // packing rows together for 16 SIMD operations + src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b); + left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b); + + //separating +ve and and -ve values.for row 2 and row 3 + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + + + + + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + + //row = 0 right + edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2)); + // row = 1 right + cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2)); + // row = 2 right + edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2)); + // row = 3 right + cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2)); + // packing rows together for 16 SIMD operations + edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b); + edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b); + + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
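+ //(The packed offset table holds the U offsets in its low 8 bytes and the V
+ // offsets in its high 8 bytes; chroma_offset_8x16b is the repeating byte pair
+ // 0x00,0x08, so adding it steers the interleaved V samples to the upper table
+ // half when the sao offset shuffle below is performed.)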
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //combining sign_left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //convert to 16 bit, add the offset and then saturated pack
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+ //separating row 1 and row 3
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + // row = 1 + _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); + // row = 2 + _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b); + // row = 3 + _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); + + pu1_src_cpy += (src_strd << 2); + pu1_src_left_cpy += 8; + pu1_src_left_str += 8; + } + pu1_src += wd; + pu1_src_left_cpy -= 2 * ht; + pu1_src_left_str -= 2 * ht; + + pu1_left_tmp = pu1_src_left_cpy; + pu1_src_left_cpy = pu1_src_left_str; + pu1_src_left_str = pu1_left_tmp; + } + for(row = 0; row < 2 * ht; row++) + { + pu1_src_left[row] = pu1_src_left_cpy[row]; + } + } + +} + + +void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + UWORD8 *pu1_src_left, + UWORD8 *pu1_src_top, + UWORD8 *pu1_src_top_left, + UWORD8 *pu1_src_top_right, + UWORD8 *pu1_src_bot_left, + UWORD8 *pu1_avail, + WORD8 *pi1_sao_offset, + WORD32 wd, + WORD32 ht) +{ + WORD32 row, col; + UWORD8 *pu1_src_top_cpy; + UWORD8 *pu1_src_cpy; + WORD32 wd_rem; + + + __m128i src_top_16x8b, src_bottom_16x8b; + __m128i src_temp0_16x8b, src_temp1_16x8b; + __m128i signup0_16x8b, signdwn1_16x8b; + __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; + __m128i edge0_16x8b, edge1_16x8b; + __m128i edge_idx_8x16b, sao_offset_8x16b; + __m128i const2_16x8b, const0_16x8b; + + UNUSED(pu1_src_top_right); + UNUSED(pu1_src_bot_left); + + + /* Updating left and top-left */ + for(row = 0; row < ht; row++) + { + pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)]; + } + *pu1_src_top_left = pu1_src_top[wd - 1]; + + + + pu1_src_top_cpy = pu1_src_top; + edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); + sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset); + + /* Update height and source pointers based on the availability flags */ + if(0 == pu1_avail[2]) + { + pu1_src_top_cpy = pu1_src; + pu1_src += src_strd; + ht--; + } + if(0 == pu1_avail[3]) + { + ht--; + } + + const2_16x8b = _mm_set1_epi8(2); + const0_16x8b = _mm_setzero_si128(); + + { + WORD32 ht_rem; + for(col = wd; col >= 16; col -= 16) + { + pu1_src_cpy = pu1_src; + src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); + //row = 0 + src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + for(row = ht; row >= 2; row -= 2) + { + + //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. + src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); + // row = 2 + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); + + + //row 0 -row1 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
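+ //(Class 1 is the vertical pattern: edge_idx depends on sign(cur - above) and
+ // sign(cur - below). Since sign(row1 - row0) = -sign(row0 - row1), each
+ // difference is reused for the adjacent row, and the bottom sign of the pair
+ // is carried into the next iteration as sign_up.)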
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //row1-row0 + edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); + + //row1 -bottom + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + //combining sign-left and sign_right + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); + + //for the next iteration signup0_16x8b = -signdwn1_16x8b + signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); + //adding constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); + //shuffle to get sao index + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); + //shuffle to get sao offset + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); + //copying the next top + src_top_16x8b = src_temp1_16x8b; + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); + + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); + src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); + src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); + src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); + + //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. + _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + // row = 1 + _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); + + src_temp0_16x8b = src_bottom_16x8b; + pu1_src_cpy += (src_strd << 1); + } + ht_rem = ht & 0x1; + + if(ht_rem) + { + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); + //current row -next row + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
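+ //(The unpack/add/packus sequence above widens pixels and offsets to 16 bits,
+ // adds them and saturates back to 8 bits; a scalar sketch for this 8-bit path
+ // would be out = CLIP3(pix + offset, 0, 255). The cmpgt-against-zero mask
+ // supplies the sign extension of the offset bytes for the unpack.)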
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //adding top and botton and constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + //copying the next top + src_top_16x8b = src_temp0_16x8b; + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); + + _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + } + if(0 == pu1_avail[3]) + { + src_top_16x8b = src_bottom_16x8b; + } + //updating top flag + _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); + pu1_src += 16; + } + + wd_rem = wd & 0xF; + if(wd_rem) + { + pu1_src_cpy = pu1_src; + src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col)); + //row = 0 + src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); + for(row = ht; row >= 4; row -= 4) + { + //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. + src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); + // row = 2 + src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); + + //row 0 -row1 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + //row1-row0 + edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); + edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) + signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) + //row1 -row2 + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //packing row 0 and row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row = 3
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+ // row = 4
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
+ //separating +ve and -ve values.(2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down)
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
+ //separating +ve and -ve values.(3,4)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
+ //combining sign_down of row 3 and row 2
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
+
+ edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+
+ //packing row 2 and row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //the next top already in src_top_16x8b
+ //src_top_16x8b = src_temp1_16x8b;
+ //convert to 16 bit, add the offset and then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //adding top and down subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
+ src_top_16x8b = src_temp1_16x8b;
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, add the offset and then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and down subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ src_top_16x8b = src_temp0_16x8b;
+ //convert to 16 bit, add the offset and then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + pu1_src_cpy += (src_strd); + + } + if(0 == pu1_avail[3]) + { + src_top_16x8b = src_bottom_16x8b; + } + _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); + pu1_src += 8; + } + } +} + +void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + UWORD8 *pu1_src_left, + UWORD8 *pu1_src_top, + UWORD8 *pu1_src_top_left, + UWORD8 *pu1_src_top_right, + UWORD8 *pu1_src_bot_left, + UWORD8 *pu1_avail, + WORD8 *pi1_sao_offset_u, + WORD8 *pi1_sao_offset_v, + WORD32 wd, + WORD32 ht) +{ + WORD32 row, col; + UWORD8 *pu1_src_top_cpy; + UWORD8 *pu1_src_cpy; + WORD32 wd_rem; + + + __m128i src_top_16x8b, src_bottom_16x8b; + __m128i src_temp0_16x8b, src_temp1_16x8b; + __m128i signup0_16x8b, signdwn1_16x8b; + __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; + __m128i edge0_16x8b, edge1_16x8b; + __m128i edge_idx_8x16b, sao_offset_8x16b; + __m128i const2_16x8b, const0_16x8b; + __m128i chroma_offset_8x16b; + + UNUSED(pu1_src_top_right); + UNUSED(pu1_src_bot_left); + + /* Updating left and top and top-left */ + for(row = 0; row < ht; row++) + { + pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)]; + pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)]; + } + pu1_src_top_left[0] = pu1_src_top[wd - 2]; + pu1_src_top_left[1] = pu1_src_top[wd - 1]; + + + + pu1_src_top_cpy = pu1_src_top; + edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); + sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u); + const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v); + chroma_offset_8x16b = _mm_set1_epi16(0x0800); + /* Update height and source pointers based on the availability flags */ + if(0 == pu1_avail[2]) + { + pu1_src_top_cpy = pu1_src; + pu1_src += src_strd; + ht--; + } + if(0 == pu1_avail[3]) + { + ht--; + } + sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b); + const2_16x8b = _mm_set1_epi8(2); + const0_16x8b = _mm_setzero_si128(); + + + { + WORD32 ht_rem; + + + + for(col = wd; col >= 16; col -= 16) + { + pu1_src_cpy = pu1_src; + src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); + //row = 0 + src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + for(row = ht; row >= 2; row -= 2) + { + + //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. + src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); + // row = 2 + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); + + + //row 0 -row1 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
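+ //(The chroma variant mirrors the luma class 1 flow, but pixels are U/V
+ // interleaved: boundary arrays hold two bytes per position and one register
+ // carries pi1_sao_offset_u in its low half and pi1_sao_offset_v in its high
+ // half, selected through the chroma offset added before the offset lookup.)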
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //row1-row0 + edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); + + //row1 -bottom + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + //combining sign-left and sign_right + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); + + //for the next iteration signup0_16x8b = -signdwn1_16x8b + signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); + //adding constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); + //copying the next top + src_top_16x8b = src_temp1_16x8b; + + + //shuffle to get sao index + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); + //adding chroma offset to access U and V + edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); + + //shuffle to get sao offset + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); + edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); + + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); + src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); + edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b); + src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); + //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. + _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + // row = 1 + _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); + + src_temp0_16x8b = src_bottom_16x8b; + pu1_src_cpy += (src_strd << 1); + } + ht_rem = ht & 0x1; + + if(ht_rem) + { + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); + //current row -next row + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //adding top and botton and constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + //copying the next top + src_top_16x8b = src_temp0_16x8b; + + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + //adding chroma offset to access U and V + edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); + edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); + + _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + } + if(0 == pu1_avail[3]) + { + src_top_16x8b = src_bottom_16x8b; + } + //updating top flag + _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); + pu1_src += 16; + } + + wd_rem = wd & 0xF; + if(wd_rem) + { + pu1_src_cpy = pu1_src; + src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col)); + //row = 0 + src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); + for(row = ht; row >= 4; row -= 4) + { + //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. + src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); + // row = 2 + src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); + + //row 0 -row1 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + //row1-row0 + edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); + edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) + signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) + //row1 -row2 + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //packing row 0 and row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row = 3
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+ // row = 4
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
+ //separating +ve and -ve values.(2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down)
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
+ //separating +ve and -ve values.(3,4)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
+ //combining sign_down of row 3 and row 2
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
+
+ edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+
+ //packing row 2 and row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, add the offset and then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //adding top and down subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
+ src_top_16x8b = src_temp1_16x8b;
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, add the offset and then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //adding top and down substraction + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); + //adding constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + src_top_16x8b = src_temp0_16x8b; + + edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); + edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8); + //shuffle to get sao index + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + //adding chroma offset to access U and V + edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); + //shuffle to get sao offset + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b); + //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. + _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + pu1_src_cpy += (src_strd); + + } + if(0 == pu1_avail[3]) + { + src_top_16x8b = src_bottom_16x8b; + } + _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); + pu1_src += 8; + } + } +} + +/* 135 degree filtering */ +void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + UWORD8 *pu1_src_left, + UWORD8 *pu1_src_top, + UWORD8 *pu1_src_top_left, + UWORD8 *pu1_src_top_right, + UWORD8 *pu1_src_bot_left, + UWORD8 *pu1_avail, + WORD8 *pi1_sao_offset, + WORD32 wd, + WORD32 ht) +{ + WORD32 row, col; + UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2; + UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2; + UWORD8 *pu1_firstleft; + UWORD8 *pu1_src_cpy, *pu1_src_org; + UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy; + UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8]; + UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8]; + WORD32 wd_rem; + UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp; + WORD32 ht_tmp, ht_0; + + WORD32 bit_depth; + UWORD8 u1_avail0, u1_avail1; + + __m128i src_top_16x8b, src_bottom_16x8b; + __m128i src_temp0_16x8b, src_temp1_16x8b; + __m128i signup0_16x8b, signdwn1_16x8b; + __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; + __m128i edge0_16x8b, edge1_16x8b; + __m128i au1_mask8x16b; + __m128i edge_idx_8x16b, sao_offset_8x16b; + __m128i const2_16x8b, const0_16x8b; + __m128i left_store_16x8b; + UNUSED(pu1_src_top_right); + UNUSED(pu1_src_bot_left); + + ht_0 = ht; ht_tmp = ht; + au1_mask8x16b = _mm_set1_epi8(0xff); + + //setting availability mask to ff size MAX_CTB_SIZE + for(col = 0; col < MAX_CTB_SIZE; col += 16) + _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b); + for(row = 0; row < ht; row++) + { + au1_src_left_tmp[row] = pu1_src_left[row]; + } + bit_depth = BIT_DEPTH_LUMA; + pu1_src_org = pu1_src; + pu1_src_top_cpy = pu1_src_top; + pu1_src_left_cpy2 = au1_src_left_tmp; + pu1_src_left_cpy = au1_src_left_tmp; + pu1_src_left_str2 = au1_src_left_tmp1; + pu1_src_left_str = au1_src_left_tmp1; + edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); + sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset); + + + /* If top-left is available, process separately */ + if(0 != pu1_avail[4]) + { + WORD8 edge_idx; + + 
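/* edge_idx = 2 + sign(cur - top_left) + sign(cur - bottom_right); a
+ non-zero mapping in gi1_table_edge_idx selects one of the transmitted
+ offsets and the result is clipped to the pixel range, as the scalar
+ code below shows. */
+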
edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) + + SIGN(pu1_src[0] - pu1_src[1 + src_strd]); + + edge_idx = gi1_table_edge_idx[edge_idx]; + + if(0 != edge_idx) + { + u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1); + } + else + { + u1_pos_0_0_tmp = pu1_src[0]; + } + } + else + { + u1_pos_0_0_tmp = pu1_src[0]; + } + + /* If bottom-right is available, process separately */ + if(0 != pu1_avail[7]) + { + WORD8 edge_idx; + + edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) + + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]); + + edge_idx = gi1_table_edge_idx[edge_idx]; + + if(0 != edge_idx) + { + u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1); + } + else + { + u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]; + } + } + else + { + u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]; + } + pu1_firstleft = pu1_src_top_left; + + /* Update height and source pointers based on the availability flags */ + if(0 == pu1_avail[2]) + { + pu1_firstleft = pu1_src_left_cpy2; + pu1_src_left_cpy2++; + pu1_src_left_str2++; + pu1_src_top_cpy = pu1_src; + pu1_src += src_strd; + ht--; + } + if(0 == pu1_avail[3]) + { + ht--; + ht_0--; + } + //storing top left in a mmx register + left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft); + const2_16x8b = _mm_set1_epi8(2); + const0_16x8b = _mm_setzero_si128(); + left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15); + //update top -left + *pu1_src_top_left = pu1_src_top[wd - 1]; + //availability mask creation + u1_avail0 = pu1_avail[0]; + u1_avail1 = pu1_avail[1]; + au1_mask[0] = u1_avail0; + au1_mask[wd - 1] = u1_avail1; + { + WORD32 ht_rem; + + + pu1_src_left_cpy = pu1_src_left_cpy2; + pu1_src_left_str = pu1_src_left_str2; + au1_mask_cpy = au1_mask; + for(col = wd; col >= 16; col -= 16) + { + pu1_src_cpy = pu1_src; + src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); + //row = 0 + src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); + src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15); + //loading the mask + au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy); + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + + for(row = ht; row >= 2; row -= 2) + { + left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); + //row = 1 + src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); + // row = 1 right + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1)); + //to insert left in row 0 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15); + //row 0 -row1 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); + + //creating mask 00 for +ve and -ve values and FF for zero. 
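+ //(In the 135-degree pattern each pixel needs its top-left and bottom-right
+ // neighbours: the "+ 1" loads above fetch the bottom-right row, while the
+ // slli_si128/alignr_epi8 pair below splices one left-boundary byte in front
+ // of row 0 so that row 1 minus the spliced row 0 gives sign(cur - top_left).)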
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //manipulation for row 1 - row 0 + signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1) + //row1-row0 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + // row = 2 right + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1)); + edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0) + + + //row1 -bottom + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + // row = 2 + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); + + //combining sign-left and sign_right + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); + + //storing the row 1 left for next row. + signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14); + + //combining sign-left and sign_right + edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); + //manipulation for bottom - row 1 + signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15); + //eliminating old left for row 0 and row 1 + left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); + //bottom - row1 + cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //for the next iteration bottom -row1 + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //row1 getting it right for left of next block + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15); + //adding constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); + //shuffle to get sao index + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); + //using availability mask + edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); + edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); + //shuffle to get sao offset + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); + //row0 getting it right for left of next block + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15); + //copying the next top + src_top_16x8b = src_temp1_16x8b; + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); + + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); + src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); + src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); + src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); + + //store left boundary + _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); + //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. + _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + // row = 1 + _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); + + src_temp0_16x8b = src_bottom_16x8b; + pu1_src_cpy += (src_strd << 1); + pu1_src_left_cpy += 2; + pu1_src_left_str += 2; + } + ht_rem = ht & 0x1; + + if(ht_rem) + { + left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1)); + //current row -next row + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
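+ //(au1_mask zeroes the computed edge_idx in the first/last column when the
+ // corresponding neighbour CTB is unavailable (pu1_avail[0]/pu1_avail[1]);
+ // index 0 selects the zero offset, so those pixels pass through unchanged.)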
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+ //convert to 16 bit, add the offset and then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ pu1_src_left_cpy += 1;
+ pu1_src_left_str += 1;
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ pu1_src_left_str[0] = pu1_src_cpy[15];
+ }
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
+ }
+
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //updating top flag
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
+ //loading the availability mask
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //preparing au1_mask + au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b); + //combining the appropriate sign change + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); + + for(row = ht; row >= 4; row -= 4) + { + left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); + //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. + src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); + // row = 2 + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); + //right row1 + signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); + //row 0 -row1 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); + //manipulation for row 1 -row 0 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //row 0 left + signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //row 1 -row0 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); + + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //row1-row0 + edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) + + signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) + //right row2 + signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1); + //packing row 0 n row 1 + src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b); + //row1 -row2 + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) + edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) + //manipulation for row 2 -row 1 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); + //row 1 left + signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15); + //row = 3 + src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd)); + + // row = 4 + src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd)); + + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty + + //separating +ve and and -ve values.(2,1) + cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); + //manipulation for row 3 -row 2 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13); + //creating mask 00 for +ve and -ve values and FF for zero. 
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 2 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+ //separating +ve and -ve values.(3,2)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //right row3
+ signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2),(2-1)
+
+ //separating +ve and -ve values.(2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //right row 4
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+ //separating +ve and -ve values.(3,bottom)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+ //manipulation for bottom - row 3
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //eliminating old left for row 0,1,2,3
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //packing row 2 and row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //row 3 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
+ //loading row 3 right into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
+ //adding bottom and top values of row 2 and row 3
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+ //separating +ve and -ve values.(bottom,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //to store right of row 2
+ signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
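+ /* Editorial note: this wd_rem path works on an 8-pixel-wide column, so two
+ * rows share one xmm register: rows 0|1 are paired in src_temp0_16x8b and
+ * rows 2|3 in src_bottom_16x8b via _mm_unpacklo_epi64, and the per-row sign
+ * vectors are glued into matching halves with _mm_slli_si128/_mm_alignr_epi8.
+ * One shuffle/mask/add sequence then services two rows at once; parenthesised
+ * tags such as "(3-2),(2-1)" track which row difference sits in each 8-byte
+ * half of a register.
+ */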
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration + + //storing right of row 2into left + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); + //to store right of row 0 + signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); + //storing right of row 1 into left + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15); + + //adding constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); + //shuffle to get sao index + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); + //using availability mask + edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); + edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); + //shuffle to get sao offset + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); + + //storing right of row 0 into left + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); + + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); + src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); + src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b); + src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b); + + cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); + cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8); + + _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); + //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. + _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + // row = 1 + _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); + //row = 2 + _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b); + // row = 3 + _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); + + src_temp0_16x8b = src_temp1_16x8b; + signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); + pu1_src_cpy += (src_strd << 2); + pu1_src_left_cpy += 4; + pu1_src_left_str += 4; + } + ht_rem = ht & 0x2; + if(ht_rem) + { + left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); + //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. 
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 - row 1
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1 - row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 - bottom
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 - bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for bottom - row1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //manipulation for bottom - row 1
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //adding the up and down sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //bottom - row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //eliminating old left for row 0,1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration signup0_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //for storing right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+ src_top_16x8b = src_temp1_16x8b;
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //left store manipulation 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ //row 0 - row1
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
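+ /* Editorial note: the offsets are signed while the samples are unsigned 8-bit,
+ * so the addition is widened to 16 bits and repacked with unsigned saturation.
+ * Scalar sketch of one pixel (assuming 8-bit depth):
+ *
+ * WORD16 tmp = (WORD16)cur + (WORD16)offset; // sign-extended offset
+ * out = (UWORD8)CLIP3(tmp, 0, 255); // what _mm_packus_epi16 does
+ *
+ * The _mm_cmpgt_epi8(zero, edge) mask supplies the sign-extension bytes when
+ * the offsets are unpacked to 16 bits.
+ */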
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //adding top and down substraction + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); + //for row 0 right to put into left store + signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); + //adding constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); + edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8); + //filling the left boundary value + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); + + //shuffle to get sao index + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + //using availability mask + edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); + //shuffle to get sao offset + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + src_top_16x8b = src_temp0_16x8b; + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b); + + _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); + //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. + _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + pu1_src_cpy += (src_strd); + pu1_src_left_cpy += 1; + pu1_src_left_str += 1; + } + if(0 == pu1_avail[3]) + { + src_top_16x8b = src_bottom_16x8b; + pu1_src_left_str[0] = pu1_src_cpy[7]; + } + + if(0 == pu1_avail[2]) + { + pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd]; + } + + _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); + pu1_src += 8; + au1_mask_cpy += 16; + + pu1_left_tmp = pu1_src_left_cpy2; + pu1_src_left_cpy2 = pu1_src_left_str2; + pu1_src_left_str2 = pu1_left_tmp; + + pu1_src_left_cpy = pu1_src_left_cpy2; + pu1_src_left_str = pu1_src_left_str2; + } + pu1_src_org[0] = u1_pos_0_0_tmp; + pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp; + pu1_src_left_cpy = (0 == pu1_avail[2]) ? 
(pu1_src_left_cpy - 1) : pu1_src_left_cpy; + for(row = 0; row < ht_tmp; row++) + { + pu1_src_left[row] = pu1_src_left_cpy[row]; + } + } + +} + +/* 135 degree filtering */ +void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + UWORD8 *pu1_src_left, + UWORD8 *pu1_src_top, + UWORD8 *pu1_src_top_left, + UWORD8 *pu1_src_top_right, + UWORD8 *pu1_src_bot_left, + UWORD8 *pu1_avail, + WORD8 *pi1_sao_offset_u, + WORD8 *pi1_sao_offset_v, + WORD32 wd, + WORD32 ht) +{ + WORD32 row, col; + UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2; + UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2; + UWORD8 *pu1_firstleft; + UWORD8 *pu1_src_cpy, *pu1_src_org; + UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy; + UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)]; + UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)]; + WORD32 wd_rem; + UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v; + WORD32 ht_tmp; + WORD32 ht_0; + + WORD32 bit_depth; + UWORD8 u1_avail0, u1_avail1; + + __m128i src_temp0_16x8b, src_temp1_16x8b; + __m128i signup0_16x8b, signdwn1_16x8b; + __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; + __m128i edge0_16x8b, edge1_16x8b; + __m128i src_top_16x8b, src_bottom_16x8b; + __m128i au1_mask8x16b; + __m128i edge_idx_8x16b, sao_offset_8x16b; + __m128i const2_16x8b, const0_16x8b; + __m128i left_store_16x8b; + __m128i chroma_offset_8x16b; + + UNUSED(pu1_src_top_right); + UNUSED(pu1_src_bot_left); + + ht_0 = ht; ht_tmp = ht; + au1_mask8x16b = _mm_set1_epi8(0xff); + /* Updating left and top-left */ + for(row = 0; row < 2 * ht; row++) + { + au1_src_left_tmp[row] = pu1_src_left[row]; + } + //setting availability mask to ff size MAX_CTB_SIZE + for(col = 0; col < MAX_CTB_SIZE; col += 16) + _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b); + bit_depth = BIT_DEPTH_LUMA; + pu1_src_org = pu1_src; + pu1_src_top_cpy = pu1_src_top; + pu1_src_left_cpy2 = au1_src_left_tmp; + pu1_src_left_cpy = au1_src_left_tmp; + pu1_src_left_str2 = au1_src_left_tmp1; + pu1_src_left_str = au1_src_left_tmp1; + edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); + sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u); + const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v); + chroma_offset_8x16b = _mm_set1_epi16(0x0800); + + /* If top-left is available, process separately */ + if(0 != pu1_avail[4]) + { + WORD32 edge_idx; + + /* U */ + edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) + + SIGN(pu1_src[0] - pu1_src[2 + src_strd]); + + edge_idx = gi1_table_edge_idx[edge_idx]; + + if(0 != edge_idx) + { + u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1); + } + else + { + u1_pos_0_0_tmp_u = pu1_src[0]; + } + + /* V */ + edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) + + SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]); + + edge_idx = gi1_table_edge_idx[edge_idx]; + + if(0 != edge_idx) + { + u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1); + } + else + { + u1_pos_0_0_tmp_v = pu1_src[1]; + } + } + else + { + u1_pos_0_0_tmp_u = pu1_src[0]; + u1_pos_0_0_tmp_v = pu1_src[1]; + } + + /* If bottom-right is available, process separately */ + if(0 != pu1_avail[7]) + { + WORD32 edge_idx; + + /* U */ + edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) + + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]); + + edge_idx = gi1_table_edge_idx[edge_idx]; 
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
+ SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+ u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+ pu1_firstleft = pu1_src_top_left;
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_firstleft = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 += 2;
+ pu1_src_left_str2 += 2;
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ ht_0--;
+ }
+ //storing top left in an xmm register
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
+ sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+ left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[1] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+ au1_mask[wd - 2] = u1_avail1;
+
+ /* top-left arrays */
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+ {
+ WORD32 ht_rem;
+ au1_mask_cpy = au1_mask;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
+ //loading the mask
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 1 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+ //to insert left in row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //row 0 - row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
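+ /* Editorial note: chroma samples are interleaved UVUV..., so each register
+ * holds alternating U and V bytes. The U and V offset tables were packed into
+ * one register above (U in bytes 0..7, V in bytes 8..15, via
+ * _mm_unpacklo_epi64), and chroma_offset_8x16b = _mm_set1_epi16(0x0800) adds
+ * 8 to every second byte of the edge indices, steering the V lanes to the
+ * upper half of that table in the final _mm_shuffle_epi8 lookup.
+ */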
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //manipulation for row 1 - row 0 + signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1) + //row1-row0 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + // row = 2 right + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2)); + edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0) + + + //row1 -bottom + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + // row = 2 + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); + + //combining sign-left and sign_right + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); + + //storing the row 1 left for next row. + signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12); + + //combining sign-left and sign_right + edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); + //manipulation for bottom - row 1 + signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14); + //eliminating old left for row 0 and row 1 + left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4); + //bottom - row1 + cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //for the next iteration bottom -row1 + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //row1 getting it right for left of next iteration + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14); + //copying the next top + src_top_16x8b = src_temp1_16x8b; + //row0 getting its right for left of next iteration. 
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); + + + //adding constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); + //shuffle to get sao index + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); + //using availability mask + edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); + edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); + //adding chroma offset to access U and V + edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); + + + //shuffle to get sao offset + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); + edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); + + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); + src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); + edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b); + src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); + + //store left boundary + _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); + //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. + _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + // row = 1 + _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); + + src_temp0_16x8b = src_bottom_16x8b; + pu1_src_cpy += (src_strd << 1); + pu1_src_left_cpy += 4; + pu1_src_left_str += 4; + } + ht_rem = ht & 0x1; + + if(ht_rem) + { + left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2)); + //current row -next row + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+ //saving row0 right edge as left for the next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ pu1_src_left_str[1] = pu1_src_cpy[15];
+ pu1_src_left_str[0] = pu1_src_cpy[14];
+ }
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
+ pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
+ }
+
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //updating the top array
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
+ //loading the availability mask for this 8-pixel column
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
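+ /* Editorial note: the left column is double-buffered. The buffer read through
+ * pu1_src_left_cpy holds the unfiltered left-neighbour pixels of the strip
+ * being processed, while the one written through pu1_src_left_str collects the
+ * pre-filter right edge of that strip; the pointer swap after each 16-pixel
+ * strip turns the written buffer into the next strip's left input, so the
+ * diagonal neighbours are always taken from unfiltered data.
+ */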
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //preparing au1_mask + au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b); + //combining the appropriate sign change + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); + + for(row = ht; row >= 4; row -= 4) + { + left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); + //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. + src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); + // row = 2 + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); + //right row1 + signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); + //row 0 -row1 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); + //manipulation for row 1 -row 0 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //row 0 left + signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //row 1 -row0 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); + + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //row1-row0 + edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) + + signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) + //right row2 + signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2); + //packing row 0 n row 1 + src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b); + //row1 -row2 + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) + edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) + //manipulation for row 2 -row 1 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12); + //row 1 left + signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14); + //row = 3 + src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd)); + + // row = 4 + src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd)); + + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty + + //separating +ve and and -ve values.(2,1) + cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); + //manipulation for row 3 -row 2 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10); + //creating mask 00 for +ve and -ve values and FF for zero. 
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //row 2 left + signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14); + //combining the appropriate sign change + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) + + //separating +ve and and -ve values.(3,2) + cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b); + signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1) + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //right row3 + signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2); + //combining the appropriate sign change + edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2) + + signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1) + + //separating +ve and and -ve values.(2,3) + cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); + //right row 4 + signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3) + + //separating +ve and and -ve values.(3,bottom) + cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b); + + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3) + //combining the appropriate sign change + signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom) + edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3) + + //manipulation for bottom -row 3 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8); + //eliminating old left for row 0,1,2,3 + left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8); + //packing row 2 n row 3 + src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b); + //row 3 left + signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14); + + //adding bottom and top values of row 2 and row 3 + edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2) + //separating +ve and and -ve values.(botttom,3) + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); + + //creating mask 00 for +ve and -ve values and FF for zero. 
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration + + //to store right of row 2 + signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8); + //loading row 3 right into left + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14); + //storing right of row 2into left + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); + //to store right of row 0 + signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); + //storing right of row 1 into left + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); + //storing right of row 0 into left + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); + + //adding constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); + //shuffle to get sao index + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); + //using availability mask + edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); + edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); + + //adding chroma offset to access U and V + edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); + edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); + + //shuffle to get sao offset + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); + edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); + + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); + cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); + src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b); + edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); + cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); + src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b); + src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b); + + cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); + cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8); + + + _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); + //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. 
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + // row = 1 + _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); + //row = 2 + _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b); + // row = 3 + _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); + + src_temp0_16x8b = src_temp1_16x8b; + signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); + pu1_src_cpy += (src_strd << 2); + pu1_src_left_cpy += 8; + pu1_src_left_str += 8; + } + ht_rem = ht & 0x2; + if(ht_rem) + { + left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); + //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. + src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); + // row = 2 + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); + + //row 0 -row 1 + signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); + //manipulation for row 1 -row 0 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //manipulation for row 1 - row 0 + signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + //row1-row0 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); + + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign chang + edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //row 1 -bottom + signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2); + + edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) + signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) + //row1 -bottom + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); + + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) + edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) + //manipulation for bottom -row1 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12); + //eliminating old left for row 0,1 + left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4); + //manipulation for bottom- row 1 + signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14); + //adding top and down substraction + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty + //bottom - row 1 + cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); + + //shifting row 1 + signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8); + //creating mask 00 for +ve and -ve values and FF for zero. 
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //for the next iteration signup0_16x8b + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next + //storing right of row 1 into left + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0 + signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); + //the next top in src_top_16x8b + src_top_16x8b = src_temp1_16x8b; + //storing right of row 0 into left + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); + + + //adding constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + + //shuffle to get sao index + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + //using availability mask + edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); + + //adding chroma offset to access U and V + edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); + + //shuffle to get sao offset + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + //the next top already in src_top_16x8b + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); + edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); + src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b); + + cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); + + _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); + //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. + _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + // row = 1 + _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); + src_temp0_16x8b = src_bottom_16x8b; + pu1_src_cpy += (src_strd << 1); + pu1_src_left_cpy += 4; + pu1_src_left_str += 4; + } + ht_rem = ht & 0x1; + if(ht_rem) + { + left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); + //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); + + //row 0 -row1 + signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2); + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + //adding top and down substraction + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); + + //for row 0 right to put into left store + signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); + //left store manipulation 1 + left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); + src_top_16x8b = src_temp0_16x8b; + //filling the left boundary value + left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); + + //adding constant 2 + edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); + edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); + edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8); + + + //shuffle to get sao index + edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); + //using availability mask + edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); + //adding chroma offset to access U and V + edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); + + //shuffle to get sao offset + edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); + + //cnvert to 16 bit then add and then saturated pack + signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); + src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); + src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); + src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b); + + _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); + //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. + _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); + pu1_src_cpy += (src_strd); + pu1_src_left_cpy += 2; + pu1_src_left_str += 2; + } + if(0 == pu1_avail[3]) + { + src_top_16x8b = src_bottom_16x8b; + pu1_src_left_str[1] = pu1_src_cpy[7]; + pu1_src_left_str[0] = pu1_src_cpy[6]; + } + + if(0 == pu1_avail[2]) + { + pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd]; + pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd]; + } + + _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); + pu1_src += 8; + + pu1_left_tmp = pu1_src_left_cpy2; + pu1_src_left_cpy2 = pu1_src_left_str2; + pu1_src_left_str2 = pu1_left_tmp; + + pu1_src_left_cpy = pu1_src_left_cpy2; + pu1_src_left_str = pu1_src_left_str2; + } + pu1_src_org[0] = u1_pos_0_0_tmp_u; + pu1_src_org[1] = u1_pos_0_0_tmp_v; + pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u; + pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v; + pu1_src_left_cpy = (0 == pu1_avail[2]) ? 
(pu1_src_left_cpy - 2) : pu1_src_left_cpy; + for(row = 0; row < 2 * ht_tmp; row++) + { + pu1_src_left[row] = pu1_src_left_cpy[row]; + } + } + +} + +void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + UWORD8 *pu1_src_left, + UWORD8 *pu1_src_top, + UWORD8 *pu1_src_top_left, + UWORD8 *pu1_src_top_right, + UWORD8 *pu1_src_bot_left, + UWORD8 *pu1_avail, + WORD8 *pi1_sao_offset, + WORD32 wd, + WORD32 ht) +{ + WORD32 row, col; + UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2; + UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2; + UWORD8 *pu1_src_cpy, *pu1_src_org; + UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8]; + UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8]; + UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy; + WORD32 wd_rem; + UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp; + WORD32 ht_tmp; + WORD32 bit_depth; + UWORD8 u1_avail0, u1_avail1; + + __m128i src_top_16x8b, src_bottom_16x8b; + __m128i src_temp0_16x8b, src_temp1_16x8b; + __m128i signup0_16x8b, signdwn1_16x8b; + __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; + __m128i edge0_16x8b, edge1_16x8b; + __m128i au1_mask8x16b; + __m128i edge_idx_8x16b, sao_offset_8x16b; + __m128i const2_16x8b, const0_16x8b; + __m128i left_store_16x8b; + + ht_tmp = ht; + au1_mask8x16b = _mm_set1_epi8(0xff); + + au1_src_left_tmp[0] = pu1_src[(wd - 1)]; + //manipulation for bottom left + for(row = 1; row < ht; row++) + { + au1_src_left_tmp[row] = pu1_src_left[row]; + } + au1_src_left_tmp[ht] = pu1_src_bot_left[0]; + + *pu1_src_top_left = pu1_src_top[wd - 1]; + //setting availability mask to ff size MAX_CTB_SIZE + for(col = 0; col < MAX_CTB_SIZE; col += 16) + _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b); + bit_depth = BIT_DEPTH_LUMA; + pu1_src_org = pu1_src; + pu1_src_top_cpy = pu1_src_top; + pu1_src_left_cpy2 = au1_src_left_tmp; + pu1_src_left_cpy = au1_src_left_tmp; + pu1_src_left_str2 = au1_src_left_tmp1; + pu1_src_left_str = au1_src_left_tmp1; + edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); + sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset); + + /* If top-right is available, process separately */ + if(0 != pu1_avail[5]) + { + WORD32 edge_idx; + + edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) + + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]); + + edge_idx = gi1_table_edge_idx[edge_idx]; + + if(0 != edge_idx) + { + u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1); + } + else + { + u1_pos_wd_0_tmp = pu1_src[wd - 1]; + } + } + else + { + u1_pos_wd_0_tmp = pu1_src[wd - 1]; + } + + /* If bottom-left is available, process separately */ + if(0 != pu1_avail[6]) + { + WORD32 edge_idx; + + edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) + + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]); + + edge_idx = gi1_table_edge_idx[edge_idx]; + + if(0 != edge_idx) + { + u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1); + } + else + { + u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd]; + } + } + else + { + u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd]; + } + + + + /* Update height and source pointers based on the availability flags */ + if(0 == pu1_avail[2]) + { + pu1_src_left_cpy2++; + pu1_src_left_str2++; + pu1_src_top_cpy = pu1_src; + pu1_src += src_strd; + ht--; + } + if(0 == pu1_avail[3]) + { + ht--; + } + + + const2_16x8b = _mm_set1_epi8(2); + const0_16x8b = _mm_setzero_si128(); + + + //availability mask creation + u1_avail0 = 
pu1_avail[0]; + u1_avail1 = pu1_avail[1]; + au1_mask[0] = u1_avail0; + au1_mask[wd - 1] = u1_avail1; + { + WORD32 ht_rem; + + pu1_src_left_cpy = pu1_src_left_cpy2; + pu1_src_left_str = pu1_src_left_str2; + au1_mask_cpy = au1_mask; + for(col = wd; col >= 16; col -= 16) + { + pu1_src_cpy = pu1_src; + src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1)); + //row = 0 + src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); + + //loading the mask + au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy); + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //combining the appropriate sign change + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + for(row = ht; row >= 2; row -= 2) + { + left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy)); + //row = 1 + src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); + //to insert left in row 1 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); + // row = 0 right + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1)); + + //manipulation for row 1 - row 0 + signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15); + //row 0 -row1 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); + + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + + //combining the appropriate sign change + edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1) + //combining sign-left and sign_right + edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); + + //row1-row0 + //separating +ve and and -ve values. + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + + // row = 2 + src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); + // row = 1 right + signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1)); + edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0) + + //bottom - row1 + cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. + cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); + cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); + //for the next iteration bottom -row1 + signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); + + //to insert left in row 1 + signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13); + //manipulation for row 1 - bottom + signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15); + + //row1 -bottom + cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); + cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); + //creating mask 00 for +ve and -ve values and FF for zero. 
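+ /* Editorial note: class3 is the 45-degree direction, so each pixel is compared
+ * against its up-right and down-left neighbours. Hence the top row is loaded
+ * one pixel ahead (pu1_src_top_cpy + wd - col + 1), the down-left neighbour is
+ * formed by shifting the saved left column into the row with
+ * _mm_slli_si128/_mm_alignr_epi8, and the separately handled corners are
+ * (wd - 1, 0) and (0, ht - 1) instead of class2's (0, 0) and (wd - 1, ht - 1).
+ */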
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
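+ /*
+ * The offsets are signed while the pixels are unsigned, so the sum is
+ * formed in 16 bits: pixels are zero-extended by unpacking against zero,
+ * offsets are sign-extended by unpacking against their own sign mask
+ * (_mm_cmpgt_epi8(0, x)), the halves are added with _mm_add_epi16, and
+ * _mm_packus_epi16 saturates the result back to unsigned 8 bit - the
+ * same clamp CLIP3(..., 0, (1 << bit_depth) - 1) performs in the scalar
+ * corner cases above, for 8-bit samples.
+ */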
+ //convert to 16 bit, then add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+
+ //current row - next row
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+ //convert to 16 bit, then add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy++;
+ pu1_src_left_str++;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //updating top flag
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
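+ /*
+ * Widths that are not a multiple of 16 leave a narrow column (8 pixels
+ * for the CTB sizes used here, wd & 0xF). It is handled below with the
+ * same classify-and-offset scheme, but with two rows packed into one
+ * register via _mm_unpacklo_epi64 and four rows in flight per iteration;
+ * the per-row sign vectors are stacked into the low/high quadwords with
+ * _mm_slli_si128/_mm_alignr_epi8 by 8 before the common LUT sequence.
+ */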
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //mask for the remaining 8 columns
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //preparing au1_mask
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
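+ /*
+ * Throughout the loop below, the bottom-left neighbour is synthesised
+ * from the saved left column: the wanted byte is moved to lane 15 with
+ * _mm_slli_si128 and glued in front of the row with
+ * _mm_alignr_epi8(row, tmp, 15), i.e. the row shifted right by one pixel
+ * with the left-column pixel entering at lane 0. Top-right neighbours
+ * are simply the row shifted left with _mm_srli_si128(row, 1).
+ */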
+ for(row = ht; row >= 4; row -= 4)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ //manipulation for row 0 - row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //row 1 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //row 0 - row 1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 - row 0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row1 - row0
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //manipulation for row 1 - row 2
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
+ //row 2 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+ //packing row 0 n row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row1 - row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+ //row 1 right
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //row = 3
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+ // row = 4
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+ //separating +ve and -ve values.(2,1)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 2 right
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+ //separating +ve and -ve values.(3,2)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 2 - row 3
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //row 3 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2),(2-1)
+
+ //separating +ve and -ve values.(2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //manipulation for row 3 - bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 11);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+ //separating +ve and -ve values.(3,bottom)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+
+ //eliminating old left for row 0,1,2,3
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //packing row 2 n row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //row 3 right
+ signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
+ //loading row 3 right into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
+ //adding bottom and top values of row 2 and row 3
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+ //separating +ve and -ve values.(bottom,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //to store right of row 2
+ signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom - 3) for next iteration
+
+ //storing right of row 2 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //to store right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
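+ /*
+ * From here the classify/offset sequence runs once per packed register:
+ * edge0_16x8b covers rows 0-1 and edge1_16x8b rows 2-3, each pair in the
+ * low/high quadwords. After _mm_packus_epi16 every result register again
+ * holds two rows, so _mm_srli_si128(x, 8) peels off the odd row for its
+ * own 64-bit store.
+ */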
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //convert to 16 bit, then add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //manipulation for row 0 - row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1 - row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //manipulation for row 1 - bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 - bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for bottom - row 1 (row 1 right)
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //adding top and bottom subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //bottom - row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //eliminating old left for row 0,1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration signup0_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //for storing right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+ src_top_16x8b = src_temp1_16x8b;
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, then add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+ //manipulation for row 0 - bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //for row 0 right to put into left store
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+ //left store manipulation 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ //filling the left boundary value
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ src_top_16x8b = src_temp0_16x8b;
+ //convert to 16 bit, then add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy++;
+ pu1_src_left_str++;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+
+ }
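+ /*
+ * The two pre-computed corner pixels are written back only now: SAO
+ * classifies against unfiltered neighbours, and the vector passes above
+ * have already overwritten the block in place. The saved left column is
+ * then copied out so the block to the right still sees unfiltered
+ * samples.
+ */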
+ pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
+ pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
+ pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
+ pu1_src_left[0] = au1_src_left_tmp[0];
+ for(row = 1; row < ht_tmp; row++)
+ {
+ pu1_src_left[row] = pu1_src_left_cpy[row];
+ }
+ }
+
+}
+
+void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+ UWORD8 *pu1_src_cpy, *pu1_src_org;
+ UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ WORD32 wd_rem;
+ UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
+ WORD32 ht_tmp;
+ WORD32 bit_depth;
+ UWORD8 u1_avail0, u1_avail1;
+
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i left_store_16x8b;
+ __m128i const0_16x8b, const2_16x8b;
+ __m128i chroma_offset_8x16b;
+
+ ht_tmp = ht;
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+
+
+ au1_src_left_tmp[0] = pu1_src[(wd - 2)];
+ au1_src_left_tmp[1] = pu1_src[(wd - 1)];
+ //manipulation for bottom left
+ for(row = 2; row < 2 * ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+ au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
+ au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];
+
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+ //setting availability mask to 0xFF for all MAX_CTB_SIZE entries
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+ bit_depth = BIT_DEPTH_LUMA;
+ pu1_src_org = pu1_src;
+ pu1_src_top_cpy = pu1_src_top;
+ pu1_src_left_cpy2 = au1_src_left_tmp;
+ pu1_src_left_cpy = au1_src_left_tmp;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+ const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+ chroma_offset_8x16b = _mm_set1_epi16(0x0800);
+ /* If top-right is available, process separately */
+ if(0 != pu1_avail[5])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
+ SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
+ SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+ }
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+ u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+ }
+
+ /* If bottom-left is available, process separately */
+ if(0 != pu1_avail[6])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
+ SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
+ SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+ }
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+ u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+ }
+
+
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_cpy2 += 2;
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+ sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+
+
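+ /*
+ * Chroma is interleaved UV, so every vector step below works on pixel
+ * pairs: diagonal neighbours sit 2 bytes away (loads at +2, byte shifts
+ * of 2, _mm_alignr_epi8 by 14) and the left history advances 2 bytes per
+ * row. The U and V offset tables share one register - U in the low 8
+ * bytes, V in the high 8 - and chroma_offset_8x16b (bytes 0x00, 0x08
+ * repeating) adds 8 to every odd byte index, so a single
+ * _mm_shuffle_epi8 fetches U offsets for even lanes and V offsets for
+ * odd lanes.
+ */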
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[1] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+ au1_mask[wd - 2] = u1_avail1;
+ {
+ WORD32 ht_rem;
+ au1_mask_cpy = au1_mask;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+
+ //loading the mask
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+ //row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ // row = 0 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
+
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //row 0 - row 1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+ //row1 - row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 1 right
+ signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+ //bottom - row1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration bottom - row1
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+ //manipulation for row 1 - bottom
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+ //row1 - bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //combining sign-left and sign_right
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //row1 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //convert to 16 bit, then add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ }
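+ /*
+ * Each two-row iteration above consumes and rebuilds 4 bytes of left
+ * history (one U/V pair per row); the single-row tail below steps by 2.
+ */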
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+ //current row - next row
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //convert to 16 bit, then add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy += 2;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //updating top flag
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+ }
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //mask for the remaining 8 columns
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //preparing au1_mask
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_left_cpy = pu1_src_left_cpy2;
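+ /*
+ * The 8-pixel remainder is 4 U/V pairs wide; four rows stay in flight
+ * per iteration, so 8 bytes of left history are consumed each pass -
+ * hence the full 16-byte _mm_loadu_si128 of the left buffer and byte
+ * shifts of 12/10/8/6 when feeding each row its bottom-left pair.
+ */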
+ for(row = ht; row >= 4; row -= 4)
+ {
+ left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ //manipulation for row 0 - row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //row 1 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //row 0 - row 1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulating for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 - row 0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row1 - row0
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //manipulation for row 1 - row 2
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+ //row 2 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+ //packing row 0 n row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row1 - row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+ //row 1 right
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+ //row = 3
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+ // row = 4
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+ //separating +ve and -ve values.(2,1)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 2 right
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+ //separating +ve and -ve values.(3,2)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 2 - row 3
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8);
+ //row 3 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2),(2-1)
+
+ //separating +ve and -ve values.(2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //manipulation for row 3 - bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 6);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+ //separating +ve and -ve values.(3,bottom)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+
+ //eliminating old left for row 0,1,2,3
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
+ //packing row 2 n row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //row 3 right
+ signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
+ //loading row 3 right into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
+ //adding bottom and top values of row 2 and row 3
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+ //separating +ve and -ve values.(bottom,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //to store right of row 2
+ signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom - 3) for next iteration
+
+ //storing right of row 2 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+ //to store right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+
+ //convert to 16 bit, then add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 8;
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //manipulation for row 0 - row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1 - row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //manipulation for row 1 - bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 - bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+ //manipulation for bottom - row 1 (row 1 right)
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+ //adding top and bottom subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //bottom - row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //eliminating old left for row 0,1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration signup0_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+ //for storing right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+ src_top_16x8b = src_temp1_16x8b;
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, then add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+ //manipulation for row 0 - bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //for row 0 right to put into left store
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+ //left store manipulation 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ //filling the left boundary value
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+ src_top_16x8b = src_temp0_16x8b;
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //convert to 16 bit, then add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy += 2;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+ }
+ pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
+ pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
+ pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
+ pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
+ for(row = 0; row < 2 * ht_tmp; row++)
+ {
+ pu1_src_left[row] = au1_src_left_tmp[row];
+ }
+ }
+
+}