Diffstat (limited to 'common/x86/ihevc_sao_ssse3_intr.c')
-rw-r--r--  common/x86/ihevc_sao_ssse3_intr.c  5653
1 file changed, 5653 insertions, 0 deletions
diff --git a/common/x86/ihevc_sao_ssse3_intr.c b/common/x86/ihevc_sao_ssse3_intr.c
new file mode 100644
index 0000000..cffd2a9
--- /dev/null
+++ b/common/x86/ihevc_sao_ssse3_intr.c
@@ -0,0 +1,5653 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_sao_ssse3_intr.c
+*
+* @brief
+* Contains function definitions for sample adaptive offset (SAO) in-loop
+* filtering
+*
+* @author
+* 100592
+*
+* @par List of Functions:
+* - ihevc_sao_band_offset_luma_ssse3()
+* - ihevc_sao_band_offset_chroma_ssse3()
+* - ihevc_sao_edge_offset_class0_ssse3()
+* - ihevc_sao_edge_offset_class0_chroma_ssse3()
+* - ihevc_sao_edge_offset_class1_ssse3()
+* - ihevc_sao_edge_offset_class1_chroma_ssse3()
+* - ihevc_sao_edge_offset_class2_ssse3()
+* - ihevc_sao_edge_offset_class2_chroma_ssse3()
+* - ihevc_sao_edge_offset_class3_ssse3()
+* - ihevc_sao_edge_offset_class3_chroma_ssse3()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_tables_x86_intr.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_sao.h"
+
+#include <immintrin.h>
+
+#define NUM_BAND_TABLE 32
+/**
+*******************************************************************************
+*
+* @brief
+* Has two sets of functions: band offset and edge offset, each for luma and
+* chroma. Edge offset supports horizontal, vertical, 135 degree and 45
+* degree directions.
+*
+* @par Description:
+* Each function classifies the pixels of a block into SAO categories using
+* SSSE3 intrinsics and adds the corresponding offset in place.
+*
+* @param[in-out] pu1_src
+* Pointer to the source
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in-out] pu1_src_left
+* source left boundary
+*
+* @param[in-out] pu1_src_top
+* Source top boundary
+*
+* @param[in-out] pu1_src_top_left
+* Source top left boundary
+*
+* @param[in] pu1_src_top_right
+* Source top right boundary
+*
+* @param[in] pu1_src_bot_left
+* Source bottom left boundary
+*
+* @param[in] pu1_avail
+* boundary availability flags
+*
+* @param[in] pi1_sao_offset_u
+* Chroma U sao offset values
+*
+* @param[in] pi1_sao_offset_v
+* Chroma V sao offset values
+*
+* @param[in] pi1_sao_offset
+* Luma sao offset values
+*
+* @param[in] wd
+* width of the source
+*
+* @param[in] ht
+* height of the source
+* @returns
+* None
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
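+/* Reference scalar form of the band offset applied below (a sketch for
+* readability, assuming 8 bit samples; not part of the build). Pixels fall
+* into NUM_BAND_TABLE = 32 bands of width 8; only the four bands starting at
+* sao_band_pos, taken modulo 32, carry an offset and pi1_sao_offset[0] is
+* zero, so all other bands pass through unchanged:
+*
+*     band_idx = pu1_src[col] >> 3;
+*     if(((band_idx - sao_band_pos) & 31) < 4)
+*     {
+*         WORD32 val = pu1_src[col] +
+*                      pi1_sao_offset[((band_idx - sao_band_pos) & 31) + 1];
+*         pu1_src[col] = (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
+*     }
+*/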
+void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ WORD32 sao_band_pos,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_cpy;
+ WORD32 wd_rem;
+ WORD8 offset = 0;
+
+ __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
+ __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
+ __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
+ __m128i band_pos_16x8b;
+ __m128i sao_offset;
+ __m128i cmp_mask, cmp_store;
+
+    /* Update the left, top-left and top buffers */
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
+ }
+ pu1_src_top_left[0] = pu1_src_top[wd - 1];
+ for(col = 0; col < wd; col += 8)
+ {
+ tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
+ _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
+ offset += 8;
+ }
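+    /* The right column and bottom row are saved before any offset is
+     * applied: SAO of the neighbouring blocks must see the unfiltered
+     * values of this block (a note on intent, inferred from the code). */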
+
+    //replicate sao_band_pos << 3 (the band start in pixel units) across the
+    //16 bit lanes; it is packed down to 8 bit lanes further below
+    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
+ //value set for sao_offset extraction
+ tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
+ tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
+ tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
+ tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
+
+ //loaded sao offset values
+ sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+ //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
+ band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+ band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+ band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+ band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+ //band_position addition
+ band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
+ band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
+ band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
+ band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
+ //sao_offset duplication
+ tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+ tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+ tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+    //setting for comparison
+ cmp_mask = _mm_set1_epi16(16);
+ cmp_store = _mm_set1_epi16(0x00ff);
+
+ //sao_offset addition
+ band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
+ band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
+ band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
+ band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
+ //masking upper 8bit values of each 16 bit band table value
+ band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
+ band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
+ band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
+ band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
+
+ switch(sao_band_pos)
+ {
+ case 0:
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
+ band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
+ break;
+ case 28:
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
+ band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
+ break;
+ case 29:
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
+ band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
+ band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
+ break;
+ case 30:
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
+ band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
+ band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
+ break;
+ case 31:
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
+ band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
+ band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
+ break;
+ default:
+ break;
+ }
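+    /* Explanatory note: the band tables precompute output values for the 32
+     * pixel values in the offsetted window. The switch patches the boundary
+     * cases: for sao_band_pos 0 negative results are forced to 0, and for
+     * 28..31 results that wrap past 255 are forced to ff (inferred from the
+     * masking; behaviour unchanged). */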
+ //sao_offset is reused for zero cmp mask.
+ sao_offset = _mm_setzero_si128();
+ tmp_set_128i_1 = _mm_set1_epi8(1);
+ //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
+ cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
+
+ //masking upper 8bit values of each 16 bit band table value
+ band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
+ band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
+ band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
+ band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
+
+    //the four 8x16b band table registers are packed into two 16x8b registers: band_table0_8x16b and band_table2_8x16b
+ band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
+ band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);
+
+ band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
+ band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
+ band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31
+
+ cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
+ // band_pos_16x8b = _mm_or_si128(band_pos_16x8b,cmp_store);
+
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ for(row = ht; row > 0; row -= 2)
+ {
+
+
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+
+            //subtract the band start (plain 8 bit subtract, wraps around)
+            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
+            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
+            //if the value is less than 0, put ff
+            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            //if the value is greater than 31, put ff
+ tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+
+
+ //row 0 and row1
+            //if the value is >15 then put ff, cmp_mask = dup16(15)
+ cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+ //values 16 to 31 for row 0 & 1 but values <16 ==0
+ tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
+ // values 0 to 15 for row 0 & 1
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
+ //values 16 to 31 for row 0 & 1 but values <16 masked to ff
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+ tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
+ //row 2 and row 3
+            //if the value is >15 then put ff, cmp_mask = dup16(15)
+ cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+ //values 16 to 31 for row 2 & 3 but values <16 ==0
+ tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
+ // values 0 to 15 for row 2 & 3
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
+ //values 16 to 31 for row 2 & 3 but values <16 masked to ff
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+ tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
+
+ //row 0 and row 1
+ //to preserve pixel values in which no offset needs to be added.
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+ src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
+
+ //row 2 and row 3
+ //to preserve pixel values in which no offset needs to be added.
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+ src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
+
+ //indexing 0 - 15 bandtable indexes
+ tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
+ tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
+ tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
+ tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
+ // combining all offsets results
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            // combining the results with the pixel values
+ src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+ src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);
+
+ pu1_src_cpy += (src_strd << 1);
+ }
+ pu1_src += 16;
+ }
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+    {
+        pu1_src_cpy = pu1_src;
+ for(row = ht; row > 0; row -= 4)
+ {
+
+
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 3
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+ //row0 and row1 packed and row2 and row3 packed
+
+ src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
+ src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
+
+            //subtract the band start (plain 8 bit subtract, wraps around)
+            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
+            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
+            //if the value is less than 0, put ff
+            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            //if the value is greater than 31, put ff
+ tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+
+
+
+ //row 0 and row1
+            //if the value is >15 then put ff, cmp_mask = dup16(15)
+ cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+ //values 16 to 31 for row 0 & 1 but values <16 ==0
+ tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
+ // values 0 to 15 for row 0 & 1
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
+ //values 16 to 31 for row 0 & 1 but values <16 masked to ff
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+ tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
+ //row 2 and row 3
+            //if the value is >15 then put ff, cmp_mask = dup16(15)
+ cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+ //values 16 to 31 for row 2 & 3 but values <16 ==0
+ tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
+ // values 0 to 15 for row 2 & 3
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
+ //values 16 to 31 for row 2 & 3 but values <16 masked to ff
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+ tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
+
+ //row 0 and row 1
+ //to preserve pixel values in which no offset needs to be added.
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+ src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
+
+ //row 2 and row 3
+ //to preserve pixel values in which no offset needs to be added.
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+ src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
+
+ //indexing 0 - 15 bandtable indexes
+ tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
+ tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
+ tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
+ tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
+ // combining all offsets results
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            // combining the results with the pixel values
+ src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+ src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+
+ //Getting row1 separately
+ src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
+ //Getting row3 separately
+ src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
+ // row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);
+
+ pu1_src_cpy += (src_strd << 2);
+
+ }
+ pu1_src += 8;
+ }
+
+
+}
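+
+/* A hypothetical invocation for one 64x64 luma CTB (the buffer names here
+* are assumed for illustration, not taken from this file); offsets occupy
+* pi1_sao_offset[1..4] and index 0 stays zero:
+*
+*     ihevc_sao_band_offset_luma_ssse3(pu1_ctb, ctb_strd, au1_left, au1_top,
+*                                      &u1_top_left, band_pos, ai1_offset,
+*                                      64, 64);
+*/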
+
+void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ WORD32 sao_band_pos_u,
+ WORD32 sao_band_pos_v,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ WORD8 offset = 0;
+
+
+ __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
+ __m128i cmp_msk2;
+ __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
+ __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
+ __m128i band_pos_u_16x8b, band_pos_v_16x8b;
+ __m128i sao_offset;
+ __m128i cmp_mask;
+
+
+    /* Update the left, top and top-left buffers */
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
+ pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
+ }
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+ for(col = 0; col < wd; col += 8)
+ {
+ tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
+ _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
+ offset += 8;
+ }
+
+    { // band table creation
+ __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
+ // Band table for U component : band_table0_16x8b and band_table2_16x8b
+        //replicate sao_band_pos_u << 3 (band start in pixel units) across the 16 bit lanes
+ band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
+ //value set for sao_offset extraction
+ tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
+ tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
+ tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
+ tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
+
+ //loaded sao offset values
+ sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+
+ //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
+ band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+ band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+ band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+ band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+ //band_position addition
+ band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
+ band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
+ band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
+ band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
+ //sao_offset duplication
+ temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+ temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+ temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+ temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+
+ //sao_offset addition
+ band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
+ band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
+ band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
+ band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
+ //reuse for clipping
+ temp1_8x16b = _mm_set1_epi16(0x00ff);
+        //setting for comparison
+ cmp_mask = _mm_set1_epi16(16);
+
+ //masking upper 8bit values of each 16 bit band table value
+ band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+ band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+
+ //temp1_8x16b reuse for compare storage
+ switch(sao_band_pos_u)
+ {
+ case 0:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
+ band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
+ break;
+ case 28:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+ band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
+ break;
+ case 29:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
+ band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
+ break;
+ case 30:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+ band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
+ band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
+ break;
+ case 31:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
+ band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
+ break;
+ default:
+ break;
+ }
+ //masking upper 8bit values of each 16 bit band table value
+ band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+ band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+        //the four 8x16b band table registers are packed into two 16x8b registers: band_table0_16x8b and band_table2_16x8b
+ band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
+ band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
+ // Band table for U component over
+
+ // Band table for V component : band_table1_16x8b and band_table3_16x8b
+        //replicate sao_band_pos_v << 3 (band start in pixel units) across the 16 bit lanes
+ band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));
+
+ //loaded sao offset values
+ sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+
+ //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
+ temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+ band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+ temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+ band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+ //band_position addition
+ temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
+ band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
+ temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
+ band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
+ //sao_offset duplication
+ tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+ tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+ tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+
+ //sao_offset addition
+ temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
+ band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
+ temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
+ band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);
+
+ //masking upper 8bit values of 16 bit band table value
+ temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+ temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+ //temp1_8x16b reuse for compare storage
+
+ switch(sao_band_pos_v)
+ {
+ case 0:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
+ temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
+ break;
+ case 28:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+ band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
+ break;
+ case 29:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
+ temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
+ break;
+ case 30:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+ band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
+ temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
+ break;
+ case 31:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
+ temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
+ break;
+ default:
+ break;
+ }
+ //masking upper 8bit values of each 16 bit band table value
+ temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+ temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+        //the four 8x16b band table registers are packed into two 16x8b registers: band_table1_16x8b and band_table3_16x8b
+ band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
+ band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
+ //band table for u and v created
+ }
+ {
+ UWORD8 *pu1_src_cpy;
+ WORD32 wd_rem;
+
+
+ //sao_offset is reused for zero cmp mask.
+ sao_offset = _mm_setzero_si128();
+ tmp_set_128i_1 = _mm_set1_epi8(1);
+ //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
+ cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
+        //to avoid ffff being saturated to 0 when it should become ff
+
+ cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
+ band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
+ band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
+ cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31
+
+ cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
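+
+        /* At this point cmp_mask holds dup16(15) and cmp_msk2 dup16(31): in
+         * the loops below, lanes whose band index falls outside [0,31] are
+         * forced to ff, and the 15 threshold splits the remaining indices
+         * between the two packed band table registers (explanatory note). */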
+
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ for(row = ht; row > 0; row -= 2)
+ {
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+ //odd values
+ src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+ src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+ //even values
+ src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
+ src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
+ src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+ src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+ //combining odd values
+ src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ //combining even values
+ src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
+
+                //subtract the band start (plain 8 bit subtract, wraps around)
+                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
+                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
+                //if the value is less than 0, put ff
+ tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+ //if the values greater than 31 put ff
+ tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+ // registers reused to increase performance
+                //if the value is >15 then put ff, cmp_mask = dup16(15), row 0 and row 1
+ src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+                //if the value is >15 then put ff, cmp_mask = dup16(15), row 2 and row 3
+ src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+
+ //values 16 to 31 for row 0 & 1 but values <16 ==0
+ tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
+ // values 0 to 15 for row 0 & 1
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
+ //values 16 to 31 for row 2 & 3 but values <16 ==0
+ tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
+ // values 0 to 15 for row 2 & 3
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
+
+ //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
+ src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+ //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and row 3
+ src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+ tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
+ tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
+
+
+ //to choose which pixel values to preserve in row 0 and row 1
+ src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+ //to choose which pixel values to preserve in row 2 and row 3
+ src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+ //values of all rows to which no offset needs to be added preserved.
+ src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
+ src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
+
+ //indexing 0 - 15 bandtable indexes
+ tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
+ tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
+ //indexing 16 -31 bandtable indexes
+ tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
+ tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
+ // combining all offsets results
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
+                // combining the results with the pixel values
+ src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+ src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+ //reorganising even and odd values
+ src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
+
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);
+
+
+ pu1_src_cpy += (src_strd << 1);
+
+ }
+ pu1_src += 16;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ for(row = ht; row > 0; row -= 4)
+ {
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 3
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+ //row0 and row1 packed and row2 and row3 packed
+
+ src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
+ src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
+ //odd values
+ src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+ src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+ //even values
+ src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
+ src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
+ src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+ src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+ //combining odd values
+ src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ //combining even values
+ src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
+
+                //subtract the band start (plain 8 bit subtract, wraps around)
+                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
+                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
+                //if the value is less than 0, put ff
+ tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+ //if the values greater than 31 put ff
+ tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+ // registers reused to increase performance
+                //if the value is >15 then put ff, cmp_mask = dup16(15), row 0 and row 1
+ src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+                //if the value is >15 then put ff, cmp_mask = dup16(15), row 2 and row 3
+ src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+
+ //values 16 to 31 for row 0 & 1 but values <16 ==0
+ tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
+ // values 0 to 15 for row 0 & 1
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
+ //values 16 to 31 for row 2 & 3 but values <16 ==0
+ tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
+ // values 0 to 15 for row 2 & 3
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
+
+ //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
+ src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+ //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and row 3
+ src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+ tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
+ tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
+
+
+ //to choose which pixel values to preserve in row 0 and row 1
+ src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+ //to choose which pixel values to preserve in row 2 and row 3
+ src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+ //values of all rows to which no offset needs to be added preserved.
+ src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
+ src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
+
+ //indexing 0 - 15 bandtable indexes
+ tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
+ tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
+ //indexing 16 -31 bandtable indexes
+ tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
+ tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
+ // combining all offsets results
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
+                // combining the results with the pixel values
+ src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+ src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+ //reorganising even and odd values
+ src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
+ //Getting row1 separately
+ src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ //Getting row3 separately
+ src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
+ // row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);
+
+ pu1_src_cpy += (src_strd << 2);
+
+ }
+ pu1_src += 16;
+ }
+
+
+ }
+}
+
+
+
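+/* Scalar form of edge offset class 0 (horizontal neighbours), a sketch for
+* readability assuming 8 bit samples, with SIGN(x) meaning -1, 0 or +1;
+* boundary columns are additionally gated by pu1_avail via au1_mask:
+*
+*     edge_idx = 2 + SIGN(pu1_src[col] - pu1_src[col - 1])
+*                  + SIGN(pu1_src[col] - pu1_src[col + 1]);
+*     edge_idx = gi1_table_edge_idx[edge_idx];
+*     WORD32 val = pu1_src[col] + pi1_sao_offset[edge_idx];
+*     pu1_src[col] = (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
+*/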
+void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
+ UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
+ UWORD8 u1_avail0, u1_avail1;
+ WORD32 wd_rem;
+ WORD32 offset = 0;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i left0_16x8b, left1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i left_store_16x8b;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+
+ /* Update top and top-left arrays */
+
+ *pu1_src_top_left = pu1_src_top[wd - 1];
+
+ for(col = wd; col >= 16; col -= 16)
+ {
+ const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
+ _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
+ offset += 16;
+ }
+
+    //set the availability mask to ff for all MAX_CTB_SIZE bytes
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+ for(row = 0; row < ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+ pu1_src_left_cpy = au1_src_left_tmp;
+ pu1_src_left_str = au1_src_left_tmp1;
+ {
+ au1_mask_cpy = au1_mask;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+ //pu1_src_left_cpy =au1_src_left_tmp;
+ for(row = ht; row > 0; row -= 2)
+ {
+
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
+ //row 1 left
+ left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
+ //row 0 left
+ left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+
+
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+ cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+ cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+ cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
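+                //note on the sign trick used throughout this file:
+                //subs_epu8(a,b) is 0 iff a <= b, so after cmpeq with zero the
+                //two masks are ff where a <= b and where a >= b; subtracting
+                //them (ff being -1) leaves sign(a - b) in {-1, 0, +1} per byte.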
+
+ //row = 0 right
+ edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
+ // row = 1 right
+ edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+ cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+ cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+ cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturating pack
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
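+                //note: the offsets are signed, so each offset register is
+                //widened using its own sign mask (cmpgt against zero) while
+                //the pixels are zero extended; packus_epi16 then clips the
+                //16 bit sums to the 0..255 range.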
+
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ au1_mask_cpy += 16;
+ pu1_src += 16;
+ pu1_src_left_cpy -= ht;
+ pu1_src_left_str -= ht;
+
+ pu1_left_tmp = pu1_src_left_cpy;
+ pu1_src_left_cpy = pu1_src_left_str;
+ pu1_src_left_str = pu1_left_tmp;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+
+ cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
+ _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);
+
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
+ pu1_src_cpy = pu1_src;
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //pu1_src_left_cpy =au1_src_left_tmp;
+ for(row = ht; row > 0; row -= 4)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ // row = 1
+ cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 3
+ cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
+ //row 3 left
+ edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
+ cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+ //row 2 left
+ edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+ //row 1 left
+ edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+ //row 0 left
+ edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+
+                // pack rows together to use all 16 SIMD lanes
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
+                // pack rows together to use all 16 SIMD lanes
+ left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
+ left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);
+
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+ cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+ cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+ cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+ //row = 0 right
+ edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
+ // row = 1 right
+ cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
+ // row = 2 right
+ edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
+ // row = 3 right
+ cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
+                // pack rows together to use all 16 SIMD lanes
+ edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
+ edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);
+
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+ cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+ cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+ cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //shuffle to get sao offset
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturating pack
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+                //separating row 1 and row 3
+ cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
+ // row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);
+
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ pu1_src += wd;
+ pu1_src_left_cpy -= ht;
+ pu1_src_left_str -= ht;
+
+ pu1_left_tmp = pu1_src_left_cpy;
+ pu1_src_left_cpy = pu1_src_left_str;
+ pu1_src_left_str = pu1_left_tmp;
+ }
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[row] = pu1_src_left_cpy[row];
+ }
+ }
+}
+
+
+void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
+ UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
+ UWORD8 u1_avail0, u1_avail1;
+ WORD32 wd_rem;
+ WORD32 offset = 0;
+
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i left0_16x8b, left1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i left_store_16x8b;
+ __m128i chroma_offset_8x16b;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+
+ /* Update top and top-left arrays */
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+    pu1_src_top_left[1] = pu1_src_top[wd - 1];
+
+ for(col = wd; col >= 16; col -= 16)
+ {
+ const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
+ _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
+ offset += 16;
+ }
+ for(row = 0; row < 2 * ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+    //set the availability mask to ff for all MAX_CTB_SIZE bytes
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+ const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+ chroma_offset_8x16b = _mm_set1_epi16(0x0800);
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[1] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+ au1_mask[wd - 2] = u1_avail1;
+ sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
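+    /* U and V offsets now share one register, U in bytes 0..7 and V in
+     * bytes 8..15; chroma_offset_8x16b (0x0800 per 16 bit lane) later adds
+     * 8 to every V byte so the offset shuffle picks from the V half
+     * (explanatory note). */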
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+
+ {
+ pu1_src_left_cpy = au1_src_left_tmp;
+ pu1_src_left_str = au1_src_left_tmp1;
+ au1_mask_cpy = au1_mask;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+
+ for(row = ht; row > 0; row -= 2)
+ {
+
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
+ //row 1 left
+ left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
+ //row 0 left
+ left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+
+
+                //separating +ve and -ve values. row 0 left
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //separating +ve and -ve values. row 1 left
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+ //row = 0 right
+ edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
+ // row = 1 right
+ edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+                //separating +ve and -ve values. row 0 right
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //separating +ve and -ve values. row 1 right
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
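+                //chroma_offset_8x16b is 0x00,0x08 repeated per byte pair: U indices
+                //(even bytes) select from the low half of sao_offset_8x16b and V
+                //indices (odd bytes) get +8 to select from the high half, which
+                //holds the V offset table after the unpacklo_epi64 above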
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then pack with unsigned saturation
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
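+                /* Offsets are signed 8-bit while the pixels are unsigned 8-bit:
+                 * both are widened to 16 bit (pixels zero-extended, offsets
+                 * sign-extended via the cmpgt mask), added, and repacked with
+                 * unsigned saturation so the result stays in [0,255]. */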
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ au1_mask_cpy += 16;
+ pu1_src += 16;
+ pu1_src_left_cpy -= 2 * ht;
+ pu1_src_left_str -= 2 * ht;
+
+ pu1_left_tmp = pu1_src_left_cpy;
+ pu1_src_left_cpy = pu1_src_left_str;
+ pu1_src_left_str = pu1_left_tmp;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+
+ cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
+ _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);
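+            //save the strip's last row as the top boundary for the block below,
+            //before SAO modifies it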
+
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
+ pu1_src_cpy = pu1_src;
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+
+ for(row = ht; row > 0; row -= 4)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ // row = 1
+ cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 3
+ cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
+ //row 3 left
+ edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
+ left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+ //row 2 left
+ edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+
+
+ // packing rows together for 16 SIMD operations
+ src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
+ left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);
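+                //with only 8 valid columns per row, two rows share one xmm register
+                //(row n in the low half, row n+1 in the high half) so the 16-byte
+                //SIMD operations below stay fully utilized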
+
+ //row 1 left
+ edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
+ edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+ //row 0 left
+ edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+ // packing rows together for 16 SIMD operations
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
+ left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);
+
+                //separating +ve and -ve values for row 2 and row 3
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //separating +ve and -ve values for row 0 and row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+ //row = 0 right
+ edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
+ // row = 1 right
+ cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
+ // row = 2 right
+ edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
+ // row = 3 right
+ cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
+ // packing rows together for 16 SIMD operations
+ edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
+ edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);
+
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //shuffle to get sao offset
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then pack with unsigned saturation
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+                //separating row 1 and row 3 into their own registers
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ // row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 8;
+ pu1_src_left_str += 8;
+ }
+ pu1_src += wd;
+ pu1_src_left_cpy -= 2 * ht;
+ pu1_src_left_str -= 2 * ht;
+
+ pu1_left_tmp = pu1_src_left_cpy;
+ pu1_src_left_cpy = pu1_src_left_str;
+ pu1_src_left_str = pu1_left_tmp;
+ }
+ for(row = 0; row < 2 * ht; row++)
+ {
+ pu1_src_left[row] = pu1_src_left_cpy[row];
+ }
+ }
+
+}
+
+
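+/* Vertical filtering */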
+void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy;
+ UWORD8 *pu1_src_cpy;
+ WORD32 wd_rem;
+
+
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+
+ /* Updating left and top-left */
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
+ }
+ *pu1_src_top_left = pu1_src_top[wd - 1];
+
+
+
+ pu1_src_top_cpy = pu1_src_top;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+
+ {
+ WORD32 ht_rem;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
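+            //sign_up for row 0 is taken against the row above (pu1_src_top);
+            //inside the loop it is refreshed from the previous row's bottom
+            //comparison instead of being recomputed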
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+
+                //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+
+ //row 0 -row1
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //combining sign_up and sign_down
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
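+                //sign(a-b) == -sign(b-a), so swapping the operands of the mask
+                //subtraction turns the row1-bottom result into the next row's sign_up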
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+                //convert to 16 bit, add, then pack with unsigned saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //current row -next row
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom signs and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, add, then pack with unsigned saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+            //updating top buffer
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
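+            //move the (row0 - top) signs into the high half; the alignr in the
+            //loop pairs them with the next row's signs to match the packed
+            //two-rows-per-register layout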
+ for(row = ht; row >= 4; row -= 4)
+ {
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //packing row 0 and row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row = 3
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+ // row = 4
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
+                //separating +ve and -ve values (2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down)
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
+                //separating +ve and -ve values (3,4)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
+                //combining the down-signs of rows 2 and 3
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
+
+ edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+
+                //packing row 2 and row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //the next top already in src_top_16x8b
+ //src_top_16x8b = src_temp1_16x8b;
+                //convert to 16 bit, add, then pack with unsigned saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //adding the up and down sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
+ src_top_16x8b = src_temp1_16x8b;
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //the next top already in src_top_16x8b
+                //convert to 16 bit, add, then pack with unsigned saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding the up and down sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, add, then pack with unsigned saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+ }
+ }
+}
+
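+/* Vertical filtering for chroma */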
+void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy;
+ UWORD8 *pu1_src_cpy;
+ WORD32 wd_rem;
+
+
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i chroma_offset_8x16b;
+
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ /* Updating left and top and top-left */
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
+ pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
+ }
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+
+
+
+ pu1_src_top_cpy = pu1_src_top;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+ const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
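+    //const0_16x8b temporarily holds the V offset table; it is merged into
+    //sao_offset_8x16b below and only then reset to zero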
+ chroma_offset_8x16b = _mm_set1_epi16(0x0800);
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+ sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+
+
+ {
+ WORD32 ht_rem;
+
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+
+                //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+
+ //row 0 -row1
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //combining sign_up and sign_down
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then pack with unsigned saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //current row -next row
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom signs and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, add, then pack with unsigned saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+            //updating top buffer
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ for(row = ht; row >= 4; row -= 4)
+ {
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //packing row 0 and row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row = 3
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+ // row = 4
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
+                //separating +ve and -ve values (2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down)
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
+                //separating +ve and -ve values (3,4)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
+                //combining the down-signs of rows 2 and 3
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
+
+ edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+
+                //packing row 2 and row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //the next top already in src_top_16x8b
+                //convert to 16 bit, add, then pack with unsigned saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //adding the up and down sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
+ src_top_16x8b = src_temp1_16x8b;
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ //the next top already in src_top_16x8b
+                //convert to 16 bit, add, then pack with unsigned saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding the up and down sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ src_top_16x8b = src_temp0_16x8b;
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, add, then pack with unsigned saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+ }
+ }
+}
+
+/* 135 degree filtering */
+void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+ UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
+ UWORD8 *pu1_firstleft;
+ UWORD8 *pu1_src_cpy, *pu1_src_org;
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
+ UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
+ WORD32 wd_rem;
+ UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
+ WORD32 ht_tmp, ht_0;
+
+ WORD32 bit_depth;
+ UWORD8 u1_avail0, u1_avail1;
+
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i left_store_16x8b;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ ht_0 = ht; ht_tmp = ht;
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+
+    //setting availability mask to 0xFF over the full MAX_CTB_SIZE
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+ for(row = 0; row < ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+ bit_depth = BIT_DEPTH_LUMA;
+ pu1_src_org = pu1_src;
+ pu1_src_top_cpy = pu1_src_top;
+ pu1_src_left_cpy2 = au1_src_left_tmp;
+ pu1_src_left_cpy = au1_src_left_tmp;
+ pu1_src_left_str2 = au1_src_left_tmp1;
+ pu1_src_left_str = au1_src_left_tmp1;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+
+ /* If top-left is available, process separately */
+ if(0 != pu1_avail[4])
+ {
+ WORD8 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
+ SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_0_tmp = pu1_src[0];
+ }
+ }
+ else
+ {
+ u1_pos_0_0_tmp = pu1_src[0];
+ }
+
+ /* If bottom-right is available, process separately */
+ if(0 != pu1_avail[7])
+ {
+ WORD8 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
+ SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
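+    /* Both corner pixels above follow the scalar SAO edge formula:
+     * edge_idx = 2 + SIGN(c - a) + SIGN(c - b), with a and b the two
+     * 135-degree neighbours, remapped through gi1_table_edge_idx; the chosen
+     * offset is added with a clip to [0, (1 << bit_depth) - 1]. */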
+ pu1_firstleft = pu1_src_top_left;
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_firstleft = pu1_src_left_cpy2;
+ pu1_src_left_cpy2++;
+ pu1_src_left_str2++;
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ ht_0--;
+ }
+    //storing top left in an xmm register
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+ left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
+    //update top-left
+ *pu1_src_top_left = pu1_src_top[wd - 1];
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
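+    //a zero mask at an unavailable edge column forces the looked-up edge index
+    //to 0, which selects an offset of 0 and leaves those pixels unchanged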
+ {
+ WORD32 ht_rem;
+
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ au1_mask_cpy = au1_mask;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
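+            //prepend the saved top-left pixel so each byte of src_top_16x8b is
+            //the 135-degree (top-left) neighbour of the matching row 0 pixel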
+ //loading the mask
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+            //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 1 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+ //to insert left in row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
+                //row 0 - bottom-right neighbour
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //form row 1's top-left neighbours (row 0 shifted right by one)
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+                //row 1 - top-left neighbour
+                //separating +ve and -ve values
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ // row = 2 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+
+                //row 1 - bottom-right neighbour
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //combining sign_up and sign_down
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+ //storing the row 1 left for next row.
+ signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+
+                //combining sign_up and sign_down
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+                //form row 2's top-left neighbours (row 1 shifted right by one)
+ signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ //bottom - row1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration bottom -row1
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //save row 1's rightmost pixel as the left boundary for the next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //storing the right edge of row 0 into left for the next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
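+                //offsets are signed 8-bit while pixels are unsigned 8-bit, so
+                //the block above zero-extends the pixels (unpack with zero),
+                //sign-extends the offsets (unpack with their cmpgt sign mask),
+                //adds in 16 bit and packs back with unsigned saturation;
+                //per pixel this is CLIP3(pixel + offset, 0, 255)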
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+ //current row -next row
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //storing the right edge of row 0 into left for the next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ pu1_src_left_cpy += 1;
+ pu1_src_left_str += 1;
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ pu1_src_left_str[0] = pu1_src_cpy[15];
+ }
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
+ }
+
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+            //updating the top buffer
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
+            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the availability mask for the 8-pixel column
+            //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //preparing au1_mask
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
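+            //in this residual column (wd % 16 == 8) only 8 pixels per row are
+            //live, so two rows share one 128-bit register (low half row n,
+            //high half row n + 1) and the sign terms are staged with 8-byte
+            //shifts and alignr, letting each loop iteration offset four rows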
+
+ for(row = ht; row >= 4; row -= 4)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ //right row1
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //manipulation for row 1 -row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 0 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -row0
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //right row2
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+ //packing row 0 n row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for row 2 -row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //row 1 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //row = 3
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+ // row = 4
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+                //separating +ve and -ve values. (2,1)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //manipulation for row 3 -row 2
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 2 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+                //separating +ve and -ve values. (3,2)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //right row3
+ signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+                //separating +ve and -ve values. (2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //right row 4
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                //separating +ve and -ve values. (3,bottom)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+ //manipulation for bottom -row 3
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //eliminating old left for row 0,1,2,3
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //packing row 2 n row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //row 3 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
+ //loading row 3 right into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
+ //adding bottom and top values of row 2 and row 3
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+                //separating +ve and -ve values. (bottom,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //to store right of row 2
+ signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+                //storing right of row 2 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //to store right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
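+                //the (bottom - row 3) sign is parked in the high half so the
+                //alignr at the top of the next iteration can reuse it as that
+                //iteration's (row 0 - top) term without recomputing it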
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row 1
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //manipulation for row 1 -row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -bottom
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for bottom -row1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //manipulation for bottom- row 1
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+                //adding top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //bottom - row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //eliminating old left for row 0,1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration signup0_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //for storing right of row 1
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+ src_top_16x8b = src_temp1_16x8b;
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //the next top already in src_top_16x8b
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //left store manipulation 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ //row 0 -row1
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //for row 0 right to put into left store
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
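+                //the shift left/right pair zeroes the unused upper half so the
+                //table shuffles below only see valid indices for the single
+                //live row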
+ //filling the left boundary value
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ pu1_src_left_cpy += 1;
+ pu1_src_left_str += 1;
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ pu1_src_left_str[0] = pu1_src_cpy[7];
+ }
+
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
+ }
+
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+ au1_mask_cpy += 16;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
+ pu1_src_org[0] = u1_pos_0_0_tmp;
+ pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
+ pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
+ for(row = 0; row < ht_tmp; row++)
+ {
+ pu1_src_left[row] = pu1_src_left_cpy[row];
+ }
+ }
+
+}
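+
+/* An illustrative scalar sketch of the class2 kernel that the SSSE3 code
+ * above vectorises (hypothetical helper, excluded from the build): every
+ * sample is compared against its top-left and bottom-right neighbours, the
+ * two signs select a category via gi1_table_edge_idx, and the category's
+ * offset is added with clipping. Availability handling is omitted and,
+ * unlike the real kernels (which cache the unfiltered top row and left
+ * column), this naive version would reuse already-filtered neighbours.
+ */
+#if 0
+static void sao_edge_offset_class2_scalar(UWORD8 *pu1_src,
+                                          WORD32 src_strd,
+                                          WORD8 *pi1_sao_offset,
+                                          WORD32 wd,
+                                          WORD32 ht)
+{
+    WORD32 row, col;
+    for(row = 1; row < ht - 1; row++)
+    {
+        for(col = 1; col < wd - 1; col++)
+        {
+            UWORD8 *pu1_pos = pu1_src + row * src_strd + col;
+            /* class2 (135 degree): compare against top-left and bottom-right */
+            WORD32 sign_up  = SIGN(pu1_pos[0] - pu1_pos[-src_strd - 1]);
+            WORD32 sign_dn  = SIGN(pu1_pos[0] - pu1_pos[src_strd + 1]);
+            WORD32 edge_idx = gi1_table_edge_idx[2 + sign_up + sign_dn];
+
+            if(0 != edge_idx)
+                pu1_pos[0] = CLIP3(pu1_pos[0] + pi1_sao_offset[edge_idx], 0, 255);
+        }
+    }
+}
+#endif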
+
+/* 135 degree filtering */
+void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+ UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
+ UWORD8 *pu1_firstleft;
+ UWORD8 *pu1_src_cpy, *pu1_src_org;
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
+ UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
+ WORD32 wd_rem;
+ UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
+ WORD32 ht_tmp;
+ WORD32 ht_0;
+
+ WORD32 bit_depth;
+ UWORD8 u1_avail0, u1_avail1;
+
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i left_store_16x8b;
+ __m128i chroma_offset_8x16b;
+
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ ht_0 = ht; ht_tmp = ht;
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+ /* Updating left and top-left */
+ for(row = 0; row < 2 * ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+    //setting availability mask to 0xFF for MAX_CTB_SIZE
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+ bit_depth = BIT_DEPTH_LUMA;
+ pu1_src_org = pu1_src;
+ pu1_src_top_cpy = pu1_src_top;
+ pu1_src_left_cpy2 = au1_src_left_tmp;
+ pu1_src_left_cpy = au1_src_left_tmp;
+ pu1_src_left_str2 = au1_src_left_tmp1;
+ pu1_src_left_str = au1_src_left_tmp1;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+ const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+ chroma_offset_8x16b = _mm_set1_epi16(0x0800);
+
+ /* If top-left is available, process separately */
+ if(0 != pu1_avail[4])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
+ SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_0_tmp_u = pu1_src[0];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
+ SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_0_tmp_v = pu1_src[1];
+ }
+ }
+ else
+ {
+ u1_pos_0_0_tmp_u = pu1_src[0];
+ u1_pos_0_0_tmp_v = pu1_src[1];
+ }
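+
+    /* In the interleaved chroma layout U sits at even and V at odd byte
+     * positions, so a chroma sample's diagonal neighbour is one pixel pair
+     * (2 bytes) plus one stride away; hence the 2 + src_strd stepping in
+     * the corner computations above. */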
+
+ /* If bottom-right is available, process separately */
+ if(0 != pu1_avail[7])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
+ SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
+ SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+ u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+ pu1_firstleft = pu1_src_top_left;
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_firstleft = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 += 2;
+ pu1_src_left_str2 += 2;
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ ht_0--;
+ }
+ //storing top left in a mmx register
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
+ sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+ left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);
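+    //the top-left U,V pair is parked in bytes 14..15 so the first
+    //alignr(..., 14) in the row processing below prepends it to the top row,
+    //giving the first samples their diagonal neighbour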
+
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[1] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+ au1_mask[wd - 2] = u1_avail1;
+
+ /* top-left arrays */
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+ {
+ WORD32 ht_rem;
+ au1_mask_cpy = au1_mask;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
+ //loading the mask
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+            //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 1 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+ //to insert left in row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+ //row1-row0
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ // row = 2 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+ //storing the row 1 left for next row.
+ signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+
+ //combining sign-left and sign_right
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+ //manipulation for bottom - row 1
+ signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //bottom - row1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration bottom -row1
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //storing the right edge of row 1 into left for the next iteration
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+                //storing the right edge of row 0 into left for the next iteration
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
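+                //edge0/edge1 hold categories 0..4 in every byte; 0x0800 per
+                //16-bit lane adds 8 to each odd (V) byte only, so the pshufb
+                //below reads U offsets from the low 8 table bytes and V
+                //offsets from the high 8 (the two tables were packed into one
+                //register with unpacklo_epi64 during setup)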
+
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+ //current row -next row
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+                //storing the right edge of row 0 into left for the next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ pu1_src_left_str[1] = pu1_src_cpy[15];
+ pu1_src_left_str[0] = pu1_src_cpy[14];
+ }
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
+ pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
+ }
+
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+            //updating the top buffer
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
+            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the availability mask for the 8-pixel column
+            //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //preparing au1_mask
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+
+ for(row = ht; row >= 4; row -= 4)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ //right row1
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //manipulation for row 1 -row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 0 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -row0
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //right row2
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+ //packing row 0 n row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for row 2 -row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //row 1 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //row = 3
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+ // row = 4
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+                //separating +ve and -ve values. (2,1)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //manipulation for row 3 -row 2
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 2 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+                //separating +ve and -ve values. (3,2)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //right row3
+ signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+                //separating +ve and -ve values. (2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //right row 4
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                //separating +ve and -ve values. (3,bottom)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+ //manipulation for bottom -row 3
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8);
+ //eliminating old left for row 0,1,2,3
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
+ //packing row 2 n row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //row 3 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
+
+ //adding bottom and top values of row 2 and row 3
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+                //separating +ve and -ve values. (bottom,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+ //to store right of row 2
+ signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+ //loading row 3 right into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
+                //storing right of row 2 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+ //to store right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 8;
+ pu1_src_left_str += 8;
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row 1
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //manipulation for row 1 -row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -bottom
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for bottom -row1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //eliminating old left for row 0,1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //manipulation for bottom- row 1
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+                //adding top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //bottom - row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //shifting row 1
+ signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration signup0_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //the next top in src_top_16x8b
+ src_top_16x8b = src_temp1_16x8b;
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ //the next top already in src_top_16x8b
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+ //row 0 -row1
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+ //for row 0 right to put into left store
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //left store manipulation 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ src_top_16x8b = src_temp0_16x8b;
+ //filling the left boundary value
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ pu1_src_left_str[1] = pu1_src_cpy[7];
+ pu1_src_left_str[0] = pu1_src_cpy[6];
+ }
+
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
+ pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
+ }
+
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
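+ //restore the corner U/V samples computed separately before the SIMD pass
+ //and copy the saved left column back to pu1_src_left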
+ pu1_src_org[0] = u1_pos_0_0_tmp_u;
+ pu1_src_org[1] = u1_pos_0_0_tmp_v;
+ pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
+ pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
+ pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
+ for(row = 0; row < 2 * ht_tmp; row++)
+ {
+ pu1_src_left[row] = pu1_src_left_cpy[row];
+ }
+ }
+
+}
+
+void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+ UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
+ UWORD8 *pu1_src_cpy, *pu1_src_org;
+ UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
+ UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ WORD32 wd_rem;
+ UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
+ WORD32 ht_tmp;
+ WORD32 bit_depth;
+ UWORD8 u1_avail0, u1_avail1;
+
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i left_store_16x8b;
+
+ ht_tmp = ht;
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+
+ au1_src_left_tmp[0] = pu1_src[(wd - 1)];
+ //manipulation for bottom left
+ for(row = 1; row < ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+ au1_src_left_tmp[ht] = pu1_src_bot_left[0];
+
+ *pu1_src_top_left = pu1_src_top[wd - 1];
+ //setting availability mask to 0xFF for size MAX_CTB_SIZE
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+ bit_depth = BIT_DEPTH_LUMA;
+ pu1_src_org = pu1_src;
+ pu1_src_top_cpy = pu1_src_top;
+ pu1_src_left_cpy2 = au1_src_left_tmp;
+ pu1_src_left_cpy = au1_src_left_tmp;
+ pu1_src_left_str2 = au1_src_left_tmp1;
+ pu1_src_left_str = au1_src_left_tmp1;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
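+ /*
+ * Class 3 (45 degree) edge offset: every pixel is compared against its
+ * top-right and bottom-left neighbours. A scalar sketch of the per-pixel
+ * operation that the SSSE3 code below vectorises:
+ *
+ * edge_idx = 2 + SIGN(c - top_right) + SIGN(c - bot_left);
+ * edge_idx = gi1_table_edge_idx[edge_idx];
+ * if(0 != edge_idx)
+ * c = CLIP3(c + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ */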
+ /* If top-right is available, process separately */
+ if(0 != pu1_avail[5])
+ {
+ WORD32 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
+ SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_0_tmp = pu1_src[wd - 1];
+ }
+ }
+ else
+ {
+ u1_pos_wd_0_tmp = pu1_src[wd - 1];
+ }
+
+ /* If bottom-left is available, process separately */
+ if(0 != pu1_avail[6])
+ {
+ WORD32 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
+ SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
+ }
+ }
+ else
+ {
+ u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
+ }
+
+
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_cpy2++;
+ pu1_src_left_str2++;
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
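+ //SIGN(a) + SIGN(b) lies in [-2, 2]; adding 2 biases it into [0, 4], a
+ //valid byte index for the gi1_table_edge_idx lookup done with pshufb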
+
+
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+ {
+ WORD32 ht_rem;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ au1_mask_cpy = au1_mask;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+
+ //loading the mask
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
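+ /*
+ * Branchless per-byte SIGN(a - b) on 16 unsigned pixels, the idiom used
+ * throughout this kernel:
+ * gt = _mm_cmpeq_epi8(_mm_subs_epu8(a, b), zero); // 0xFF where a <= b
+ * lt = _mm_cmpeq_epi8(_mm_subs_epu8(b, a), zero); // 0xFF where a >= b
+ * _mm_sub_epi8(gt, lt); // -1 / 0 / +1 per byte
+ */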
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+ //row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ // row = 0 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
+
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+ //row1-row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 1 right
+ signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+ //bottom - row1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration bottom -row1
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
+ //manipulation for row 1 - bottom
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //combining sign-left and sign_right
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+
+ //row1 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
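+ //pshufb doubles as a 16-entry LUT: each byte of edge0/edge1 (in 0..4
+ //after the +2 bias) selects its gi1_table_edge_idx entry, and the same
+ //trick below selects the per-pixel sao offset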
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+ //convert to 16 bit, then add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
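+ //the add is done at 16 bit: pixels are zero-extended, offsets are
+ //sign-extended via the cmpgt mask, and _mm_packus_epi16 saturates the
+ //sums back to [0, 255], i.e. the CLIP3 for 8-bit content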
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+
+ //current row -next row
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+ //convert to 16 bit, then add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy++;
+ pu1_src_left_str++;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //updating top flag
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the mask
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //preparing au1_mask
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
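+ //the 8-pixel-wide remainder packs two rows per 128-bit register
+ //(unpacklo_epi64 below), so each iteration filters four rows; sign_up
+ //is kept in the high half to line up with that packed layout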
+
+ for(row = ht; row >= 4; row -= 4)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ //manipulation for row 0 -row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //row 1 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulating for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //manipulation for row 1 -row 2
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
+ //row 2 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+ //packing row 0 n row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+ //row 1 right
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //row = 3
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+ // row = 4
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+ //separating +ve and -ve values.(2,1)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 2 right
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+ //separating +ve and -ve values.(3,2)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 2 -row 3
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //row 3 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+ //separating +ve and -ve values.(2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //manipulation for row 3 -bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 11);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+ //separating +ve and -ve values.(3,bottom)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+
+ //eliminating old left for row 0,1,2,3
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //packing row 2 n row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //row 3 right
+ signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
+ //loading row 3 right into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
+ //adding bottom and top values of row 2 and row 3
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+ //separating +ve and -ve values.(bottom,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //to store right of row 2
+ signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+ //storing right of row 2 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //to store right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //convert to 16 bit, then add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //manipulation for row 0 -row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //manipulation for row 1 -bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for bottom- row 1 (row 1 right)
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //adding top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //bottom - row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //eliminating old left for row 0,1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration signup0_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //for storing right of row 1
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+ src_top_16x8b = src_temp1_16x8b;
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, then add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+ //manipulation for row 0 -bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //for row 0 right to put into left store
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+ //left store manipulation 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ //filling the left boundary value
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ src_top_16x8b = src_temp0_16x8b;
+ //convert to 16 bit, then add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy++;
+ pu1_src_left_str++;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+
+ }
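+ //restore the two corner samples computed separately before the SIMD
+ //pass and copy the saved left column back to pu1_src_left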
+ pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
+ pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
+ pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
+ pu1_src_left[0] = au1_src_left_tmp[0];
+ for(row = 1; row < ht_tmp; row++)
+ {
+ pu1_src_left[row] = pu1_src_left_cpy[row];
+ }
+ }
+
+}
+
+void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+ UWORD8 *pu1_src_cpy, *pu1_src_org;
+ UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ WORD32 wd_rem;
+ UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
+ WORD32 ht_tmp;
+ WORD32 bit_depth;
+ UWORD8 u1_avail0, u1_avail1;
+
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i left_store_16x8b;
+ __m128i const0_16x8b, const2_16x8b;
+ __m128i chroma_offset_8x16b;
+
+ ht_tmp = ht;
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+
+
+ au1_src_left_tmp[0] = pu1_src[(wd - 2)];
+ au1_src_left_tmp[1] = pu1_src[(wd - 1)];
+ //manipulation for bottom left
+ for(row = 2; row < 2 * ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+ au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
+ au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];
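+ //the chroma left buffer holds interleaved U/V pairs, hence 2 * ht
+ //entries plus the bottom-left pair saved above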
+
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+ //setting availability mask to 0xFF for size MAX_CTB_SIZE
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+ bit_depth = BIT_DEPTH_LUMA;
+ pu1_src_org = pu1_src;
+ pu1_src_top_cpy = pu1_src_top;
+ pu1_src_left_cpy2 = au1_src_left_tmp;
+ pu1_src_left_cpy = au1_src_left_tmp;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+ const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+ chroma_offset_8x16b = _mm_set1_epi16(0x0800);
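+ /*
+ * U and V offsets are packed into one register further below (U in bytes
+ * 0..7, V in bytes 8..15). 0x0800 per 16-bit lane adds 8 to every odd
+ * byte, so in the pshufb offset lookup the interleaved V samples index
+ * the upper half of the table while the U samples index the lower half.
+ */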
+ /* If top-right is available, process separately */
+ if(0 != pu1_avail[5])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
+ SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
+ SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+ }
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+ u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+ }
+
+ /* If bottom-left is available, process separately */
+ if(0 != pu1_avail[6])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
+ SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
+ SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+ }
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+ u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+ }
+
+
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_cpy2 += 2;
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+ sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+
+
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[1] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+ au1_mask[wd - 2] = u1_avail1;
+ {
+ WORD32 ht_rem;
+ au1_mask_cpy = au1_mask;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+
+ //loading the mask
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ pu1_src_left_cpy = pu1_src_left_cpy2;
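+ //sign_up of the first row (vs. its top-right neighbour) was computed
+ //above; each iteration below reuses the bottom-vs-row1 sign as the next
+ //iteration's sign_up, so every diagonal difference is evaluated once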
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+ //row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ // row = 0 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
+
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+ //row1-row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 1 right
+ signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+ //bottom - row1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration bottom -row1
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+ //manipulation for row 1 - bottom
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //combining sign-left and sign_right
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //row1 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //convert to 16 bit, then add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 16 bytes (8 U/V pairs) relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+ //current row -next row
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //convert to 16 bit, then add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy += 2;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //updating top flag
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+ }
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the mask
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //preparing au1_mask
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_left_cpy = pu1_src_left_cpy2;
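+ //chroma remainder: every lateral step moves one U/V pair, so the byte
+ //shifts are 2 where the luma kernel uses 1, and the left buffer
+ //advances 8 bytes per block of four rows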
+ for(row = ht; row >= 4; row -= 4)
+ {
+ left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ //manipulation for row 0 -row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //row 1 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulating for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //manipulation for row 1 -row 2
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+ //row 2 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+ //packing row 0 n row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+ //row 1 right
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+ //row = 3
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+ // row = 4
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+ //separating +ve and -ve values.(2,1)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 2 right
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+ //separating +ve and -ve values.(3,2)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 2 -row 3
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8);
+ //row 3 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+ //separating +ve and -ve values.(2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //manipulation for row 3 -bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 6);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+ //separating +ve and -ve values.(3,bottom)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+
+ //eliminating old left for row 0,1,2,3
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
+ //packing row 2 n row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //row 3 right
+ signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
+ //loading row 3 right into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
+ //adding bottom and top values of row 2 and row 3
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+ //separating +ve and -ve values.(bottom,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //to store right of row 2
+ signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+ //storing right of row 2 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+ //to store right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+
+ //convert to 16 bit, then add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 8;
+ }
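+ //the loop above consumes four rows per iteration; the blocks below apply
+ //the same edge computation to the remaining two rows (ht & 0x2) and the
+ //last odd row (ht & 0x1).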
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //manipulation for row 0 - row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //manipulation for row 1 - bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+ //manipulation for bottom - row 1 (row 1 right)
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+ //adding top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //bottom - row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //eliminating old left for row 0,1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //kept in signup0_16x8b for the next iteration
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+ //for storing right of row 1
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+ src_top_16x8b = src_temp1_16x8b;
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+ //manipulation for row 0 - bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //for row 0 right to put into left store
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
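+ //shifting left then right by 8 bytes zeroes the upper half so only the
+ //single remaining row's edge indices survive.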
+ //drop the consumed entry from the left store
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ //filling the left boundary value
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+ src_top_16x8b = src_temp0_16x8b;
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy += 2;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+
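+ //save this stripe's last processed row as the top boundary line for the
+ //next CTB row.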
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+ }
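+ //write back the top-right (wd, 0) and bottom-left (0, ht) U/V samples
+ //that were filtered separately before the vectorised loops.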
+ pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
+ pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
+ pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
+ pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
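+ //copy the per-row left-boundary samples (one U/V pair per row) gathered
+ //in au1_src_left_tmp back to the caller's pu1_src_left buffer.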
+ for(row = 0; row < 2 * ht_tmp; row++)
+ {
+ pu1_src_left[row] = au1_src_left_tmp[row];
+ }
+ }
+
+}