diff options
Diffstat (limited to 'common/x86/ihevc_intra_pred_filters_ssse3_intr.c')
-rw-r--r-- | common/x86/ihevc_intra_pred_filters_ssse3_intr.c | 5127 |
1 files changed, 5127 insertions, 0 deletions
diff --git a/common/x86/ihevc_intra_pred_filters_ssse3_intr.c b/common/x86/ihevc_intra_pred_filters_ssse3_intr.c new file mode 100644 index 0000000..dbab80a --- /dev/null +++ b/common/x86/ihevc_intra_pred_filters_ssse3_intr.c @@ -0,0 +1,5127 @@ +/****************************************************************************** +* +* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at: +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +******************************************************************************/ +/** +******************************************************************************* +* @file +* ihevc_intra_pred_filters_atom_intr.c +* +* @brief +* Contains function Definition for intra prediction interpolation filters +* +* +* @author +* Ittiam +* +* @par List of Functions: +* - ihevc_intra_pred_luma_planar_ssse3() +* - ihevc_intra_pred_luma_dc_ssse3() +* - ihevc_intra_pred_luma_horz_ssse3() +* - ihevc_intra_pred_luma_ver_ssse3() +* - ihevc_intra_pred_luma_mode2_ssse3() +* - ihevc_intra_pred_luma_mode_18_34_ssse3() +* - ihevc_intra_pred_luma_mode_3_to_9_ssse3() +* - ihevc_intra_pred_luma_mode_11_to_17_ssse3() +* - ihevc_intra_pred_luma_mode_19_to_25_ssse3() +* - ihevc_intra_pred_luma_mode_27_to_33_ssse3() +* - ihevc_intra_pred_luma_ref_substitution_ssse3() +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* 
File Includes */ +/*****************************************************************************/ +#include <stdlib.h> + +#include "ihevc_typedefs.h" +#include "ihevc_intra_pred.h" +#include "ihevc_platform_macros.h" +#include "ihevc_macros.h" +#include "ihevc_func_selector.h" +#include "ihevc_common_tables.h" +#include "ihevc_defs.h" +#include "ihevc_tables_x86_intr.h" + +#include <immintrin.h> + +/****************************************************************************/ +/* Constant Macros */ +/****************************************************************************/ +#define MAX_CU_SIZE 64 +#define BIT_DEPTH 8 +#define T32_4NT 128 +#define T16_4NT 64 + + +/****************************************************************************/ +/* Function Macros */ +/****************************************************************************/ +#define GET_BITS(y,x) ((y) & (1 << x)) && (1 << x) + +/* tables to shuffle 8-bit values */ + + +/*****************************************************************************/ +/* global tables Definition */ +/*****************************************************************************/ + + +/*****************************************************************************/ +/* Function Definition */ +/*****************************************************************************/ + + +/** +******************************************************************************* +* +* @brief +* Intra prediction interpolation filter for pu1_ref substitution +* +* +* @par Description: +* Reference substitution process for samples unavailable for prediction +* Refer to section 8.4.4.2.2 +* +* @param[in] pu1_top_left +* UWORD8 pointer to the top-left +* +* @param[in] pu1_top +* UWORD8 pointer to the top +* +* @param[in] pu1_left +* UWORD8 pointer to the left +* +* @param[in] src_strd +* WORD32 Source stride +* +* @param[in] nbr_flags +* WORD32 neighbor availability flags +* +* @param[in] nt +* WORD32 transform Block size +* +* @param[in] 
dst_strd
* WORD32 Destination stride
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_ref_substitution_ssse3(UWORD8 *pu1_top_left,
                                                  UWORD8 *pu1_top,
                                                  UWORD8 *pu1_left,
                                                  WORD32 src_strd,
                                                  WORD32 nt,
                                                  WORD32 nbr_flags,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd)
{
    UWORD8 pu1_ref;
    WORD32 dc_val, i;
    WORD32 total_samples = (4 * nt) + 1;     /* 2*nt left + top-left + 2*nt top samples */
    WORD32 two_nt = 2 * nt;

    WORD32 three_nt = 3 * nt;
    WORD32 get_bits;
    WORD32 next;
    WORD32 bot_left, left, top, tp_right, tp_left;

    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
    UNUSED(dst_strd);

    /* Mid-range value used when no neighbor is available at all */
    dc_val = 1 << (BIT_DEPTH - 1);


    /* Neighbor Flag Structure*/
    /* MSB ---> LSB */
    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
              1         4         4     4         4
     */
    /* If no neighbor flags are present, fill the neighbor samples with DC value */
    if(nbr_flags == 0)
    {
        for(i = 0; i < total_samples; i++)
        {
            pu1_dst[i] = dc_val;
        }
    }
    else
    {
        /* Else fill the corresponding samples: pu1_dst holds, in order,     */
        /* left samples bottom-to-top [0..2nt-1], top-left [2nt],            */
        /* top samples left-to-right [2nt+1..4nt]                            */
        pu1_dst[two_nt] = *pu1_top_left;
        for(i = 0; i < two_nt; i++)
            pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
        for(i = 0; i < two_nt; i++)
            pu1_dst[two_nt + 1 + i] = pu1_top[i];

        if(nt <= 8)
        {
            /* 1 bit extraction for all the neighboring blocks */
            tp_left = (nbr_flags & 0x10000) >> 16;
            bot_left = (nbr_flags & 0x8) >> 3;
            left = (nbr_flags & 0x80) >> 7;
            top = (nbr_flags & 0x100) >> 8;
            tp_right = (nbr_flags & 0x1000) >> 12;

            next = 1;

            /* If bottom-left is not available, reverse substitution process */
            if(bot_left == 0)
            {
                /* Availability flags ordered bottom-left -> top-right for the scan below */
                WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };

                /* Check for the 1st available sample from bottom-left*/
                while(!a_nbr_flag[next])
                    next++;

                /* If Left, top-left are available*/
                if(next <= 2)
                {
                    idx = nt * next;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
                else /* If top, top-right are available */
                {
                    /* Idx is changed to copy 1 pixel value for top-left, if top-left is not available*/
                    idx = (nt * (next - 1)) + 1;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Forward Substitution Process */
            /* If left is Unavailable, copy the last bottom-left value */
            if(left == 0)
            {
                for(i = 0; i < nt; i++)
                    pu1_dst[nt + i] = pu1_dst[nt - 1];
            }
            /* If top-left is Unavailable, copy the last left value */
            if(tp_left == 0)
                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
            /* If top is Unavailable, copy the last top-left value */
            if(top == 0)
            {
                for(i = 0; i < nt; i++)
                    pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
            }
            /* If top-right is Unavailable, copy the last top value */
            if(tp_right == 0)
            {
                for(i = 0; i < nt; i++)
                    pu1_dst[three_nt + 1 + i] = pu1_dst[three_nt];
            }
        }

        if(nt == 16)
        {
            /* Repack 8-pel-granularity flag pairs into 2-bit fields so each bit
               again covers 8 pels (plus 1 top-left bit at position 8) */
            WORD32 nbr_flags_temp = 0;
            nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
                            + ((nbr_flags & 0x300) >> 4)
                            + ((nbr_flags & 0x3000) >> 6)
                            + ((nbr_flags & 0x10000) >> 8);

            /* compute trailing zeros based on nbr_flag for substitution process of below left. See section 8.4.4.2.2 */
            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
            {
                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */

                if(nbr_id_from_bl == 64)
                    nbr_id_from_bl = 32;

                if(nbr_id_from_bl == 32)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags_temp >> 8) & 0x1))
                    {
                        nbr_id_from_bl++;
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */
                        //nbr_id_from_bl += idx * 8;
                    }
                }
                /* Reverse Substitution Process*/
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                    {
                        pu1_dst[i] = pu1_ref;
                    }
                }
            }

            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T16_4NT) + 1))
            {
                /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/

                /* The Top-left flag is at the last bit location of nbr_flags*/
                if(nbr_id_from_bl == (T16_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags_temp, 8);

                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }

                }
                /* Top-left is a single pel; every other flag covers 8 pels */
                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
            }


        }

        if(nt == 32)
        {
            /* compute trailing zeros based on nbr_flag for substitution process of below left. See section 8.4.4.2.2 */
            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
            {
                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */

                if(nbr_id_from_bl == 64)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags >> 16) & 0x1))
                    {
                        /* top left not available */
                        nbr_id_from_bl++;
                        /* top and top right; 8 pels per nbr bit */
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
                    }
                }
                /* Reverse Substitution Process*/
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T32_4NT) + 1))
            {
                /* To Obtain the next
unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/

                /* The Top-left flag is at the last bit location of nbr_flags*/
                if(nbr_id_from_bl == (T32_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags, 16);
                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }

                }
                /* Top-left is a single pel; every other flag covers 8 pels */
                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
            }
        }

    }
}

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for ref_filtering
*
*
* @par Description:
*  Reference DC filtering for neighboring samples dependent on TU size and
*  mode. Refer to section 8.4.4.2.3 in the standard
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_ref_filtering_ssse3(UWORD8 *pu1_src,
                                          WORD32 nt,
                                          UWORD8 *pu1_dst,
                                          WORD32 mode,
                                          WORD32 strong_intra_smoothing_enable_flag)
{
    WORD32 filter_flag;
    WORD32 i; /* Generic indexing variable */
    WORD32 four_nt = 4 * nt;
    UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1];   /* scratch buffer for the filtered reference row */
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);    /* threshold for strong-smoothing decision */
    __m128i src_temp1, src_temp2, src_temp3, src_temp7;
    __m128i src_temp4, src_temp5, src_temp6, src_temp8;

    //WORD32 strong_intra_smoothing_enable_flag = 1;

    /* Per-mode, per-size lookup: decides whether the (1,2,1)/4 smoothing filter applies */
    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
    if(0 == filter_flag)
    {
        /* No filtering: plain copy of the 4*nt+1 reference samples (in-place is a no-op) */
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            if(nt == 4)
            {
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                pu1_dst[four_nt] = pu1_src[four_nt];

            }

            else if(nt == 8)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);


                pu1_dst[four_nt] = pu1_src[four_nt];
            }
            else if(nt == 16)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

                pu1_dst[four_nt] = pu1_src[four_nt];
            }
            else if(nt == 32)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));

                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_src + 64));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_src + 80));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_src + 96));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_src + 112));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

                _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);

                pu1_dst[four_nt] = pu1_src[four_nt];
            }

        }
    }

    else
    {
        /* If strong intra smoothing is enabled and transform size is 32 */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong Intra Filtering: check both edges for near-linear ramps */
            abs_cond_top_flag = (abs(pu1_src[2 * nt] + pu1_src[4 * nt]
                                     - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (abs(pu1_src[2 * nt] + pu1_src[0]
                                      - (2 * pu1_src[nt]))) < dc_val;

            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                            && (1 == abs_cond_top_flag));
        }
        /* Extremities Untouched*/
        au1_flt[0] = pu1_src[0];
        au1_flt[4 * nt] = pu1_src[4 * nt];

        /* Strong filtering of reference samples: bilinear interpolation between corners */
        if(1 == bi_linear_int_flag)
        {
            au1_flt[2 * nt] = pu1_src[2 * nt];

            for(i = 1; i < (2 * nt); i++)
                au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6;

            for(i = 1; i < (2 * nt); i++)
                au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
        }
        else
        {
            __m128i const_value_8x16, zero_8x16b;

            const_value_8x16 = _mm_set1_epi16(2);   /* rounding offset for the (1,2,1)/4 filter */

            au1_flt[0] = pu1_src[0];
            au1_flt[4 * nt] = pu1_src[4 * nt];

            zero_8x16b = _mm_setzero_si128();

            /* Perform bilinear filtering of Reference Samples:
               au1_flt[i+1] = (src[i] + 2*src[i+1] + src[i+2] + 2) >> 2, 16 outputs per pass */
            /* NOTE(review): the last iteration's loads appear to read up to 7 bytes past
               pu1_src[four_nt]; presumably the caller's buffer is padded — verify */
            for(i = 0; i < (four_nt); i += 16)
            {
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src + i));
                src_temp2 = _mm_srli_si128(src_temp1, 1);
                src_temp3 = _mm_srli_si128(src_temp2, 1);

                src_temp1 = _mm_unpacklo_epi8(src_temp1, zero_8x16b);
                src_temp2 = _mm_unpacklo_epi8(src_temp2, zero_8x16b);
                src_temp3 = _mm_unpacklo_epi8(src_temp3, zero_8x16b);

                src_temp2 = _mm_slli_epi16(src_temp2, 1);

                src_temp1 = _mm_add_epi16(src_temp1, src_temp2);
                src_temp1 = _mm_add_epi16(src_temp1, src_temp3);
                src_temp1 = _mm_add_epi16(src_temp1, const_value_8x16);

                src_temp1 = _mm_srai_epi16(src_temp1, 2);

                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 8 + i));
                src_temp5 = _mm_srli_si128(src_temp4, 1);
                src_temp6 = _mm_srli_si128(src_temp5, 1);

                src_temp4 = _mm_unpacklo_epi8(src_temp4, zero_8x16b);
                src_temp5 = _mm_unpacklo_epi8(src_temp5, zero_8x16b);
                src_temp6 = _mm_unpacklo_epi8(src_temp6, zero_8x16b);

                src_temp5 = _mm_slli_epi16(src_temp5, 1);

                src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
                src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
                src_temp4 = _mm_add_epi16(src_temp4, const_value_8x16);

                src_temp4 = _mm_srai_epi16(src_temp4, 2);

                /* converting 16 bit to 8 bit */
                src_temp1 = _mm_packus_epi16(src_temp1, src_temp4);

                _mm_storeu_si128((__m128i *)(au1_flt + 1 + i), src_temp1);
            }
            au1_flt[4 * nt] = pu1_src[4 * nt];
        }

        /* Copy the filtered buffer out to pu1_dst */
        if(nt == 4)
        {
            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            pu1_dst[four_nt] = au1_flt[four_nt];
        }
        else if(nt == 8)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }
        else if(nt == 16)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }

        else if(nt == 32)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt +
48));

            src_temp5 = _mm_loadu_si128((__m128i *)(au1_flt + 64));
            src_temp6 = _mm_loadu_si128((__m128i *)(au1_flt + 80));
            src_temp7 = _mm_loadu_si128((__m128i *)(au1_flt + 96));
            src_temp8 = _mm_loadu_si128((__m128i *)(au1_flt + 112));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

            _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
            _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }

    }
}

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma planar
*
* @par Description:
*  Planar Intraprediction with reference neighboring samples location
*  pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'. Refer
*  to section 8.4.4.2.4 in the standard
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_luma_planar_ssse3(UWORD8 *pu1_ref,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_dst,
                                        WORD32 dst_strd,
                                        WORD32 nt,
                                        WORD32 mode)
{


    WORD32 row, col;
    WORD32 two_nt, three_nt;
    UWORD16 temp;

    __m128i pu1_ref_16x8b, const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
    __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b, const_temp8_4x32b;
    __m128i nt_row_16x8b, nt_row1_16x8b, nt_row2_16x8b, nt_row3_16x8b; //nt-1-row
    __m128i row_16x8b, row1_16x8b, row2_16x8b, row3_16x8b; //row+1
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;
    three_nt = 3 * nt;

    /* Planar filtering */
    /* Pack the two corner references (bottom-left, top-right) into each 16-bit
       lane so one maddubs computes (row+1)*ref[nt-1] + (col+1)*ref[3nt+1] */
    temp = pu1_ref[nt - 1];
    temp = (temp << 8) | ((UWORD16)pu1_ref[three_nt + 1]);
    /* setting values in registers */
    pu1_ref_16x8b = _mm_set1_epi16(temp);
    const_temp6_4x32b = _mm_set1_epi16(nt);   /* rounding offset added before the final shift */



    if(nt == 32) /* nt == 32: one full 32-pel row per iteration */
    {


        const_temp4_4x32b = _mm_set1_epi16(0x0400);
        const_temp1_4x32b = _mm_set1_epi16(0x0100);
        const_temp8_4x32b = _mm_set1_epi16(0x0008);
        //(nt-1-y) (nt-1-x) ; x= 0..15 , y = row
        //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
        nt_row_16x8b = _mm_set_epi16(0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x1f1e, 0x1f1f);
        //(y+1) (x+1) ; x= 0..15 , y = row
        //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
        row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);

        for(row = 0; row < nt; row += 1)
        {
            __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
            __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;

            __m128i src_temp_8x16b, src_temp1_8x16b;


            res_temp1_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);

            /* column offsets 8, 16, 24 relative to the base weight vectors */
            nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp8_4x32b);
            row1_16x8b    = _mm_add_epi16(row_16x8b, const_temp8_4x32b);
            nt_row2_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp8_4x32b);
            row2_16x8b    = _mm_add_epi16(row1_16x8b, const_temp8_4x32b);
            nt_row3_16x8b = _mm_sub_epi16(nt_row2_16x8b, const_temp8_4x32b);
            row3_16x8b    = _mm_add_epi16(row2_16x8b, const_temp8_4x32b);
            /* loading 32 8-bit top reference pixels */
            src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
            src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17));

            res_temp4_8x16b = _mm_unpacklo_epi8(res_temp1_8x16b, src_temp_8x16b); /* cols 0..7 */
            res_temp5_8x16b = _mm_unpackhi_epi8(res_temp1_8x16b, src_temp_8x16b); /* cols 8..15 */
            res_temp6_8x16b = _mm_unpacklo_epi8(res_temp1_8x16b, src_temp1_8x16b); /* cols 16..23 */
            res_temp7_8x16b = _mm_unpackhi_epi8(res_temp1_8x16b, src_temp1_8x16b); /* cols 24..31 */

            /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
            res_temp_8x16b  = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
            res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
            res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
            res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
            /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
            res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
            res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
            res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
            res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);

            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
            /*res_temp + nt)*/
            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);

            res_temp_8x16b  = _mm_srli_epi16(res_temp_8x16b, 6); //log2(32)+1
            res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 6);
            res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 6);
            res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 6);

            res_temp_8x16b  = _mm_packus_epi16(res_temp_8x16b, res_temp1_8x16b);
            res_temp1_8x16b = _mm_packus_epi16(res_temp2_8x16b, res_temp3_8x16b);


            _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd) + 16), res_temp1_8x16b);


            nt_row_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp1_4x32b);
            row_16x8b = _mm_add_epi16(row_16x8b, const_temp1_4x32b);
        }
    }
    else if(nt == 16) /* nt == 16: two rows per iteration */
    {

        const_temp4_4x32b = _mm_set1_epi16(0x0400);
        const_temp1_4x32b = _mm_set1_epi16(0x0100);
        const_temp8_4x32b = _mm_set1_epi16(0x0008);
        //(nt-1-y) (nt-1-x) ; x= 0..15 , y = row
        //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
        nt_row_16x8b = _mm_set_epi16(0x0f08, 0x0f09, 0x0f0a, 0x0f0b, 0x0f0c, 0x0f0d, 0x0f0e, 0x0f0f);
        //(y+1) (x+1) ; x= 0..15 , y = row
        //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
        row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);

        for(row = 0; row < nt; row += 2)
        {
            __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
            __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;

            __m128i src_temp_8x16b;


            res_temp1_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
            res_temp2_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]);


            /* weight vectors for (row, cols 8..15) and (row+1, cols 0..7 / 8..15) */
            nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp1_4x32b);
            row1_16x8b    = _mm_add_epi16(row_16x8b, const_temp1_4x32b);
            nt_row2_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp8_4x32b);
            row2_16x8b    = _mm_add_epi16(row_16x8b, const_temp8_4x32b);
            nt_row3_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp8_4x32b);
            row3_16x8b    = _mm_add_epi16(row1_16x8b, const_temp8_4x32b);
            /* loading 16 8-bit top reference pixels */
            src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));


            res_temp4_8x16b = _mm_unpacklo_epi8(res_temp1_8x16b, src_temp_8x16b); /* row, cols 0..7 */
            res_temp5_8x16b = _mm_unpacklo_epi8(res_temp2_8x16b, src_temp_8x16b); /* row+1, cols 0..7 */
            res_temp6_8x16b = _mm_unpackhi_epi8(res_temp1_8x16b, src_temp_8x16b); /* row, cols 8..15 */
            res_temp7_8x16b = _mm_unpackhi_epi8(res_temp2_8x16b, src_temp_8x16b); /* row+1, cols 8..15 */

            /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
            res_temp_8x16b  = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
            res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
            res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
            res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
            /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
            res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
            res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
            res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
            res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);

            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
            /*res_temp + nt)*/
            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);

            res_temp_8x16b  = _mm_srli_epi16(res_temp_8x16b, 5); //log2(16)+1
            res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 5);
            res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 5);
            res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 5);

            res_temp_8x16b  = _mm_packus_epi16(res_temp_8x16b, res_temp2_8x16b);
            res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, res_temp3_8x16b);

            _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), res_temp1_8x16b);

            nt_row_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp1_4x32b);
            row_16x8b = _mm_add_epi16(row1_16x8b, const_temp1_4x32b);
        }
    }
    else if(nt == 8) /* nt == 8: four rows per iteration */
    {


        const_temp4_4x32b = _mm_set1_epi16(0x0400);
        const_temp1_4x32b = _mm_set1_epi16(0x0100);
        zero_8x16b = _mm_set1_epi32(0);

        //(nt-1-y) (nt-1-x) ; x= 0..7 , y = row
        //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
        nt_row_16x8b = _mm_set_epi16(0x0700, 0x0701, 0x0702, 0x0703, 0x0704, 0x0705, 0x0706, 0x0707);
        //(y+1) (x+1) ; x= 0..7 , y = row
        //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
        row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);

        for(row = 0; row < nt; row += 4)
        {
            __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
            __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;

            __m128i src_temp_8x16b;


            res_temp4_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
            res_temp5_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]);
            res_temp6_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 3 - row]);
            res_temp7_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 4 - row]);

            nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp1_4x32b);
            row1_16x8b    = _mm_add_epi16(row_16x8b, const_temp1_4x32b);
            nt_row2_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp1_4x32b);
            row2_16x8b    = _mm_add_epi16(row1_16x8b, const_temp1_4x32b);
            nt_row3_16x8b = _mm_sub_epi16(nt_row2_16x8b, const_temp1_4x32b);
            row3_16x8b    = _mm_add_epi16(row2_16x8b, const_temp1_4x32b);
            /* loading 8 8-bit top reference pixels */
            src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

            res_temp4_8x16b = _mm_unpacklo_epi8(res_temp4_8x16b, src_temp_8x16b); /* row+0 */
            res_temp5_8x16b = _mm_unpacklo_epi8(res_temp5_8x16b, src_temp_8x16b); /* row+1 */
            res_temp6_8x16b = _mm_unpacklo_epi8(res_temp6_8x16b, src_temp_8x16b); /* row+2 */
            res_temp7_8x16b = _mm_unpacklo_epi8(res_temp7_8x16b, src_temp_8x16b); /* row+3 */

            /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
            res_temp_8x16b  = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
            res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
            res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
            res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
            /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
            res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
            res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
            res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
            res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);

            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
            /*res_temp + nt)*/
            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);

            res_temp_8x16b  = _mm_srli_epi16(res_temp_8x16b, 4); //log2(8)+1
            res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 4);
            res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 4);
            res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 4);

            res_temp_8x16b  = _mm_packus_epi16(res_temp_8x16b, zero_8x16b);
            res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
            res_temp2_8x16b = _mm_packus_epi16(res_temp2_8x16b, zero_8x16b);
            res_temp3_8x16b = _mm_packus_epi16(res_temp3_8x16b, zero_8x16b);

            _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), res_temp1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), res_temp2_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), res_temp3_8x16b);

            nt_row_16x8b = _mm_sub_epi16(nt_row3_16x8b, const_temp1_4x32b);
            row_16x8b = _mm_add_epi16(row3_16x8b, const_temp1_4x32b);
        }
    }
    else
    {

        /* for nt multiple of 4: scalar-weight path, 4 pixels per inner iteration */
        const_temp7_4x32b = _mm_set1_epi16(4);
        const_temp4_4x32b = _mm_set1_epi16(nt - 1);
        const_temp_4x32b = _mm_set1_epi16(pu1_ref[three_nt + 1]);
        const_temp1_4x32b = _mm_set1_epi16(pu1_ref[nt - 1]);
        zero_8x16b = _mm_set1_epi32(0);

        for(row = 0; row < nt; row++)
        {
            __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
            __m128i res_temp3_8x16b;

            const_temp2_4x32b = _mm_set1_epi16(pu1_ref[two_nt - 1 - row]);
            const_temp3_4x32b = _mm_set1_epi16((row + 1));


            row_8x16b = _mm_set1_epi16((nt - 1 - row));

            const_temp5_4x32b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
            col_8x16b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);

            const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);

            /*(row + 1) * pu1_ref[nt - 1]*/
            res_temp_8x16b = _mm_mullo_epi16(const_temp3_4x32b, const_temp1_4x32b);

            /*(row + 1) * pu1_ref[nt - 1] + nt)*/
            res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);

            for(col = 0; col < nt; col += 4)
            {
                __m128i src_temp_8x16b;
                int temp1;

                /* loading 8-bit reference pixels */
                src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + col));

                src_temp_8x16b = _mm_unpacklo_epi8(src_temp_8x16b, zero_8x16b);

                /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
                res_temp1_8x16b = _mm_mullo_epi16(src_temp_8x16b, row_8x16b);

                /*(col + 1) * pu1_ref[three_nt + 1]*/
                res_temp2_8x16b = _mm_mullo_epi16(const_temp_4x32b, col_8x16b);

                /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
                res_temp3_8x16b =
_mm_mullo_epi16(const_temp2_4x32b, const_temp5_4x32b);

                res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);

                /* (sum + nt) >> (log2nt + 1); this branch handles nt == 4, so shift by log2(4) + 1 = 3 */
                res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 3);
                res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);

                temp1 = _mm_cvtsi128_si32(res_temp1_8x16b);

                *(WORD32 *)(&pu1_dst[(row * dst_strd) + col]) = temp1;

                const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
                col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
            } /* inner loop ends here */
        }
    }


}

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma dc
*
* @par Description:
*   Intraprediction for DC mode with reference neighboring samples location
*   pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'. Refer
*   to section 8.4.4.2.5 in the standard
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference neighboring samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_dc_ssse3(UWORD8 *pu1_ref,
                                    WORD32 src_strd,
                                    UWORD8 *pu1_dst,
                                    WORD32 dst_strd,
                                    WORD32 nt,
                                    WORD32 mode)
{

    WORD32 acc_dc;
    WORD32 dc_val, two_dc_val, three_dc_val;
    WORD32 row;
    WORD32 log2nt = 5;
    WORD32 two_nt, three_nt;
    __m128i src_temp1, src_temp7, src_temp3, src_temp4, src_temp5, src_temp6;
    __m128i src_temp8, src_temp10, src_temp2;
    __m128i m_zero = _mm_setzero_si128();
    __m128i sm = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK5[0]);
    UNUSED(src_strd);
    UNUSED(mode);


    switch(nt)
    {
        case 32:
            log2nt = 5;
            break;
        case 16:
            log2nt = 4;
            break;
        case 8:
            log2nt = 3;
            break;
        case 4:
            log2nt = 2;
            break;
        default:
            break;
    }
    two_nt = 2 * nt;
    three_nt = 3 * nt;

    acc_dc = 0;
    /* Calculate DC value for the transform block */



    if(nt == 32)
    {
        __m128i temp;
        WORD32 itr_count;

        /* sum the 2*nt top + left reference samples (pu1_ref[nt .. 3*nt-1]) via SAD against zero */
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
        src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
        src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));

        src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
        src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
        src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
        src_temp8 = _mm_sad_epu8(src_temp8, m_zero);

        src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);

        src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        acc_dc = _mm_cvtsi128_si32(src_temp4);

        /* adjust the summation window: add pu1_ref[3*nt], drop pu1_ref[2*nt] */
        acc_dc += pu1_ref[three_nt];
        acc_dc -= pu1_ref[two_nt];

        /* computing acc_dc value */
        dc_val = (acc_dc + nt) >> (log2nt + 1);

        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;

        temp = _mm_set1_epi8(dc_val);

        /* no boundary filtering for nt == 32: fill the whole 32x32 block with dc_val */
        for(itr_count = 0; itr_count < 2; itr_count++)
        {
            /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
            _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp);

            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp);

            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp);

            pu1_dst += 16 * dst_strd;
        }
    }
    else

    {
        __m128i sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);

        /* DC filtering for the first top row and first left column */



        if(nt == 4) /* nt == 4 */
        {
            WORD32 temp1, temp2, temp3;

            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

            src_temp4 = _mm_unpacklo_epi8(src_temp3, m_zero);
            src_temp2 = _mm_unpacklo_epi8(src_temp2, m_zero);

            /* three horizontal adds reduce the 8 16-bit lanes to the scalar sum */
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

            acc_dc = _mm_cvtsi128_si32(src_temp4);
            acc_dc += pu1_ref[three_nt];
            acc_dc -= pu1_ref[two_nt];

/* computing acc_dc value */

            dc_val = (acc_dc + nt) >> (log2nt + 1);

            three_dc_val = 3 * dc_val;

            /* broadcast (3*dc_val + 2) for the first-row filter */
            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
            two_dc_val = 2 * dc_val;

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2 */
            src_temp2 = _mm_srli_epi16(src_temp2, 2);

            src_temp2 = _mm_packus_epi16(src_temp2, m_zero);

            temp1 = _mm_cvtsi128_si32(src_temp2);

            *(WORD32 *)(&pu1_dst[0]) = temp1;

            src_temp2 = _mm_insert_epi16(src_temp2, dc_val, 0);

            /* sm1 broadcasts a single byte, so rows 1-3 are filled with dc_val;
               NOTE(review): all three shuffles read src_temp2 — presumably intentional
               since they produce identical rows */
            src_temp2 = _mm_shuffle_epi8(src_temp2, sm1);
            src_temp3 = _mm_shuffle_epi8(src_temp2, sm1);
            src_temp4 = _mm_shuffle_epi8(src_temp2, sm1);

            temp1 = _mm_cvtsi128_si32(src_temp2);
            temp2 = _mm_cvtsi128_si32(src_temp3);
            temp3 = _mm_cvtsi128_si32(src_temp4);

            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;

            /* restore first value */
            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
                            >> 2);

            /* filter the first left column: (left_ref + 3*dc_val + 2) >> 2 */
            for(row = 1; row < nt; row++)
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
                                >> 2;

        }
        else if(nt == 8) /* nt == 8 */
        {

            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));

            src_temp4 = _mm_sad_epu8(src_temp3, m_zero);
            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

            acc_dc = _mm_cvtsi128_si32(src_temp4);

            acc_dc += pu1_ref[three_nt];
            acc_dc -= pu1_ref[two_nt];

            /* computing acc_dc value */

            dc_val = (acc_dc + nt) >> (log2nt + 1);

            three_dc_val = 3 * dc_val;
            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
            two_dc_val = 2 * dc_val;

            /* loading 16 8-bit pixels */
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
            src_temp2 = _mm_unpacklo_epi8(src_temp2, m_zero);

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
            src_temp2 = _mm_srli_epi16(src_temp2, 2);
            src_temp2 = _mm_packus_epi16(src_temp2, m_zero);

            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp2);

            /* Fill the remaining rows with DC value*/

            src_temp1 = _mm_set1_epi8(dc_val);
            src_temp2 = _mm_set1_epi8(dc_val);
            src_temp3 = _mm_set1_epi8(dc_val);
            src_temp4 = _mm_set1_epi8(dc_val);
            src_temp5 = _mm_set1_epi8(dc_val);
            src_temp6 = _mm_set1_epi8(dc_val);
            src_temp7 = _mm_set1_epi8(dc_val);

            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);

            /* restore first value */
            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
                            >> 2);

            for(row = 1; row < nt; row++)
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
                                >> 2;

        }
        else /* if nt == 16*/
        {

            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));

            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
            src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));

            src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
            src_temp4 = _mm_sad_epu8(src_temp4, m_zero);

            src_temp2 = _mm_unpacklo_epi8(src_temp2, m_zero);
            src_temp10 = _mm_unpacklo_epi8(src_temp10, m_zero);

            src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

            acc_dc = _mm_cvtsi128_si32(src_temp4);

            acc_dc += pu1_ref[three_nt];
            acc_dc -= pu1_ref[two_nt];

            /* computing acc_dc value */

            dc_val = (acc_dc + nt) >> (log2nt + 1);

            three_dc_val = 3 * dc_val;
            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
            two_dc_val = 2 * dc_val;

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
            src_temp10 = _mm_add_epi16(src_temp10, src_temp1);
            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
            src_temp2 = _mm_srli_epi16(src_temp2, 2);
            src_temp10 = _mm_srli_epi16(src_temp10, 2);

            src_temp2 = _mm_packus_epi16(src_temp2, src_temp10);

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);

            /* Fill the remaining rows with DC value*/
            src_temp1 = _mm_set1_epi8(dc_val);
            src_temp2 = _mm_set1_epi8(dc_val);
            src_temp3 = _mm_set1_epi8(dc_val);
            src_temp4 = _mm_set1_epi8(dc_val);
            src_temp5 = _mm_set1_epi8(dc_val);
            src_temp6 = _mm_set1_epi8(dc_val);
            src_temp7 = _mm_set1_epi8(dc_val);

            /* NOTE(review): the loop body stores to fixed rows 1..15, so with
               row += 8 the second iteration rewrites the same rows — redundant
               work, though the output is unchanged */
            for(row = 1; row < nt; row += 8)
            {

                _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)),
src_temp7);

                _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp3);

                _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp7);

                _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp1);

            }

            /* restore first value */
            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
                            >> 2);

            /* filter the first left column: (left_ref + 3*dc_val + 2) >> 2 */
            for(row = 1; row < nt; row++)
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
                                >> 2;

        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for horizontal luma variable.
*
* @par Description:
*   Horizontal intraprediction(mode 10) with reference  samples location
*   pointed by 'pu1_ref' to the  TU block  location pointed by 'pu1_dst'. Refer
*   to section 8.4.4.2.6 in the standard (Special case)
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference neighboring samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_horz_ssse3(UWORD8 *pu1_ref,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_dst,
                                      WORD32 dst_strd,
                                      WORD32 nt,
                                      WORD32 mode)
{

    WORD32 row;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;


    if(nt == 32)
    {
        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
        __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
        __m128i sm = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);

        /* no boundary filtering for nt == 32: row r is pu1_ref[two_nt - 1 - r]
           broadcast across the row. 16 left samples are loaded at once, then each
           byte is isolated by a shift and broadcast with the shuffle mask. */
        for(row = 0; row < nt; row += 16)
        {
            {
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 - row - 15));

                src_temp2 = _mm_srli_si128(src_temp1, 1);
                src_temp3 = _mm_srli_si128(src_temp1, 2);
                src_temp4 = _mm_srli_si128(src_temp1, 3);
                src_temp5 = _mm_srli_si128(src_temp1, 4);
                src_temp6 = _mm_srli_si128(src_temp1, 5);
                src_temp7 = _mm_srli_si128(src_temp1, 6);
                src_temp8 = _mm_srli_si128(src_temp1, 7);

                src_temp9 = _mm_srli_si128(src_temp1, 8);
                src_temp10 = _mm_srli_si128(src_temp1, 9);
                src_temp11 = _mm_srli_si128(src_temp1, 10);
                src_temp12 = _mm_srli_si128(src_temp1, 11);
                src_temp13 = _mm_srli_si128(src_temp1, 12);
                src_temp14 = _mm_srli_si128(src_temp1, 13);
                src_temp15 = _mm_srli_si128(src_temp1, 14);
                src_temp16 = _mm_srli_si128(src_temp1, 15);

                src_temp8 = _mm_shuffle_epi8(src_temp8, sm);
                src_temp7 = _mm_shuffle_epi8(src_temp7, sm);
                src_temp6 = _mm_shuffle_epi8(src_temp6, sm);
                src_temp5 = _mm_shuffle_epi8(src_temp5, sm);
                src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
                src_temp3 = _mm_shuffle_epi8(src_temp3, sm);
                src_temp2 = _mm_shuffle_epi8(src_temp2, sm);
                src_temp1 = _mm_shuffle_epi8(src_temp1, sm);

                src_temp16 = _mm_shuffle_epi8(src_temp16, sm);
                src_temp15 = _mm_shuffle_epi8(src_temp15, sm);
                src_temp14 = _mm_shuffle_epi8(src_temp14, sm);
                src_temp13 = _mm_shuffle_epi8(src_temp13, sm);
                src_temp12 = _mm_shuffle_epi8(src_temp12, sm);
                src_temp11 = _mm_shuffle_epi8(src_temp11, sm);
                src_temp10 = _mm_shuffle_epi8(src_temp10, sm);
                src_temp9 = _mm_shuffle_epi8(src_temp9, sm);

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp16);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp12);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp9);

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 8) * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 9) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 10) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 11) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 12) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 13) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 14) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 15) * dst_strd)), src_temp1);

                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 0) * dst_strd)), src_temp16);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 1) * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 2) * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 3) * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 4) * dst_strd)), src_temp12);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 5) * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 6) * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 7) * dst_strd)), src_temp9);

                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 8) * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 9) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 10) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 11) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 12) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 13) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 14) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 15) * dst_strd)), src_temp1);

            }

        }

    }
    else

    {
        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6;
        __m128i src_temp10, zero_8x16b, src_temp7;

        /* DC filtering for the first top row and first left column */

        zero_8x16b = _mm_set1_epi16(0);

        /*Filtering done for the 1st row */

        src_temp2 = _mm_set1_epi16(pu1_ref[two_nt - 1]);
        src_temp10 = _mm_set1_epi16(pu1_ref[two_nt]);

        /* loading 16 8-bit pixels */
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

        src_temp4 = _mm_unpacklo_epi8(src_temp4, zero_8x16b);

        /*(pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt])*/
        src_temp3 = _mm_sub_epi16(src_temp4, src_temp10);

        /* ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
        src_temp3 = _mm_srai_epi16(src_temp3, 1);

        /* pu1_ref[two_nt - 1]+((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
        src_temp3 = _mm_add_epi16(src_temp2, src_temp3);

        if(nt == 4)
        {
            int temp1, temp2, temp3;
            src_temp3 = _mm_packus_epi16(src_temp3, zero_8x16b);
            temp1 = _mm_cvtsi128_si32(src_temp3);

            *(WORD32 *)(&pu1_dst[0]) = temp1;

            src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 4]);

            temp1 = _mm_cvtsi128_si32(src_temp2);
            temp2 = _mm_cvtsi128_si32(src_temp3);
            temp3 = _mm_cvtsi128_si32(src_temp4);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;

        }
        else if(nt == 8)
        {
            src_temp10 = _mm_packus_epi16(src_temp3, zero_8x16b);


            src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
            src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]);
            src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]);
            src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]);
            src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]);

            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp10);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp7);

        }
        else if(nt == 16)
        {
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
            src_temp4 = _mm_unpacklo_epi8(src_temp4, zero_8x16b);
            //src_temp4 = _mm_cvtepu8_epi16 (src_temp4);

            /* first-row filter for the upper 8 columns */
            src_temp10 = _mm_sub_epi16(src_temp4, src_temp10);
            src_temp10 = _mm_srai_epi16(src_temp10, 1);
            src_temp10 = _mm_add_epi16(src_temp2, src_temp10);

            src_temp3 = _mm_packus_epi16(src_temp3, src_temp10);
            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp3);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
            src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]);
            src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]);
            src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]);
            src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]);
            src_temp10 = _mm_set1_epi8(pu1_ref[two_nt - 9]);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp10);

            src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 10]);
            src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 11]);
            src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 12]);
            src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 13]);
            src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 14]);
            src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 15]);
            src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 16]);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) *
dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp7);

        }
    }
}


/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for vertical luma variable.
*
* @par Description:
*   Vertical intraprediction with reference neighboring samples location
*   pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'. Refer
*   to section 8.4.4.2.6 in the standard (Special case)
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference neighboring samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_luma_ver_ssse3(UWORD8 *pu1_ref,
                                     WORD32 src_strd,
                                     UWORD8 *pu1_dst,
                                     WORD32 dst_strd,
                                     WORD32 nt,
                                     WORD32 mode)
{
    WORD32 row;
    WORD16 s2_predpixel;
    WORD32 two_nt = 2 * nt;
    __m128i src_temp0, src_temp2;
    UNUSED(src_strd);
    UNUSED(mode);


    if(nt == 32)
    {
        __m128i temp1, temp2;
        WORD32 itr_count;

        /* no boundary filtering for nt == 32: every row is a copy of the
           32 top reference samples pu1_ref[two_nt + 1 .. two_nt + 32] */
        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
        temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));

        for(itr_count = 0; itr_count < 2; itr_count++)
        {
            /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
            _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);

            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), tem2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);

            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);

            pu1_dst += 16 * dst_strd;
        }
    }
    else
    {
        /* Replication to next columns*/

        if(nt == 4)
        {
            int temp1;

            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

            temp1 = _mm_cvtsi128_si32(src_temp2);

            /* replicate the 4 top reference pixels into every row */
            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp1;

        }
        else if(nt == 8)
        {

            src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

            _mm_storel_epi64((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp0);


        }
        else if(nt == 16)
        {
            for(row = 0; row < nt; row += 8)
            {

                src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp0);

            }

        }

        /*Filtering done for the 1st column */
        for(row = nt - 1; row >= 0; row--)
        {
            s2_predpixel = pu1_ref[two_nt + 1]
                            + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
            pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel);
        }


    }
}

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma mode2.
*
* @par Description:
*   Intraprediction for mode 2 (sw angle) with reference neighboring samples
*   location pointed by 'pu1_ref' to the TU block location pointed by
*   'pu1_dst'. Refer to section 8.4.4.2.6 in the standard
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference neighboring samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_mode2_ssse3(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt = 2 * nt;

    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
    __m128i sm1, sm2, sm3;
    UNUSED(src_strd);
    UNUSED(mode);


    sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY1[0]);
    sm2 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY2[0]);
    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY3[0]);

    /* For the angle 45, replication is done from the corresponding angle */
    /* intra_pred_ang = tan(angle) in q5 format */

    if(nt == 4)
    {
        int temp1, temp2, temp3, temp4;

        /*pu1_ref[two_nt - row - (col+1) - 1]*/
src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 8)); + src_temp2 = _mm_srli_si128(src_temp1, 1); + src_temp3 = _mm_srli_si128(src_temp1, 2); + src_temp4 = _mm_srli_si128(src_temp1, 3); + + src_temp4 = _mm_shuffle_epi8(src_temp4, sm1); + src_temp3 = _mm_shuffle_epi8(src_temp3, sm1); + src_temp2 = _mm_shuffle_epi8(src_temp2, sm1); + src_temp1 = _mm_shuffle_epi8(src_temp1, sm1); + + temp1 = _mm_cvtsi128_si32(src_temp4); + temp2 = _mm_cvtsi128_si32(src_temp3); + temp3 = _mm_cvtsi128_si32(src_temp2); + temp4 = _mm_cvtsi128_si32(src_temp1); + + /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/ + *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1; + *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2; + *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3; + *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4; + + + } + else if(nt == 8) + { + /*pu1_ref[two_nt - row - (col+1) - 1]*/ + src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); + src_temp2 = _mm_srli_si128(src_temp1, 1); + src_temp3 = _mm_srli_si128(src_temp1, 2); + src_temp4 = _mm_srli_si128(src_temp1, 3); + src_temp5 = _mm_srli_si128(src_temp1, 4); + src_temp6 = _mm_srli_si128(src_temp1, 5); + src_temp7 = _mm_srli_si128(src_temp1, 6); + src_temp8 = _mm_srli_si128(src_temp1, 7); + + src_temp1 = _mm_shuffle_epi8(src_temp1, sm2); + src_temp2 = _mm_shuffle_epi8(src_temp2, sm2); + src_temp3 = _mm_shuffle_epi8(src_temp3, sm2); + src_temp4 = _mm_shuffle_epi8(src_temp4, sm2); + src_temp5 = _mm_shuffle_epi8(src_temp5, sm2); + src_temp6 = _mm_shuffle_epi8(src_temp6, sm2); + src_temp7 = _mm_shuffle_epi8(src_temp7, sm2); + src_temp8 = _mm_shuffle_epi8(src_temp8, sm2); + + _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp8); + _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp7); + _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp6); + _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp5); + _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), 
src_temp4); + _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp3); + _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp2); + _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1); + + } + else + { + for(row = 0; row < nt; row += 8) + { + for(col = 0; col < nt; col += 16) + { /*pu1_ref[two_nt - row - (col+1) - 1]*/ + + src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0) - (col + 16) - 1)); + src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1) - (col + 16) - 1)); + src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2) - (col + 16) - 1)); + src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3) - (col + 16) - 1)); + src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4) - (col + 16) - 1)); + src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5) - (col + 16) - 1)); + src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6) - (col + 16) - 1)); + src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7) - (col + 16) - 1)); + + src_temp1 = _mm_shuffle_epi8(src_temp1, sm3); + src_temp2 = _mm_shuffle_epi8(src_temp2, sm3); + src_temp3 = _mm_shuffle_epi8(src_temp3, sm3); + src_temp4 = _mm_shuffle_epi8(src_temp4, sm3); + src_temp5 = _mm_shuffle_epi8(src_temp5, sm3); + src_temp6 = _mm_shuffle_epi8(src_temp6, sm3); + src_temp7 = _mm_shuffle_epi8(src_temp7, sm3); + src_temp8 = _mm_shuffle_epi8(src_temp8, sm3); + + _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 0) * dst_strd)), src_temp1); + _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 1) * dst_strd)), src_temp2); + _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 2) * dst_strd)), src_temp3); + _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 3) * dst_strd)), src_temp4); + _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 4) * dst_strd)), src_temp5); + _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 5) * dst_strd)), src_temp6); + 
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 6) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 7) * dst_strd)), src_temp8);
            }
        }
    }

}

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma mode 18 & mode 34.
*
* @par Description:
*    Intraprediction for mode 34 (ne angle) and mode 18 (nw angle) with
*    reference neighboring samples location pointed by 'pu1_ref' to the TU
*    block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (18 or 34)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_mode_18_34_ssse3(UWORD8 *pu1_ref,
                                            WORD32 src_strd,
                                            UWORD8 *pu1_dst,
                                            WORD32 dst_strd,
                                            WORD32 nt,
                                            WORD32 mode)
{
    WORD32 row;
    WORD32 two_nt = 2 * nt;
    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
    UNUSED(src_strd);
    if(mode == 34)
    {
        /* Mode 34 (+45 deg): row r is a pure copy of pu1_ref[two_nt + r + 2 ...]. */
        if(nt == 4)
        {

            int temp1, temp2, temp3, temp4;

            /* Source pixels: pu1_ref[two_nt + col + idx + 1] */
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));

            temp1 = _mm_cvtsi128_si32(src_temp1);
            temp2 = _mm_cvtsi128_si32(src_temp2);
            temp3 = _mm_cvtsi128_si32(src_temp3);
            temp4 = _mm_cvtsi128_si32(src_temp4);

            /* Store 4 pixels per row as one 32-bit write. */
            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;

        }
        else if(nt == 8)
        {
            /* Source pixels: pu1_ref[two_nt + col + idx + 1] */
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));
            src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 6));
            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 7));
            src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 8));
            src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 9));

            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);

        }
        else if(nt == 16)
        {
            for(row = 0; row < nt; row += 8)
            {
                /* Source pixels: pu1_ref[two_nt + row + 2 ...] */
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 0) + 2));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 1) + 2));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 2) + 2));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 3) + 2));
                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 4) + 2));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 5) + 2));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 6) + 2));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 7) + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp8);


            }
        }
        else
        {
            /* nt == 32: 8 rows x 32 columns per iteration; the row offset is
               folded into pu1_ref (advanced by 8 each pass). */
            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
            for(row = 0; row < nt; row += 8)
            {
                /* Source pixels: pu1_ref[two_nt + row + col + 2] */
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 0) + 2));
                src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 16) + 2));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 0) + 2));
                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 16) + 2));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 0) + 2));
                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 16) + 2));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 0) + 2));
                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 16) + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);

                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 0) + 2));
                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 16) + 2));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 0) + 2));
                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 16) + 2));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 0) + 2));
                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 16) + 2));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 0) + 2));
                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 16) + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);

                /* Advance reference diagonally and destination by 8 rows. */
                pu1_ref += 8;
                pu1_dst += 8 * dst_strd;
            }
        }
    }
    else
    {
        /* Mode 18 (-45 deg): row r copies pu1_ref[two_nt - r ...]. */
        if(nt == 4)
        {
            int temp1, temp2, temp3, temp4;

            /* Source pixels: pu1_ref[two_nt - row + col] */
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3));
            src_temp2 = _mm_srli_si128(src_temp1, 1);
            src_temp3 = _mm_srli_si128(src_temp1, 2);
            src_temp4 = _mm_srli_si128(src_temp1, 3);

            /* Rows come out in reverse order (src_temp4 is row 0). */
            temp1 = _mm_cvtsi128_si32(src_temp4);
            temp2 = _mm_cvtsi128_si32(src_temp3);
            temp3 = _mm_cvtsi128_si32(src_temp2);
            temp4 = _mm_cvtsi128_si32(src_temp1);

            /* Store 4 pixels per row as one 32-bit write. */
            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;

        }
        else if(nt == 8)
        {
            /* Source pixels: pu1_ref[two_nt - row + col] */
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7));
            src_temp2 = _mm_srli_si128(src_temp1, 1);
            src_temp3 = _mm_srli_si128(src_temp1, 2);
            src_temp4 = _mm_srli_si128(src_temp1, 3);
            src_temp5 = _mm_srli_si128(src_temp1, 4);
            src_temp6 = _mm_srli_si128(src_temp1, 5);
            src_temp7 = _mm_srli_si128(src_temp1, 6);
            src_temp8 = _mm_srli_si128(src_temp1, 7);

            /* Rows come out in reverse order (src_temp8 is row 0). */
            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp8);
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp7);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);


        }
        else if(nt == 16)
        {
            for(row = 0; row < nt; row += 8)
            {
                /* Source pixels: pu1_ref[two_nt - row + col] */
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0)));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1)));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2)));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3)));
                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4)));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5)));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6)));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7)));

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp8);

            }

        }
        else
        {
            /* nt == 32: 8 rows x 32 columns per iteration; the row offset is
               folded into pu1_ref (retreated by 8 each pass). */
            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
            for(row = 0; row < nt; row += 8)
            {
                /* Source pixels: pu1_ref[two_nt - row + col] */
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 0));
                src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 16));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 0));
                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 16));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 0));
                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 16));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 0));
                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 16));

                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);

                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 0));
                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 16));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 0));
                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 16));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 0));
                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 16));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 0));
                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 16));

                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);

                /* Retreat reference diagonally and advance destination by 8 rows. */
                pu1_ref -= 8;
                pu1_dst += 8 * dst_strd;
            }
        }
    }
}


/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma mode 3 to mode 9
*
* @par Description:
*    Intraprediction for mode 3 to 9  (positive angle, horizontal mode ) with
*    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
*    block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
                                             WORD32 src_strd,
                                             UWORD8 *pu1_dst,
                                             WORD32 dst_strd,
                                             WORD32 nt,
+ WORD32 mode) +{ + WORD32 row, col; + WORD32 two_nt = 2 * nt; + WORD32 intra_pred_ang; + + + __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b, zero_8x16b; + __m128i fract_4x32b, intra_pred_ang_4x32b; + __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3; + UNUSED(src_strd); + + /* Intra Pred Angle according to the mode */ + intra_pred_ang = gai4_ihevc_ang_table[mode]; + + /* For the angles other then 45 degree, interpolation btw 2 neighboring */ + /* samples dependent on distance to obtain destination sample */ + + /* For the angles other then 45 degree, interpolation btw 2 neighboring */ + /* samples dependent on distance to obtain destination sample */ + + const_temp_4x32b = _mm_set1_epi16(16); + const_temp2_4x32b = _mm_set1_epi32(31); + const_temp3_4x32b = _mm_set1_epi32(32); + const_temp4_4x32b = _mm_set1_epi32(4); + + two_nt_4x32b = _mm_set1_epi32(two_nt - nt); + + + sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); + + /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ + intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang); + + row_4x32b = _mm_set_epi32(4, 3, 2, 1); + + if(nt == 4) + { + + WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; + int temp11, temp21, temp31, temp41; + // WORD8 ai1_fract_temp_val[16], ai1_row_temp_val[16]; + + __m128i fract1_8x16b, fract2_8x16b, sign_8x16b; + __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; + + __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; + __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b; //, src_temp8_8x16b; + __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2; + + row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1); + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + zero_8x16b = _mm_setzero_si128(); + sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, 
res_temp5_4x32b); + res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b); + + /* idx = pos >> 5; */ + fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /* fract = pos & (31); */ + ref_main_idx_4x32b = _mm_sub_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); + + /*(32 - fract) */ + row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b); + + fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); + fract2_8x16b = _mm_slli_epi16(row_4x32b, 8); + + fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); + row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */ + + fract2_8x16b = _mm_unpackhi_epi8(row_4x32b, fract_4x32b); + fract1_8x16b = _mm_unpacklo_epi8(row_4x32b, fract_4x32b); + + temp1_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0x00); + temp2_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0xaa); + temp3_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0x00); + temp4_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0xaa); + + ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4); /* next 32 bit values */ + ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8); /* next 32 bit values */ + ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */ + ref_main_idx1 = _mm_cvtsi128_si32(ref_main_idx_4x32b); /* col=0*/ + ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* col=1*/ + ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* col=2*/ + ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* col=3*/ + + /* loding 8-bit 16 pixels */ + src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 - 1)); /* col=0*/ + src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2 - 1)); /* col=1*/ + src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3 - 1)); /* col=2*/ + src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4 - 1)); /* col=3*/ + + src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/ + src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* 
col=1*/ + src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/ + src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/ + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); + src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); + src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); + src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); + src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); + src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); + src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ + src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ + src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ + src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ + + /* converting 16 bit to 8 bit */ + src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ + src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ + + + src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); + src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); + + src_temp3_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); + src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 4); + src_temp1_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); + src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 12); + + temp11 = _mm_cvtsi128_si32(src_temp7_8x16b); + temp21 = _mm_cvtsi128_si32(src_temp1_8x16b); + temp31 = _mm_cvtsi128_si32(src_temp2_8x16b); + temp41 = 
_mm_cvtsi128_si32(src_temp3_8x16b); + + /* loding 4-bit 8 pixels values */ + *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; + *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21; + *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31; + *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; + + } + + else if(nt == 16 || nt == 32) + { + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + const_temp2_4x32b = _mm_set1_epi16(31); + const_temp4_4x32b = _mm_set1_epi16(8); + const_temp3_4x32b = _mm_set1_epi16(32); + two_nt_4x32b = _mm_set1_epi16(two_nt); + + for(col = 0; col < nt; col += 8) + { + WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; + WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; + //WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16]; + + __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; + + __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; + __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + /* idx = pos >> 5; */ + fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /*(32 - fract) */ + fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); + + fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); + fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ + + fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); + fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ + + + fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b); + fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b); + + temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); + temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); + temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); + temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); + + temp11_8x16b = 
_mm_shuffle_epi32(fract8_8x16b, 0x00); + temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); + temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); + temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); + + /* fract = pos & (31); */ + ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); + + row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b); + + pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ + pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ + pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ + pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ + + pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ + pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ + pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ + pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ + + for(row = 0; row < nt; row += 8) + { + __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; + __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; + + + __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; + __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; + + /* loding 8-bit 16 pixels */ + src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1 - (8 + row))); /* col=0*/ + src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1 - (8 + row))); /* col=1*/ + src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1 - (8 + row))); /* col=2*/ + src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1 - (8 + row))); /* col=3*/ + + /* loding 8-bit 16 pixels */ + src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1 - (8 + row))); /* col=5*/ + src_temp12_8x16b = _mm_loadu_si128((__m128i 
*)(pu1_ref + pi2_ref_main_idx6 - 1 - (8 + row))); /* col=6*/ + src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1 - (8 + row))); /* col=7*/ + src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1 - (8 + row))); /* col=8*/ + + src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/ + src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/ + src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/ + src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/ + + src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=0*/ + src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=1*/ + src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=2*/ + src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=3*/ + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); + src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); + src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); + src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); + src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); + src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); + src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); + src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); + src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); + src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 
16) >>5*/ + src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ + src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ + src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ + src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); + src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); + src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); + src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ + src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ + src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ + src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ + + /* converting 16 bit to 8 bit */ + src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ + src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ + + /* converting 16 bit to 8 bit */ + src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ + src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ + + src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); + src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); + + src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); + src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); + + src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); + src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); + + src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); + src_temp18_8x16b = 
_mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); + + src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); + src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); + + src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); + src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); + + src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); + src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); + + src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); + src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp1_8x16b); /* row=7*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp5_8x16b); /* row=6*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp2_8x16b); /* row=5*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp6_8x16b); /* row=4*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp3_8x16b); /* row=3*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp7_8x16b); /* row=2*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp4_8x16b); /* row=1*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 0))), src_temp8_8x16b); /* row=0*/ + + } + } + } + else + { + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + const_temp2_4x32b = _mm_set1_epi16(31); + const_temp4_4x32b = _mm_set1_epi16(8); + const_temp3_4x32b = _mm_set1_epi16(32); + two_nt_4x32b = _mm_set1_epi16(two_nt - nt); + { + WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; + WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; + + __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; + + __m128i temp1_8x16b, temp2_8x16b, 
temp3_8x16b, temp4_8x16b; + __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + /* idx = pos >> 5; */ + fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /* fract = pos & (31); */ + ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); + + /*(32 - fract) */ + fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); + + fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); + fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ + + fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); + fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ + + + fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b); + fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b); + + temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); + temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); + temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); + temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); + + temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); + temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); + temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); + temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); + + pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ + pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ + pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ + pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ + + pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ + pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ + pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ + pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ + + { + __m128i src_temp1_8x16b, src_temp2_8x16b, 
src_temp3_8x16b, src_temp4_8x16b; + __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; + + __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; + __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; + + /* loding 8-bit 16 pixels */ + src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1)); /* col=0*/ + src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1)); /* col=1*/ + src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1)); /* col=2*/ + src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1)); /* col=3*/ + + /* loding 8-bit 16 pixels */ + src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1)); /* col=5*/ + src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1)); /* col=6*/ + src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1)); /* col=7*/ + src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1)); /* col=8*/ + + src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/ + src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/ + src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/ + src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/ + + src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=0*/ + src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=1*/ + src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=2*/ + src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=3*/ + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); + src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); + src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); + src_temp4_8x16b = 
_mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); + src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); + src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); + src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); + src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); + src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); + src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ + src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ + src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ + src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); + src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); + src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); + src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ + src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ + src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ + src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ + + /* converting 16 bit to 8 bit */ + src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ + 
src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ + + /* converting 16 bit to 8 bit */ + src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ + src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ + + src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); + src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); + + src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); + src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); + + src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); + src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); + + src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); + src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); + + src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); + src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); + + src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); + src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); + + src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); + src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); + + src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); + src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); + + _mm_storel_epi64((__m128i *)(pu1_dst), src_temp8_8x16b); /* row=0*/ + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 1)), src_temp4_8x16b); /* row=1*/ + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 2)), src_temp7_8x16b); /* row=2*/ + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 3)), src_temp3_8x16b); /* row=3*/ + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 4)), src_temp6_8x16b); /* row=4*/ + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 5)), src_temp2_8x16b); /* row=5*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 6)), 
src_temp5_8x16b); /* row=6*/ + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 7)), src_temp1_8x16b); /* row=7*/ + + } + } + } + +} + + + +/** +******************************************************************************* +* +* @brief +* Intra prediction interpolation filter for luma mode 11 to mode 17 +* +* @par Description: +* Intraprediction for mode 11 to 17 (negative angle, horizontal mode ) +* with reference neighboring samples location pointed by 'pu1_ref' to the +* TU block location pointed by 'pu1_dst' +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] nt +* integer Transform Block size +* +* @param[in] mode +* integer intraprediction mode +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ + + +void ihevc_intra_pred_luma_mode_11_to_17_ssse3(UWORD8 *pu1_ref, + WORD32 src_strd, + UWORD8 *pu1_dst, + WORD32 dst_strd, + WORD32 nt, + WORD32 mode) +{ + + /* This function and ihevc_intra_pred_luma_mode_19_to_25 are same except*/ + /* for ref main & side samples assignment,can be combined for */ + /* optimzation*/ + + WORD32 row, col, k; + WORD32 two_nt; + WORD32 intra_pred_ang, inv_ang, inv_ang_sum; + WORD32 ref_idx; + + __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b; + __m128i fract_4x32b, intra_pred_ang_4x32b; + __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3; + + + UWORD8 ref_tmp[2 * MAX_CU_SIZE + 2]; + UWORD8 *ref_main; + UWORD8 *ref_temp; + UNUSED(src_strd); + inv_ang_sum = 128; + two_nt = 2 * nt; + ref_temp = ref_tmp + 1; + ref_main = ref_temp + nt - 1; + intra_pred_ang = gai4_ihevc_ang_table[mode]; + + /* For the angles other then 45 degree, interpolation btw 2 neighboring */ + /* samples dependent on distance to obtain 
destination sample */ + const_temp_4x32b = _mm_set1_epi16(16); + const_temp2_4x32b = _mm_set1_epi32(31); + const_temp3_4x32b = _mm_set1_epi32(32); + const_temp4_4x32b = _mm_set1_epi32(4); + + two_nt_4x32b = _mm_set1_epi32(1); + + + sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); + + /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ + intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang); + + row_4x32b = _mm_set_epi32(4, 3, 2, 1); + + if(nt == 4) + { + + WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; + int temp11, temp21, temp31, temp41; +// WORD8 ai1_fract_temp_val[16], ai1_row_temp_val[16]; + + __m128i fract1_8x16b, fract2_8x16b; + __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; + + __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; + __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; + __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, zero_8x16b, sign_8x16b; + + /* Intermediate reference samples for negative angle modes */ + /* This have to be removed during optimization*/ + /* For horizontal modes, (ref main = ref left) (ref side = ref above) */ + inv_ang = gai4_ihevc_inv_ang_table[mode - 11]; + + ref_main = ref_temp + nt - 1; + for(k = 0; k < nt + 1; k++) + ref_temp[k + nt - 1] = pu1_ref[two_nt - k]; + + ref_main = ref_temp + nt - 1; + ref_idx = (nt * intra_pred_ang) >> 5; + zero_8x16b = _mm_setzero_si128(); + + row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1); + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + /* SIMD Optimization can be done using look-up table for the loop */ + /* For negative angled derive the main reference samples from side */ + /* reference samples refer to section 8.4.4.2.6 */ + for(k = -1; k > ref_idx; k--) + { + inv_ang_sum += inv_ang; + ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)]; + } + + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + sign_8x16b = 
_mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b); + res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b); + + /* idx = pos >> 5; */ + fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /* fract = pos & (31); */ + ref_main_idx_4x32b = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); + + /*(32 - fract) */ + row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b); + + fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); + fract2_8x16b = _mm_slli_epi16(row_4x32b, 8); + + fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); + row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */ + + fract2_8x16b = _mm_unpackhi_epi8(fract_4x32b, row_4x32b); + fract1_8x16b = _mm_unpacklo_epi8(fract_4x32b, row_4x32b); + + temp1_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0x00); + temp2_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0xaa); + temp3_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0x00); + temp4_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0xaa); + + ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4); /* next 32 bit values */ + ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8); /* next 32 bit values */ + ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */ + ref_main_idx1 = _mm_cvtsi128_si32(ref_main_idx_4x32b); /* col=0*/ + ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* col=1*/ + ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* col=2*/ + ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* col=3*/ + + /* loding 8-bit 16 pixels */ + src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col=0*/ + src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col=1*/ + src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col=2*/ + src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col=3*/ + + src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ + src_temp2_8x16b = 
_mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ + src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ + src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); + src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); + src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); + src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); + src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); + src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); + src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ + src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ + src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ + src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ + + /* converting 16 bit to 8 bit */ + src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ + src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ + + + src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); + src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); + + src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); + src_temp1_8x16b = _mm_srli_si128(src_temp7_8x16b, 4); + src_temp2_8x16b = _mm_srli_si128(src_temp7_8x16b, 8); + src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 12); + + temp11 = _mm_cvtsi128_si32(src_temp7_8x16b); + temp21 = _mm_cvtsi128_si32(src_temp1_8x16b); + temp31 = 
_mm_cvtsi128_si32(src_temp2_8x16b); + temp41 = _mm_cvtsi128_si32(src_temp3_8x16b); + + /* loding 8-bit 4 pixels values */ + *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; + *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21; + *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31; + *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; + } + + else if(nt == 32) + { + + + __m128i temp1, temp2, temp3, temp11, temp12; + __m128i src_values0, src_values1; + /* Intermediate reference samples for negative angle modes */ + + ref_temp[two_nt - 1] = pu1_ref[two_nt - nt]; + temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1)); + temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 17)); + temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3); + + /* For negative angled derive the main reference samples from side */ + + src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/ + src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); /*(nt+16)-(two_nt-1)*/ + + temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode])); + temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16)); + + src_values0 = _mm_shuffle_epi8(src_values0, temp2); + src_values1 = _mm_shuffle_epi8(src_values1, temp2); + src_values0 = _mm_shuffle_epi8(src_values0, temp12); + src_values1 = _mm_shuffle_epi8(src_values1, temp11); + + temp1 = _mm_shuffle_epi8(temp1, temp2); + temp3 = _mm_shuffle_epi8(temp3, temp2); + + _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp3); + _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp1); + _mm_storeu_si128((__m128i *)(ref_main - 16), src_values0); + _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[17 - mode][0]), src_values1); + + + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + const_temp2_4x32b = _mm_set1_epi16(31); + const_temp4_4x32b = _mm_set1_epi16(8); + const_temp3_4x32b = _mm_set1_epi16(32); + two_nt_4x32b = _mm_set1_epi16(1); + + 
for(col = 0; col < nt; col += 8) + { + WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; + WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; + // WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16]; + + __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; + + __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; + __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + /* idx = pos >> 5; */ + fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /* fract = pos & (31); */ + ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); + + row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b); + /*(32 - fract) */ + fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); + + fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); + fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ + + fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); + fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ + + + fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b); + fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b); + + temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); + temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); + temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); + temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); + + temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); + temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); + temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); + temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); + + pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ + pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ + pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ + 
pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ + + pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ + pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ + pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ + pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ + + for(row = 0; row < nt; row += 8) + { + __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; + __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; + + + __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; + __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; + + /* loding 8-bit 16 pixels */ + src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/ + src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/ + src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/ + src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/ + + src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/ + src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/ + src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/ + src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/ + + /* loding 8-bit 16 pixels */ + src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/ + src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/ + src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/ + src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/ + + src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ + src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ 
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ + src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ + + src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/ + src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/ + src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/ + src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/ + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); + src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); + src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); + src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); + src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); + src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); + src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); + src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); + src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); + src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ + src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ + src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ + src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, 
const_temp_4x32b); + src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); + src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); + src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ + src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ + src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ + src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ + + /* converting 16 bit to 8 bit */ + src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ + src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ + + /* converting 16 bit to 8 bit */ + src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ + src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ + + src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); + src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); + + src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); + src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); + + src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); + src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); + + src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); + src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); + + + src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); + src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); + + src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); + src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); + + src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); + 
src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); + src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); + src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b); /* row=0*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b); /* row=1*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b); /* row=2*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b); /* row=4*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b); /* row=5*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b); /* row=6*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b); /* row=7*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b); /* row=8*/ + + } + } + } + else if(nt == 16) + { + + __m128i temp1, temp2, temp11, src_values0; + /* Intermediate reference samples for negative angle modes */ + /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ + ref_temp[two_nt - 1] = pu1_ref[two_nt - nt]; + temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1)); + temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3); + src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/ + + temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16)); + + src_values0 = _mm_shuffle_epi8(src_values0, temp2); + temp1 = _mm_shuffle_epi8(temp1, temp2); + src_values0 = _mm_shuffle_epi8(src_values0, temp11); + + _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0); + _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1); + + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + const_temp2_4x32b = _mm_set1_epi16(31); + const_temp4_4x32b = 
_mm_set1_epi16(8); + const_temp3_4x32b = _mm_set1_epi16(32); + two_nt_4x32b = _mm_set1_epi16(1); + + for(col = 0; col < nt; col += 8) + { + WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; + WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; + // WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16]; + + __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; + + __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; + __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + /* idx = pos >> 5; */ + fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /* fract = pos & (31); */ + ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); + + row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b); + /*(32 - fract) */ + fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); + + fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); + fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ + + fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); + fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ + + + fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b); + fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b); + + temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); + temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); + temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); + temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); + + temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); + temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); + temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); + temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); + + pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ + pi2_ref_main_idx2 = 
_mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ + pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ + pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ + + pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ + pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ + pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ + pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ + + for(row = 0; row < nt; row += 8) + { + __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; + __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; + + + __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; + __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; + + /* loding 8-bit 16 pixels */ + src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/ + src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/ + src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/ + src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/ + + src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/ + src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/ + src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/ + src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/ + + /* loding 8-bit 16 pixels */ + src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/ + src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/ + src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/ + src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/ + + 
src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ + src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ + src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ + src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ + + src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/ + src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/ + src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/ + src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/ + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); + src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); + src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); + src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); + src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); + src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); + src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); + src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); + src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); + src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ + src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ + src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ + src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ + 
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); + src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); + src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); + src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ + src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ + src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ + src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ + + /* converting 16 bit to 8 bit */ + src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ + src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ + + /* converting 16 bit to 8 bit */ + src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ + src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ + + src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); + src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); + + src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); + src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); + + src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); + src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); + + src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); + src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); + + + src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); + src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); + + src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); + 
src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); + + src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); + src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); + src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); + src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b); /* row=0*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b); /* row=1*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b); /* row=2*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b); /* row=4*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b); /* row=5*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b); /* row=6*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b); /* row=7*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b); /* row=8*/ + + } + } + } + else + { + + + __m128i temp1, temp2, temp11, src_values0; + /* Intermediate reference samples for negative angle modes */ + /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ + ref_temp[two_nt - 1] = pu1_ref[nt]; + temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 1)); + + /* For negative angled derive the main reference samples from side */ + + src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/ + temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3); + temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16)); + + src_values0 = _mm_shuffle_epi8(src_values0, temp2); + temp1 = _mm_shuffle_epi8(temp1, temp2); + src_values0 = _mm_shuffle_epi8(src_values0, temp11); + src_values0 = _mm_srli_si128(src_values0, 8); + + _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1); + 
_mm_storel_epi64((__m128i *)(ref_main - nt), src_values0); + + + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + const_temp2_4x32b = _mm_set1_epi16(31); + const_temp4_4x32b = _mm_set1_epi16(8); + const_temp3_4x32b = _mm_set1_epi16(32); + two_nt_4x32b = _mm_set1_epi16(1); + + { + WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; + WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; + //WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16]; + + __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; + + __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; + __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + /* idx = pos >> 5; */ + fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /* fract = pos & (31); */ + ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); + + /*(32 - fract) */ + fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); + + fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); + fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ + + fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); + fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ + + fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b); + fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b); + + temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); + temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); + temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); + temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); + + temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); + temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); + temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); + temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 
0xff); + + pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ + pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ + pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ + pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ + + pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ + pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ + pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ + pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ + + { + __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; + __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; + + __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; + __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; + + /* loding 8-bit 16 pixels */ + src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/ + src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/ + src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/ + src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/ + + /* loding 8-bit 16 pixels */ + src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5)); /* col=5*/ + src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6)); /* col=6*/ + src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7)); /* col=7*/ + src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8)); /* col=8*/ + + src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ + src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ + src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ + src_temp4_8x16b = 
_mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ + + src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/ + src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/ + src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/ + src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/ + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); + src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); + src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); + src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); + + /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ + src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); + src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); + src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); + src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); + src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); + src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); + src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* row=0*/ + src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* row=1*/ + src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* row=2*/ + src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* row=3*/ + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); + src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); + 
src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); + src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ + src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ + src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ + src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ + + /* converting 16 bit to 8 bit */ + src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ + src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ + + /* converting 16 bit to 8 bit */ + src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/ + src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/ + + src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); + src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); + + src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); + src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); + + src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); + src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); + + src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); + src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); + + + src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); + src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); + + src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); + src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); + + src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); + src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); + src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 
8); + src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); + + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp1_8x16b); /* row=0*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp5_8x16b); /* row=1*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp2_8x16b); /* row=2*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp6_8x16b); /* row=3*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (4))), src_temp3_8x16b); /* row=4*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (5))), src_temp7_8x16b); /* row=5*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (6))), src_temp4_8x16b); /* row=6*/ + + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (7))), src_temp8_8x16b); /* row=7*/ + + } + } + } + +} + + +/** +******************************************************************************* +* +* @brief +* Intra prediction interpolation filter for luma mode 19 to mode 25 +* +* @par Description: +* Intraprediction for mode 19 to 25 (negative angle, vertical mode ) with +* reference neighboring samples location pointed by 'pu1_ref' to the TU +* block location pointed by 'pu1_dst' +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] nt +* integer Transform Block size +* +* @param[in] mode +* integer intraprediction mode +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ + +void ihevc_intra_pred_luma_mode_19_to_25_ssse3(UWORD8 *pu1_ref, + WORD32 src_strd, + UWORD8 *pu1_dst, + WORD32 dst_strd, + WORD32 nt, + WORD32 mode) +{ + + WORD32 row, k; + WORD32 two_nt, intra_pred_ang; + WORD32 inv_ang, inv_ang_sum; + //WORD32 ref_main_idx, pos, fract, idx; + WORD32 ref_idx; + UWORD8 ref_tmp[(2 * MAX_CU_SIZE) + 2]; + UWORD8 
*ref_main, *ref_temp; + + __m128i /*fract_8x16b,*/ const_temp_8x16b, sm3; + __m128i temp1, temp2, temp3, temp4; + __m128i temp11, temp12, temp13, temp14; + UNUSED(src_strd); + two_nt = 2 * nt; + intra_pred_ang = gai4_ihevc_ang_table[mode]; + inv_ang = gai4_ihevc_inv_ang_table[mode - 12]; + + /* Intermediate reference samples for negative angle modes */ + /* This have to be removed during optimization*/ + /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ + ref_temp = ref_tmp + 1; + ref_main = ref_temp + nt - 1; + + + sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); + + + + const_temp_8x16b = _mm_set1_epi16(16); + + if(nt == 32) + { + + __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; + __m128i src_values10, src_values11, intra_pred_ang_4x32b; + __m128i row_4x32b, two_nt_4x32b, src_values12; + + __m128i src_values0, src_values1, src_values2, src_values3; + __m128i src_values4, src_values5, src_values6, src_values7; + WORD32 col = 0; + + /* Intermediate reference samples for negative angle modes */ + /* This have to be removed during optimization*/ + /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ + ref_temp[two_nt - 1] = pu1_ref[two_nt + nt]; + temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt)); + temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 16)); + + /* SIMD Optimization can be done using look-up table for the loop */ + /* For negative angled derive the main reference samples from side */ + /* reference samples refer to section 8.4.4.2.6 */ + src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/ + src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); /*(nt+16)-(two_nt-1)*/ + + temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19])); + temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16)); + + src_values0 = _mm_shuffle_epi8(src_values0, temp11); + src_values1 = _mm_shuffle_epi8(src_values1, temp12); + + 
_mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1); + _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp3); + _mm_storeu_si128((__m128i *)(ref_main - 16), src_values1); + _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[mode - 19][0]), src_values0); + + const_temp2_4x32b = _mm_set1_epi16(31); + const_temp3_4x32b = _mm_set1_epi16(32); + const_temp8_4x32b = _mm_set1_epi16(8); + + two_nt_4x32b = _mm_set1_epi16(1); + + /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + + row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + + for(row = 0; row < nt; row += 8) + { + + WORD16 ref_main_idx[9]; + + __m128i res_temp5_4x32b; + __m128i fract1_8x16b, fract2_8x16b; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + /* fract = pos & (31); */ + src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); + + /* idx = pos >> 5; */ + src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /*(32 - fract) */ + src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); + + fract1_8x16b = _mm_slli_epi16(src_values11, 8); + fract2_8x16b = _mm_slli_epi16(src_values10, 8); + + src_values11 = _mm_or_si128(src_values11, fract1_8x16b); + src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ + + fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); + fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); + + temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); + temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); + temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); + temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); + + temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); + temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); + temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); + temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); + + row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); + 
_mm_storeu_si128((__m128i *)ref_main_idx, src_values12); + for(col = 0; col < nt; col += 16) + { + src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + col)); + src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + col)); + src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + col)); + src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + col)); + src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8 + col)); + src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8 + col)); + src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8 + col)); + src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8 + col)); + + src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 = _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + src_values4 = _mm_shuffle_epi8(src_values4, sm3); + src_values5 = _mm_shuffle_epi8(src_values5, sm3); + src_values6 = _mm_shuffle_epi8(src_values6, sm3); + src_values7 = _mm_shuffle_epi8(src_values7, sm3); + + + src_values0 = _mm_maddubs_epi16(src_values0, temp1); + src_values1 = _mm_maddubs_epi16(src_values1, temp2); + src_values2 = _mm_maddubs_epi16(src_values2, temp3); + src_values3 = _mm_maddubs_epi16(src_values3, temp4); + src_values4 = _mm_maddubs_epi16(src_values4, temp1); + src_values5 = _mm_maddubs_epi16(src_values5, temp2); + src_values6 = _mm_maddubs_epi16(src_values6, temp3); + src_values7 = _mm_maddubs_epi16(src_values7, temp4); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + src_values4 = _mm_add_epi16(src_values4, 
const_temp_8x16b); + src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); + src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); + src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = _mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + src_values4 = _mm_srai_epi16(src_values4, 5); + src_values5 = _mm_srai_epi16(src_values5, 5); + src_values6 = _mm_srai_epi16(src_values6, 5); + src_values7 = _mm_srai_epi16(src_values7, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = _mm_packus_epi16(src_values0, src_values4); + src_values1 = _mm_packus_epi16(src_values1, src_values5); + src_values2 = _mm_packus_epi16(src_values2, src_values6); + src_values3 = _mm_packus_epi16(src_values3, src_values7); + + /* loading 8-bit 8 pixels values */ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0); /* row=0*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1); /* row=1*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2); /* row=2*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3); /* row=3*/ + + + src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + col)); + src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + col)); + src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + col)); + src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + col)); + src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8 + col)); + src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8 + col)); + src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8 + col)); + src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] 
+ 8 + col)); + + src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 = _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + src_values4 = _mm_shuffle_epi8(src_values4, sm3); + src_values5 = _mm_shuffle_epi8(src_values5, sm3); + src_values6 = _mm_shuffle_epi8(src_values6, sm3); + src_values7 = _mm_shuffle_epi8(src_values7, sm3); + + + src_values0 = _mm_maddubs_epi16(src_values0, temp11); + src_values1 = _mm_maddubs_epi16(src_values1, temp12); + src_values2 = _mm_maddubs_epi16(src_values2, temp13); + src_values3 = _mm_maddubs_epi16(src_values3, temp14); + src_values4 = _mm_maddubs_epi16(src_values4, temp11); + src_values5 = _mm_maddubs_epi16(src_values5, temp12); + src_values6 = _mm_maddubs_epi16(src_values6, temp13); + src_values7 = _mm_maddubs_epi16(src_values7, temp14); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); + src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); + src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); + src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = _mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + src_values4 = _mm_srai_epi16(src_values4, 5); + src_values5 = _mm_srai_epi16(src_values5, 5); + src_values6 = _mm_srai_epi16(src_values6, 5); + src_values7 = _mm_srai_epi16(src_values7, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = 
_mm_packus_epi16(src_values0, src_values4); + src_values1 = _mm_packus_epi16(src_values1, src_values5); + src_values2 = _mm_packus_epi16(src_values2, src_values6); + src_values3 = _mm_packus_epi16(src_values3, src_values7); + + /* loading 8-bit 8 pixels values */ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0); /* row=4*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1); /* row=5*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2); /* row=6*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3); /* row=7*/ + + } + pu1_dst += 8 * dst_strd; + } + + } + else if(nt == 16) /* for nt = 16 case */ + { + + __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; + __m128i src_values10, src_values11, intra_pred_ang_4x32b; + __m128i row_4x32b, two_nt_4x32b, src_values12; + __m128i src_values0, src_values1, src_values2, src_values3; + __m128i src_values4, src_values5, src_values6, src_values7; + + + /* Intermediate reference samples for negative angle modes */ + /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ + ref_temp[two_nt - 1] = pu1_ref[two_nt + nt]; + temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt)); + + src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/ + + temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16)); + + src_values0 = _mm_shuffle_epi8(src_values0, temp11); + + _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0); + _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1); + + const_temp2_4x32b = _mm_set1_epi16(31); + const_temp3_4x32b = _mm_set1_epi16(32); + const_temp8_4x32b = _mm_set1_epi16(8); + + two_nt_4x32b = _mm_set1_epi16(1); + + /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + + row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + + for(row = 0; row < nt; row += 8) + { + + WORD16 
ref_main_idx[9]; + + __m128i res_temp5_4x32b; + __m128i fract1_8x16b, fract2_8x16b; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + /* fract = pos & (31); */ + src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); + + /* idx = pos >> 5; */ + src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /*(32 - fract) */ + src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); + + fract1_8x16b = _mm_slli_epi16(src_values11, 8); + fract2_8x16b = _mm_slli_epi16(src_values10, 8); + + src_values11 = _mm_or_si128(src_values11, fract1_8x16b); + src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ + + fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); + fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); + + temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); + temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); + temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); + temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); + + temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); + temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); + temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); + temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); + + row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); + _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); + + { + src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0])); + src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1])); + src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2])); + src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3])); + src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8)); + src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8)); + src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8)); + src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8)); + 
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 = _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + src_values4 = _mm_shuffle_epi8(src_values4, sm3); + src_values5 = _mm_shuffle_epi8(src_values5, sm3); + src_values6 = _mm_shuffle_epi8(src_values6, sm3); + src_values7 = _mm_shuffle_epi8(src_values7, sm3); + + + src_values0 = _mm_maddubs_epi16(src_values0, temp1); + src_values1 = _mm_maddubs_epi16(src_values1, temp2); + src_values2 = _mm_maddubs_epi16(src_values2, temp3); + src_values3 = _mm_maddubs_epi16(src_values3, temp4); + src_values4 = _mm_maddubs_epi16(src_values4, temp1); + src_values5 = _mm_maddubs_epi16(src_values5, temp2); + src_values6 = _mm_maddubs_epi16(src_values6, temp3); + src_values7 = _mm_maddubs_epi16(src_values7, temp4); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); + src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); + src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); + src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = _mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + src_values4 = _mm_srai_epi16(src_values4, 5); + src_values5 = _mm_srai_epi16(src_values5, 5); + src_values6 = _mm_srai_epi16(src_values6, 5); + src_values7 = _mm_srai_epi16(src_values7, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = _mm_packus_epi16(src_values0, 
src_values4); + src_values1 = _mm_packus_epi16(src_values1, src_values5); + src_values2 = _mm_packus_epi16(src_values2, src_values6); + src_values3 = _mm_packus_epi16(src_values3, src_values7); + + /* loading 8-bit 8 pixels values */ + _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0); /* row=0*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1); /* row=1*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2); /* row=2*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3); /* row=3*/ + + + src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4])); + src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5])); + src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6])); + src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7])); + src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8)); + src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8)); + src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8)); + src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8)); + + src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 = _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + src_values4 = _mm_shuffle_epi8(src_values4, sm3); + src_values5 = _mm_shuffle_epi8(src_values5, sm3); + src_values6 = _mm_shuffle_epi8(src_values6, sm3); + src_values7 = _mm_shuffle_epi8(src_values7, sm3); + + + src_values0 = _mm_maddubs_epi16(src_values0, temp11); + src_values1 = _mm_maddubs_epi16(src_values1, temp12); + src_values2 = _mm_maddubs_epi16(src_values2, temp13); + src_values3 = _mm_maddubs_epi16(src_values3, temp14); + src_values4 = _mm_maddubs_epi16(src_values4, temp11); + src_values5 = _mm_maddubs_epi16(src_values5, temp12); + src_values6 = _mm_maddubs_epi16(src_values6, temp13); + 
src_values7 = _mm_maddubs_epi16(src_values7, temp14); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); + src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); + src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); + src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = _mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + src_values4 = _mm_srai_epi16(src_values4, 5); + src_values5 = _mm_srai_epi16(src_values5, 5); + src_values6 = _mm_srai_epi16(src_values6, 5); + src_values7 = _mm_srai_epi16(src_values7, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = _mm_packus_epi16(src_values0, src_values4); + src_values1 = _mm_packus_epi16(src_values1, src_values5); + src_values2 = _mm_packus_epi16(src_values2, src_values6); + src_values3 = _mm_packus_epi16(src_values3, src_values7); + + /* loading 8-bit 8 pixels values */ + _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0); /* row=4*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1); /* row=5*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2); /* row=6*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3); /* row=7*/ + + } + pu1_dst += 8 * dst_strd; + } + } + else if(nt == 8) + { + + + __m128i const_temp2_4x32b, const_temp3_4x32b; + __m128i src_values10, src_values11, intra_pred_ang_4x32b; + + __m128i row_4x32b, two_nt_4x32b, src_values12; + __m128i 
src_values0, src_values1, src_values2, src_values3; + __m128i src_values4, src_values5, src_values6, src_values7; + + + /* Intermediate reference samples for negative angle modes */ + /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ + ref_temp[two_nt - 1] = pu1_ref[two_nt + nt]; + temp1 = _mm_loadl_epi64((__m128i *)(pu1_ref + two_nt)); + + /* For negative angled derive the main reference samples from side */ + + src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref)); /*nt-(nt+15)*/ + + temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16)); + + src_values0 = _mm_shuffle_epi8(src_values0, temp11); + src_values0 = _mm_srli_si128(src_values0, 8); + _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1); + _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0); + + + + const_temp2_4x32b = _mm_set1_epi16(31); + const_temp3_4x32b = _mm_set1_epi16(32); + + + two_nt_4x32b = _mm_set1_epi16(1); + + + /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + + row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + + { + + WORD16 ref_main_idx[9]; + + __m128i res_temp5_4x32b; + __m128i fract1_8x16b, fract2_8x16b; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + /* fract = pos & (31); */ + src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); + + /* idx = pos >> 5; */ + src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /*(32 - fract) */ + src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); + + fract1_8x16b = _mm_slli_epi16(src_values11, 8); + fract2_8x16b = _mm_slli_epi16(src_values10, 8); + + src_values11 = _mm_or_si128(src_values11, fract1_8x16b); + src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ + + fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); + fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); + + temp1 
= _mm_shuffle_epi32(fract1_8x16b, 0x00); + temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); + temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); + temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); + + temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); + temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); + temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); + temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); + + _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); + + src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0])); /* col = 0-7 */ + src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1])); /* col = 8-15 */ + src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2])); /* col = 16-23 */ + src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3])); /* col = 24-31 */ + src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4])); /* col = 32-39 */ + src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5])); /* col = 40-47 */ + src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6])); /* col = 48-55 */ + src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7])); /* col = 56-63*/ + + src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 = _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + src_values4 = _mm_shuffle_epi8(src_values4, sm3); + src_values5 = _mm_shuffle_epi8(src_values5, sm3); + src_values6 = _mm_shuffle_epi8(src_values6, sm3); + src_values7 = _mm_shuffle_epi8(src_values7, sm3); + + + src_values0 = _mm_maddubs_epi16(src_values0, temp1); + src_values1 = _mm_maddubs_epi16(src_values1, temp2); + src_values2 = _mm_maddubs_epi16(src_values2, temp3); + src_values3 = _mm_maddubs_epi16(src_values3, temp4); + src_values4 = _mm_maddubs_epi16(src_values4, temp11); + src_values5 = _mm_maddubs_epi16(src_values5, temp12); + src_values6 = _mm_maddubs_epi16(src_values6, temp13); + 
src_values7 = _mm_maddubs_epi16(src_values7, temp14); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); + src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); + src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); + src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = _mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + src_values4 = _mm_srai_epi16(src_values4, 5); + src_values5 = _mm_srai_epi16(src_values5, 5); + src_values6 = _mm_srai_epi16(src_values6, 5); + src_values7 = _mm_srai_epi16(src_values7, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = _mm_packus_epi16(src_values0, src_values1); + src_values2 = _mm_packus_epi16(src_values2, src_values3); + src_values1 = _mm_srli_si128(src_values0, 8); + src_values3 = _mm_srli_si128(src_values2, 8); + src_values4 = _mm_packus_epi16(src_values4, src_values5); + src_values6 = _mm_packus_epi16(src_values6, src_values7); + src_values5 = _mm_srli_si128(src_values4, 8); + src_values7 = _mm_srli_si128(src_values6, 8); + + /* loading 8-bit 8 pixels values */ + _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0); /* row=0*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1); /* row=1*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2); /* row=2*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3); /* row=3*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), 
src_values4); /* row=4*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5); /* row=5*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6); /* row=6*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7); /* row=7*/ + } + } + else /* if nt =4*/ + { + + __m128i const_temp2_4x32b, const_temp3_4x32b, zero_8x16b; + __m128i src_values10, src_values11, intra_pred_ang_4x32b, sign_8x16b; + + __m128i row_4x32b, two_nt_4x32b, src_values12; + + + for(k = 0; k < (nt + 1); k++) + ref_temp[k + nt - 1] = pu1_ref[two_nt + k]; + ref_idx = (nt * intra_pred_ang) >> 5; + inv_ang_sum = 128; + + for(k = -1; k > ref_idx; k--) + { + inv_ang_sum += inv_ang; + ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)]; + } + + + const_temp2_4x32b = _mm_set1_epi32(31); + const_temp3_4x32b = _mm_set1_epi32(32); + zero_8x16b = _mm_setzero_si128(); + two_nt_4x32b = _mm_set1_epi32(1); + + + /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ + row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1); + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + + { + WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; + int temp11, temp21, temp31, temp41; + + + __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b; + __m128i src_values0, src_values1, src_values2, src_values3; + __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b); + res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b); + + /* fract = pos & (31); */ + src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); + + ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */ + ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */ + ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */ + ref_main_idx1 = 
_mm_cvtsi128_si32(src_values12); /* row=0*/ + ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/ + ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/ + ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/ + + /* idx = pos >> 5; */ + src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /*(32 - fract) */ + src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11); + + fract1_8x16b = _mm_slli_epi16(src_values11, 8); + fract2_8x16b = _mm_slli_epi16(src_values10, 8); + + src_values11 = _mm_or_si128(src_values11, fract1_8x16b); + src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ + + fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); + fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); + + temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); + temp2 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); + temp3 = _mm_shuffle_epi32(fract2_8x16b, 0x00); + temp4 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); + + src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col = 0-7 */ + src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col = 8-15 */ + src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col = 16-23 */ + src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col = 24-31 */ + + src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 = _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + + + src_values0 = _mm_maddubs_epi16(src_values0, temp1); + src_values1 = _mm_maddubs_epi16(src_values1, temp2); + src_values2 = _mm_maddubs_epi16(src_values2, temp3); + src_values3 = _mm_maddubs_epi16(src_values3, temp4); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + 
src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = _mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = _mm_packus_epi16(src_values0, src_values1); + src_values2 = _mm_packus_epi16(src_values2, src_values3); + src_values1 = _mm_srli_si128(src_values0, 8); + src_values3 = _mm_srli_si128(src_values2, 8); + + temp11 = _mm_cvtsi128_si32(src_values0); + temp21 = _mm_cvtsi128_si32(src_values1); + temp31 = _mm_cvtsi128_si32(src_values2); + temp41 = _mm_cvtsi128_si32(src_values3); + + /* loding 4-bit 8 pixels values */ + *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; + *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21; + *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31; + *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; + + } + } +} + + + + +/** +******************************************************************************* +* +* @brief +* Intra prediction interpolation filter for luma mode 27 to mode 33 +* +* @par Description: +* Intraprediction for mode 27 to 33 (positive angle, vertical mode ) with +* reference neighboring samples location pointed by 'pu1_ref' to the TU +* block location pointed by 'pu1_dst' +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] nt +* integer Transform Block size +* +* @param[in] mode +* integer intraprediction mode +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ + + +void ihevc_intra_pred_luma_mode_27_to_33_ssse3(UWORD8 *pu1_ref, + 
WORD32 src_strd, + UWORD8 *pu1_dst, + WORD32 dst_strd, + WORD32 nt, + WORD32 mode) +{ + WORD32 row; + WORD32 two_nt; + WORD32 intra_pred_ang; + + __m128i temp11, temp12, temp13, temp14; + + __m128i const_temp_8x16b; + __m128i temp1, temp2, temp3, temp4, sm3; + UNUSED(src_strd); + two_nt = 2 * nt; + intra_pred_ang = gai4_ihevc_ang_table[mode]; + + const_temp_8x16b = _mm_set1_epi16(16); + sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); + if(nt == 32) + { + + __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; + __m128i src_values10, src_values11, intra_pred_ang_4x32b; + __m128i row_4x32b, two_nt_4x32b, src_values12; + int col = 0; + + const_temp2_4x32b = _mm_set1_epi16(31); + const_temp3_4x32b = _mm_set1_epi16(32); + const_temp8_4x32b = _mm_set1_epi16(8); + + two_nt_4x32b = _mm_set1_epi16(two_nt + 1); + + /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + + row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + + for(row = 0; row < nt; row += 8) + { + + WORD16 ref_main_idx[9]; + + __m128i res_temp5_4x32b; + __m128i fract1_8x16b, fract2_8x16b; + __m128i src_values0, src_values1, src_values2, src_values3; + __m128i src_values4, src_values5, src_values6, src_values7; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + /* fract = pos & (31); */ + src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); + + /* idx = pos >> 5; */ + src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /*(32 - fract) */ + src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); + + fract1_8x16b = _mm_slli_epi16(src_values11, 8); + fract2_8x16b = _mm_slli_epi16(src_values10, 8); + + src_values11 = _mm_or_si128(src_values11, fract1_8x16b); + src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ + + fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); + fract1_8x16b = 
_mm_unpacklo_epi8(src_values11, src_values10); + + temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); + temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); + temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); + temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); + + temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); + temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); + temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); + temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); + + row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); + _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); + for(col = 0; col < nt; col += 16) + { + src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + col)); + src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + col)); + src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + col)); + src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + col)); + src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8 + col)); + src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8 + col)); + src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8 + col)); + src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8 + col)); + + src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 = _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + src_values4 = _mm_shuffle_epi8(src_values4, sm3); + src_values5 = _mm_shuffle_epi8(src_values5, sm3); + src_values6 = _mm_shuffle_epi8(src_values6, sm3); + src_values7 = _mm_shuffle_epi8(src_values7, sm3); + + + src_values0 = _mm_maddubs_epi16(src_values0, temp1); + src_values1 = _mm_maddubs_epi16(src_values1, temp2); + src_values2 = _mm_maddubs_epi16(src_values2, temp3); + src_values3 = _mm_maddubs_epi16(src_values3, temp4); + src_values4 = _mm_maddubs_epi16(src_values4, temp1); + src_values5 = _mm_maddubs_epi16(src_values5, 
temp2); + src_values6 = _mm_maddubs_epi16(src_values6, temp3); + src_values7 = _mm_maddubs_epi16(src_values7, temp4); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); + src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); + src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); + src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = _mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + src_values4 = _mm_srai_epi16(src_values4, 5); + src_values5 = _mm_srai_epi16(src_values5, 5); + src_values6 = _mm_srai_epi16(src_values6, 5); + src_values7 = _mm_srai_epi16(src_values7, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = _mm_packus_epi16(src_values0, src_values4); + src_values1 = _mm_packus_epi16(src_values1, src_values5); + src_values2 = _mm_packus_epi16(src_values2, src_values6); + src_values3 = _mm_packus_epi16(src_values3, src_values7); + + /* loading 8-bit 8 pixels values */ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0); /* row=0*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1); /* row=1*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2); /* row=2*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3); /* row=3*/ + + + src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + col)); + src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] 
+ col)); + src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + col)); + src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + col)); + src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8 + col)); + src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8 + col)); + src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8 + col)); + src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8 + col)); + + src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 = _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + src_values4 = _mm_shuffle_epi8(src_values4, sm3); + src_values5 = _mm_shuffle_epi8(src_values5, sm3); + src_values6 = _mm_shuffle_epi8(src_values6, sm3); + src_values7 = _mm_shuffle_epi8(src_values7, sm3); + + + src_values0 = _mm_maddubs_epi16(src_values0, temp11); + src_values1 = _mm_maddubs_epi16(src_values1, temp12); + src_values2 = _mm_maddubs_epi16(src_values2, temp13); + src_values3 = _mm_maddubs_epi16(src_values3, temp14); + src_values4 = _mm_maddubs_epi16(src_values4, temp11); + src_values5 = _mm_maddubs_epi16(src_values5, temp12); + src_values6 = _mm_maddubs_epi16(src_values6, temp13); + src_values7 = _mm_maddubs_epi16(src_values7, temp14); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); + src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); + src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); + src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ 
fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = _mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + src_values4 = _mm_srai_epi16(src_values4, 5); + src_values5 = _mm_srai_epi16(src_values5, 5); + src_values6 = _mm_srai_epi16(src_values6, 5); + src_values7 = _mm_srai_epi16(src_values7, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = _mm_packus_epi16(src_values0, src_values4); + src_values1 = _mm_packus_epi16(src_values1, src_values5); + src_values2 = _mm_packus_epi16(src_values2, src_values6); + src_values3 = _mm_packus_epi16(src_values3, src_values7); + + /* loading 8-bit 8 pixels values */ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0); /* row=4*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1); /* row=5*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2); /* row=6*/ + _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3); /* row=7*/ + + } + pu1_dst += 8 * dst_strd; + } + + } + else if(nt == 16) /* for nt = 16 case */ + { + + __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; + __m128i src_values10, src_values11, intra_pred_ang_4x32b; + __m128i row_4x32b, two_nt_4x32b, src_values12; + + + const_temp2_4x32b = _mm_set1_epi16(31); + const_temp3_4x32b = _mm_set1_epi16(32); + const_temp8_4x32b = _mm_set1_epi16(8); + + two_nt_4x32b = _mm_set1_epi16(two_nt + 1); + + /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + + row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + + for(row = 0; row < nt; row += 8) + { + + WORD16 ref_main_idx[9]; + + __m128i res_temp5_4x32b; + __m128i fract1_8x16b, fract2_8x16b; + __m128i src_values0, src_values1, src_values2, src_values3; + __m128i src_values4, src_values5, src_values6, src_values7; + + /* pos = ((row + 1) * 
intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + /* fract = pos & (31); */ + src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); + + /* idx = pos >> 5; */ + src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /*(32 - fract) */ + src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); + + fract1_8x16b = _mm_slli_epi16(src_values11, 8); + fract2_8x16b = _mm_slli_epi16(src_values10, 8); + + src_values11 = _mm_or_si128(src_values11, fract1_8x16b); + src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ + + fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); + fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); + + temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); + temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); + temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); + temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); + + temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); + temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); + temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); + temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); + + row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); + _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); + + { + src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0])); + src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1])); + src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2])); + src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3])); + src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8)); + src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8)); + src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8)); + src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8)); + + src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 
= _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + src_values4 = _mm_shuffle_epi8(src_values4, sm3); + src_values5 = _mm_shuffle_epi8(src_values5, sm3); + src_values6 = _mm_shuffle_epi8(src_values6, sm3); + src_values7 = _mm_shuffle_epi8(src_values7, sm3); + + + src_values0 = _mm_maddubs_epi16(src_values0, temp1); + src_values1 = _mm_maddubs_epi16(src_values1, temp2); + src_values2 = _mm_maddubs_epi16(src_values2, temp3); + src_values3 = _mm_maddubs_epi16(src_values3, temp4); + src_values4 = _mm_maddubs_epi16(src_values4, temp1); + src_values5 = _mm_maddubs_epi16(src_values5, temp2); + src_values6 = _mm_maddubs_epi16(src_values6, temp3); + src_values7 = _mm_maddubs_epi16(src_values7, temp4); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); + src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); + src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); + src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = _mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + src_values4 = _mm_srai_epi16(src_values4, 5); + src_values5 = _mm_srai_epi16(src_values5, 5); + src_values6 = _mm_srai_epi16(src_values6, 5); + src_values7 = _mm_srai_epi16(src_values7, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = _mm_packus_epi16(src_values0, src_values4); + src_values1 = _mm_packus_epi16(src_values1, src_values5); + src_values2 = 
_mm_packus_epi16(src_values2, src_values6); + src_values3 = _mm_packus_epi16(src_values3, src_values7); + + /* loading 8-bit 8 pixels values */ + _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0); /* row=0*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1); /* row=1*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2); /* row=2*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3); /* row=3*/ + + + src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4])); + src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5])); + src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6])); + src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7])); + src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8)); + src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8)); + src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8)); + src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8)); + + src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 = _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + src_values4 = _mm_shuffle_epi8(src_values4, sm3); + src_values5 = _mm_shuffle_epi8(src_values5, sm3); + src_values6 = _mm_shuffle_epi8(src_values6, sm3); + src_values7 = _mm_shuffle_epi8(src_values7, sm3); + + + src_values0 = _mm_maddubs_epi16(src_values0, temp11); + src_values1 = _mm_maddubs_epi16(src_values1, temp12); + src_values2 = _mm_maddubs_epi16(src_values2, temp13); + src_values3 = _mm_maddubs_epi16(src_values3, temp14); + src_values4 = _mm_maddubs_epi16(src_values4, temp11); + src_values5 = _mm_maddubs_epi16(src_values5, temp12); + src_values6 = _mm_maddubs_epi16(src_values6, temp13); + src_values7 = _mm_maddubs_epi16(src_values7, temp14); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract 
* pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); + src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); + src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); + src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = _mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + src_values4 = _mm_srai_epi16(src_values4, 5); + src_values5 = _mm_srai_epi16(src_values5, 5); + src_values6 = _mm_srai_epi16(src_values6, 5); + src_values7 = _mm_srai_epi16(src_values7, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = _mm_packus_epi16(src_values0, src_values4); + src_values1 = _mm_packus_epi16(src_values1, src_values5); + src_values2 = _mm_packus_epi16(src_values2, src_values6); + src_values3 = _mm_packus_epi16(src_values3, src_values7); + + /* loading 8-bit 8 pixels values */ + _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0); /* row=4*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1); /* row=5*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2); /* row=6*/ + _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3); /* row=7*/ + + } + pu1_dst += 8 * dst_strd; + } + + } + else if(nt == 8) + { + + __m128i const_temp2_4x32b, const_temp3_4x32b; + __m128i src_values10, src_values11, intra_pred_ang_4x32b; + __m128i row_4x32b, two_nt_4x32b, src_values12; + + + const_temp2_4x32b = _mm_set1_epi16(31); + const_temp3_4x32b = _mm_set1_epi16(32); + + two_nt_4x32b = 
_mm_set1_epi16(two_nt + 1); + + + /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + + row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + + //for(row = 0; row < nt; row +=4) + { + + WORD16 ref_main_idx[9]; + + __m128i res_temp5_4x32b; + __m128i fract1_8x16b, fract2_8x16b; + __m128i src_values0, src_values1, src_values2, src_values3; + __m128i src_values4, src_values5, src_values6, src_values7; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + + /* fract = pos & (31); */ + src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); + + /* idx = pos >> 5; */ + src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /*(32 - fract) */ + src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); + + fract1_8x16b = _mm_slli_epi16(src_values11, 8); + fract2_8x16b = _mm_slli_epi16(src_values10, 8); + + src_values11 = _mm_or_si128(src_values11, fract1_8x16b); + src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ + + fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); + fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); + + temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); + temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); + temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); + temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); + + temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); + temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); + temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); + temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); + + _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); + + src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0])); /* col = 0-7 */ + src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1])); /* col = 8-15 */ + src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2])); /* col = 16-23 */ + src_values3 = 
_mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3])); /* col = 24-31 */ + src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4])); /* col = 32-39 */ + src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5])); /* col = 40-47 */ + src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6])); /* col = 48-55 */ + src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7])); /* col = 56-63*/ + + src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 = _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + src_values4 = _mm_shuffle_epi8(src_values4, sm3); + src_values5 = _mm_shuffle_epi8(src_values5, sm3); + src_values6 = _mm_shuffle_epi8(src_values6, sm3); + src_values7 = _mm_shuffle_epi8(src_values7, sm3); + + + src_values0 = _mm_maddubs_epi16(src_values0, temp1); + src_values1 = _mm_maddubs_epi16(src_values1, temp2); + src_values2 = _mm_maddubs_epi16(src_values2, temp3); + src_values3 = _mm_maddubs_epi16(src_values3, temp4); + src_values4 = _mm_maddubs_epi16(src_values4, temp11); + src_values5 = _mm_maddubs_epi16(src_values5, temp12); + src_values6 = _mm_maddubs_epi16(src_values6, temp13); + src_values7 = _mm_maddubs_epi16(src_values7, temp14); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); + src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); + src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); + src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = 
_mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + src_values4 = _mm_srai_epi16(src_values4, 5); + src_values5 = _mm_srai_epi16(src_values5, 5); + src_values6 = _mm_srai_epi16(src_values6, 5); + src_values7 = _mm_srai_epi16(src_values7, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = _mm_packus_epi16(src_values0, src_values1); + src_values2 = _mm_packus_epi16(src_values2, src_values3); + src_values1 = _mm_srli_si128(src_values0, 8); + src_values3 = _mm_srli_si128(src_values2, 8); + src_values4 = _mm_packus_epi16(src_values4, src_values5); + src_values6 = _mm_packus_epi16(src_values6, src_values7); + src_values5 = _mm_srli_si128(src_values4, 8); + src_values7 = _mm_srli_si128(src_values6, 8); + + /* loading 8-bit 8 pixels values */ + _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0); /* row=0*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1); /* row=1*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2); /* row=2*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3); /* row=3*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4); /* row=4*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5); /* row=5*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6); /* row=6*/ + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7); /* row=7*/ + } + + } + else /* if nt =4*/ + { + + __m128i const_temp2_4x32b, const_temp3_4x32b, zero_8x16b; + __m128i src_values10, src_values11, intra_pred_ang_4x32b; + + __m128i row_4x32b, two_nt_4x32b, src_values12, sign_8x16b; + + + const_temp2_4x32b = _mm_set1_epi32(31); + const_temp3_4x32b = _mm_set1_epi32(32); + zero_8x16b = _mm_setzero_si128(); + two_nt_4x32b = _mm_set1_epi32(two_nt + 1); + + + /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ + row_4x32b = 
_mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1); + intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); + { + int temp11, temp21, temp31, temp41; + + WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; + + __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b; + __m128i src_values0, src_values1, src_values2, src_values3; + __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2; + + /* pos = ((row + 1) * intra_pred_ang); */ + res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); + sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b); + res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b); + + /* fract = pos & (31); */ + src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); + + ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */ + ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */ + ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */ + ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/ + ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/ + ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/ + ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/ + + /* idx = pos >> 5; */ + src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); + + /*(32 - fract) */ + src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11); + + fract1_8x16b = _mm_slli_epi16(src_values11, 8); + fract2_8x16b = _mm_slli_epi16(src_values10, 8); + + src_values11 = _mm_or_si128(src_values11, fract1_8x16b); + src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ + + fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); + fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); + + temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); + temp2 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); + temp3 = _mm_shuffle_epi32(fract2_8x16b, 0x00); + temp4 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); + + src_values0 = 
_mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* col = 0-7 */ + src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2)); /* col = 8-15 */ + src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3)); /* col = 16-23 */ + src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4)); /* col = 24-31 */ + + src_values0 = _mm_shuffle_epi8(src_values0, sm3); + src_values1 = _mm_shuffle_epi8(src_values1, sm3); + src_values2 = _mm_shuffle_epi8(src_values2, sm3); + src_values3 = _mm_shuffle_epi8(src_values3, sm3); + + src_values0 = _mm_maddubs_epi16(src_values0, temp1); + src_values1 = _mm_maddubs_epi16(src_values1, temp2); + src_values2 = _mm_maddubs_epi16(src_values2, temp3); + src_values3 = _mm_maddubs_epi16(src_values3, temp4); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ + src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); + src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); + src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); + src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); + + /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ + src_values0 = _mm_srai_epi16(src_values0, 5); + src_values1 = _mm_srai_epi16(src_values1, 5); + src_values2 = _mm_srai_epi16(src_values2, 5); + src_values3 = _mm_srai_epi16(src_values3, 5); + + /* converting 16 bit to 8 bit */ + src_values0 = _mm_packus_epi16(src_values0, src_values1); + src_values2 = _mm_packus_epi16(src_values2, src_values3); + src_values1 = _mm_srli_si128(src_values0, 8); + src_values3 = _mm_srli_si128(src_values2, 8); + + temp11 = _mm_cvtsi128_si32(src_values0); + temp21 = _mm_cvtsi128_si32(src_values1); + temp31 = _mm_cvtsi128_si32(src_values2); + temp41 = _mm_cvtsi128_si32(src_values3); + + /* loding 4-bit 8 pixels values */ + *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; + *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21; + *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = 
temp31; + *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; + + } + } +} |