/****************************************************************************** * * Copyright (C) 2015 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ***************************************************************************** * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore */ /** ******************************************************************************* * @file * ih264_chroma_intra_pred_filters_ssse3.c * * @brief * Contains function definitions for chroma intra prediction filters in x86 * intrinsics * * @author * Ittiam * * @par List of Functions: * -ih264_intra_pred_chroma_8x8_mode_horz_ssse3 * -ih264_intra_pred_chroma_8x8_mode_vert_ssse3 * -ih264_intra_pred_chroma_8x8_mode_plane_ssse3 * * @remarks * None * ******************************************************************************* */ /*****************************************************************************/ /* File Includes */ /*****************************************************************************/ /* System include files */ #include #include #include /* User include files */ #include "ih264_defs.h" #include "ih264_typedefs.h" #include "ih264_macros.h" #include "ih264_platform_macros.h" #include "ih264_intra_pred_filters.h" /*****************************************************************************/ /* Chroma Intra prediction 8x8 filters */ /*****************************************************************************/ /** 
******************************************************************************* * * ih264_intra_pred_chroma_8x8_mode_horz_ssse3 * * @brief * Perform Intra prediction for chroma_8x8 mode:Horizontal * * @par Description: * Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2 * * @param[in] pu1_src * UWORD8 pointer to the source containing alternate U and V samples * * @param[out] pu1_dst * UWORD8 pointer to the destination with alternate U and V samples * * @param[in] src_strd * integer source stride * * @param[in] dst_strd * integer destination stride * * @param[in] ngbr_avail * availability of neighbouring pixels(Not used in this function) * * @returns * * @remarks * None * ****************************************************************************** */ void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src, UWORD8 *pu1_dst, WORD32 src_strd, WORD32 dst_strd, WORD32 ngbr_avail) { UWORD8 *pu1_left; /* Pointer to start of top predictors */ WORD32 dst_strd2; __m128i row1_16x8b, row2_16x8b; UNUSED(src_strd); UNUSED(ngbr_avail); pu1_left = pu1_src + 2 * BLK8x8SIZE - 2; dst_strd2 = dst_strd << 1; row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left))); row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 2))); _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); pu1_dst += dst_strd2; row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 4))); row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 6))); _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); pu1_dst += dst_strd2; row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 8))); row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 10))); _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); pu1_dst += dst_strd2; row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 12))); row2_16x8b = _mm_set1_epi16(*((WORD16 
*)(pu1_left - 14))); _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); } /** ******************************************************************************* * * ih264_intra_pred_chroma_8x8_mode_vert_ssse3 * * @brief * Perform Intra prediction for chroma_8x8 mode:vertical * * @par Description: * Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3 * * @param[in] pu1_src * UWORD8 pointer to the source containing alternate U and V samples * * @param[out] pu1_dst * UWORD8 pointer to the destination with alternate U and V samples * * @param[in] src_strd * integer source stride * * @param[in] dst_strd * integer destination stride * * @param[in] ngbr_avail * availability of neighbouring pixels(Not used in this function) * * @returns * * @remarks * None * ******************************************************************************* */ void ih264_intra_pred_chroma_8x8_mode_vert_ssse3(UWORD8 *pu1_src, UWORD8 *pu1_dst, WORD32 src_strd, WORD32 dst_strd, WORD32 ngbr_avail) { UWORD8 *pu1_top; /* Pointer to start of top predictors */ WORD32 dst_strd2; __m128i top_16x8b; UNUSED(src_strd); UNUSED(ngbr_avail); pu1_top = pu1_src + 2 * BLK8x8SIZE + 2; top_16x8b = _mm_loadu_si128((__m128i *)pu1_top); dst_strd2 = dst_strd << 1; _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); pu1_dst += dst_strd2; _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); pu1_dst += dst_strd2; _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); pu1_dst += dst_strd2; _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); } /** ******************************************************************************* * * ih264_intra_pred_chroma_8x8_mode_plane_ssse3 * * @brief * Perform Intra 
prediction for chroma_8x8 mode:PLANE
 *
 * @par Description:
 *  Perform Intra prediction for chroma_8x8 mode:PLANE, described in
 *  sec 8.3.4.4. Horizontal and vertical gradients are computed separately for
 *  the U and V samples from the neighbouring pixels, converted to the plane
 *  parameters a, b and c, and every output sample is then evaluated as
 *  (a + b * (x - 3) + c * (y - 3) + 16) >> 5, saturated to 8 bits.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source containing alternate U and V samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination with alternate U and V samples
 *
 * @param[in] src_strd
 *  integer source stride (Not used in this function)
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *  None
 *
 * @remarks
 *  Scalar scaling uses multiplications and an explicit 16-bit sign-extending
 *  cast rather than left shifts, because the gradients can be negative and
 *  left-shifting a negative (or overflowing) signed value is undefined in C.
 *
 ******************************************************************************
 */
void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd,
                                                  WORD32 dst_strd,
                                                  WORD32 ngbr_avail)
{
    const UWORD8 *pu1_left, *pu1_top;
    WORD32 a_u, a_v, b_u, b_v, c_u, c_v;
    __m128i mul_8x16b, shuffle_8x16b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_top = pu1_src + MB_SIZE + 2;
    pu1_left = pu1_src + MB_SIZE - 2;

    /* Gradient weights 1..4, once for the U lanes and once for the V lanes */
    mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);

    /* Byte-shuffle control: the low byte of each lane selects a source byte */
    /* (even bytes first, then odd bytes) and the 0xff high byte zeroes the  */
    /* upper half, i.e. de-interleave and zero-extend 8-bit to 16-bit        */
    shuffle_8x16b = _mm_setr_epi16(0xff00, 0xff02, 0xff04, 0xff06,
                                   0xff01, 0xff03, 0xff05, 0xff07);

    /* calculating a, b and c */
    {
        WORD32 h_u, h_v, v_u, v_v;

        __m128i h_val1_16x8b, h_val2_16x8b;
        __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
        __m128i v_val1_16x8b, v_val2_16x8b;
        __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b;
        __m128i hv_val_4x32b;

        /* Four sample pairs from each side of the centre, along both edges */
        h_val1_16x8b = _mm_loadl_epi64((const __m128i *)(pu1_top + 8));
        h_val2_16x8b = _mm_loadl_epi64((const __m128i *)(pu1_top - 2));
        v_val1_16x8b = _mm_loadl_epi64((const __m128i *)(pu1_left - 14));
        v_val2_16x8b = _mm_loadl_epi64((const __m128i *)(pu1_left - 4));

        /* reversing the order of the four 16-bit pairs (0x1b = 3,2,1,0) so  */
        /* that both operands of each subtraction run outwards from centre   */
        h_val2_16x8b = _mm_shufflelo_epi16(h_val2_16x8b, 0x1b);
        v_val1_16x8b = _mm_shufflelo_epi16(v_val1_16x8b, 0x1b);

        /* separating u and v and 8-bit to 16-bit conversion */
        h_val1_8x16b = _mm_shuffle_epi8(h_val1_16x8b, shuffle_8x16b);
        h_val2_8x16b = _mm_shuffle_epi8(h_val2_16x8b, shuffle_8x16b);
        v_val1_8x16b = _mm_shuffle_epi8(v_val1_16x8b, shuffle_8x16b);
        v_val2_8x16b = _mm_shuffle_epi8(v_val2_16x8b, shuffle_8x16b);

        h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b);
        v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b);

        /* weighted sums: madd yields two partial sums per component, hadd   */
        /* folds them, leaving {h_u, h_v, v_u, v_v} as four 32-bit lanes     */
        h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
        v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);

        hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);

        a_u = (pu1_left[7 * (-2)] + pu1_top[14]) * 16;
        a_v = (pu1_left[7 * (-2) + 1] + pu1_top[15]) * 16;

        /* _mm_extract_epi16 returns the low 16 bits of each sum zero-       */
        /* extended; each sum is at most 10 * 255 in magnitude, so the       */
        /* WORD16 cast recovers its sign. The result is then doubled.        */
        h_u = 2 * (WORD16)_mm_extract_epi16(hv_val_4x32b, 0);
        h_v = 2 * (WORD16)_mm_extract_epi16(hv_val_4x32b, 2);
        v_u = 2 * (WORD16)_mm_extract_epi16(hv_val_4x32b, 4);
        v_v = 2 * (WORD16)_mm_extract_epi16(hv_val_4x32b, 6);

        /* 17 * (2 * sum) = 34 * sum, rounded and scaled down by 64 */
        b_u = (17 * h_u + 32) >> 6;
        b_v = (17 * h_v + 32) >> 6;
        c_u = (17 * v_u + 32) >> 6;
        c_v = (17 * v_v + 32) >> 6;
    }

    /* using a, b and c to compute the fitted plane values */
    {
        __m128i const_8x16b, c2_8x16b;
        __m128i res1_l_8x16b, res1_h_8x16b;
        __m128i res2_l_8x16b, res2_h_8x16b;
        __m128i res1_16x8b, res2_16x8b;

        WORD32 b_u2, b_v2, b_u3, b_v3;
        WORD32 const_u, const_v;
        WORD32 row;

        /* Row 0 offset: a + c * (0 - 3) + rounding term */
        const_u = a_u - 3 * c_u + 16;
        const_v = a_v - 3 * c_v + 16;

        b_u2 = 2 * b_u;
        b_v2 = 2 * b_v;
        b_u3 = 3 * b_u;
        b_v3 = 3 * b_v;

        const_8x16b = _mm_setr_epi16(const_u, const_v, const_u, const_v,
                                     const_u, const_v, const_u, const_v);

        /* contains {-b*3, -b*2, -b*1, b*0} for interleaved u, v */
        res1_l_8x16b = _mm_setr_epi16(-b_u3, -b_v3, -b_u2, -b_v2,
                                      -b_u, -b_v, 0, 0);
        /* contains {b*1, b*2, b*3, b*4} for interleaved u, v */
        res1_h_8x16b = _mm_setr_epi16(b_u, b_v, b_u2, b_v2,
                                      b_u3, b_v3, 4 * b_u, 4 * b_v);

        c2_8x16b = _mm_setr_epi16(c_u, c_v, c_u, c_v, c_u, c_v, c_u, c_v);

        /* unshifted accumulators for row 0 (res1) and row 1 (res2) */
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b);
        res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);

        /* from here on each accumulator advances two rows at a time: 2 * c */
        c2_8x16b = _mm_slli_epi16(c2_8x16b, 1);

        for(row = 0; row < 8; row += 2)
        {
            UWORD8 *pu1_row = pu1_dst + row * dst_strd;

            /* >> 5 completes the plane formula; packus saturates to 0..255 */
            res1_16x8b = _mm_packus_epi16(_mm_srai_epi16(res1_l_8x16b, 5),
                                          _mm_srai_epi16(res1_h_8x16b, 5));
            res2_16x8b = _mm_packus_epi16(_mm_srai_epi16(res2_l_8x16b, 5),
                                          _mm_srai_epi16(res2_h_8x16b, 5));

            _mm_storeu_si128((__m128i *)pu1_row, res1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_row + dst_strd), res2_16x8b);

            /* step to the next pair of rows (unused after the last pair) */
            res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
            res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
            res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
            res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
        }
    }
}