summaryrefslogtreecommitdiffstats
path: root/common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c
diff options
context:
space:
mode:
Diffstat (limited to 'common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c')
-rw-r--r--common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c486
1 files changed, 486 insertions, 0 deletions
diff --git a/common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c b/common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c
new file mode 100644
index 0000000..1de4253
--- /dev/null
+++ b/common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c
@@ -0,0 +1,486 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_chroma_intra_pred_filters_sse42_intr.c
+*
+* @brief
+* Contains function definitions for chroma intra prediction (planar and DC)
+*
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* ihevc_intra_pred_chroma_planar_sse42()
+*
+* ihevc_intra_pred_chroma_dc_sse42()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_chroma_intra_pred.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_tables_x86_intr.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+
+/****************************************************************************/
+/* Constant Macros */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+#define T16C_4NT 64
+#define T8C_4NT 32
+/****************************************************************************/
+/* Function Macros */
+/****************************************************************************/
+
+/* Evaluates to 1 when bit number x of y is set and to 0 otherwise.        */
+/* Both arguments and the whole expansion are parenthesized so the macro   */
+/* can be used safely inside larger expressions; x is evaluated only once. */
+#define GET_BIT(y,x) ((((y) & (1 << (x))) != 0))
+
+/* tables to shuffle 8-bit values */
+
+/*****************************************************************************/
+/* Function Definition */
+/*****************************************************************************/
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Planar intra prediction for a chroma block whose two components are stored
+*  interleaved. Reference samples are read from 'pu1_ref' and the predicted
+*  block is written to 'pu1_dst'. Refer to section 8.4.4.2.4 in the standard
+*
+* @par Description:
+*  Every sample is a byte pair (even byte = first component, odd byte =
+*  second component). For each component, with row and col in [0, nt):
+*
+*    pred[row][col] = ((nt - 1 - col) x ref[two_nt - 1 - row]
+*                    + (col + 1)      x ref[three_nt + 1]
+*                    + (nt - 1 - row) x ref[two_nt + 1 + col]
+*                    + (row + 1)      x ref[nt - 1]
+*                    + nt) >> (log2nt + 1)
+*
+*  where ref[k] is the byte pair at pu1_ref[2k] and pu1_ref[2k + 1].
+*  Each output row is 2 x nt bytes wide and is produced 8 bytes (4 sample
+*  pairs) at a time. Nothing is written when nt is not a multiple of 4.
+*  log2nt is 4, 3 and 2 for nt = 16, 8 and 4; any other nt uses 5.
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the interleaved reference samples
+*
+* @param[in] src_strd
+*  integer source stride (unused)
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] dst_strd
+*  integer destination stride, in bytes
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode (unused)
+*
+* @returns
+*  None
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_planar_sse42(UWORD8 *pu1_ref,
+                                          WORD32 src_strd,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 dst_strd,
+                                          WORD32 nt,
+                                          WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 log2nt = 5;
+    WORD32 two_nt, three_nt;
+    WORD32 ref_a, ref_b;
+
+    __m128i top_right_8x16b, bottom_left_8x16b;
+    __m128i nt_8x16b, nt_minus1_8x16b, four_8x16b, zero_8x16b;
+    __m128i wt_left_init_8x16b, wt_right_init_8x16b;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    /* The vector loop below only handles block sizes that are multiples of 4 */
+    if(nt % 4 != 0)
+    {
+        return;
+    }
+
+    switch(nt)
+    {
+        case 16:
+            log2nt = 4;
+            break;
+        case 8:
+            log2nt = 3;
+            break;
+        case 4:
+            log2nt = 2;
+            break;
+        default:
+            break;
+    }
+    two_nt = 2 * nt;
+    three_nt = 3 * nt;
+
+    /* Byte pair at ref[three_nt + 1], replicated as 16-bit lanes a,b,a,b,... */
+    ref_a = pu1_ref[2 * (three_nt + 1)];
+    ref_b = pu1_ref[2 * (three_nt + 1) + 1];
+    top_right_8x16b = _mm_set_epi16(ref_b, ref_a, ref_b, ref_a,
+                                    ref_b, ref_a, ref_b, ref_a);
+
+    /* Byte pair at ref[nt - 1], replicated the same way */
+    ref_a = pu1_ref[2 * (nt - 1)];
+    ref_b = pu1_ref[2 * (nt - 1) + 1];
+    bottom_left_8x16b = _mm_set_epi16(ref_b, ref_a, ref_b, ref_a,
+                                      ref_b, ref_a, ref_b, ref_a);
+
+    nt_minus1_8x16b = _mm_set1_epi16(nt - 1);
+    nt_8x16b = _mm_set1_epi16(nt);
+    four_8x16b = _mm_set1_epi16(4);
+    zero_8x16b = _mm_set1_epi32(0);
+
+    /* Column weights for the first 4 sample pairs of a row; each weight is */
+    /* duplicated because both components of a pair share the same column.  */
+    /* (nt - 1 - col) for col = 0..3 */
+    wt_left_init_8x16b = _mm_sub_epi16(nt_minus1_8x16b,
+                                       _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0));
+    /* (col + 1) for col = 0..3 */
+    wt_right_init_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
+
+    for(row = 0; row < nt; row++)
+    {
+        __m128i left_8x16b, row_wt_8x16b, row_base_8x16b;
+        __m128i wt_left_8x16b, wt_right_8x16b;
+
+        /* Byte pair at ref[two_nt - 1 - row], replicated as a,b,a,b,... */
+        ref_a = pu1_ref[2 * (two_nt - 1 - row)];
+        ref_b = pu1_ref[2 * (two_nt - 1 - row) + 1];
+        left_8x16b = _mm_set_epi16(ref_b, ref_a, ref_b, ref_a,
+                                   ref_b, ref_a, ref_b, ref_a);
+
+        /* (nt - 1 - row): weight of ref[two_nt + 1 + col] */
+        row_wt_8x16b = _mm_set1_epi16(nt - 1 - row);
+
+        /* (row + 1) * ref[nt - 1] + nt : identical for every column */
+        row_base_8x16b = _mm_mullo_epi16(_mm_set1_epi16(row + 1), bottom_left_8x16b);
+        row_base_8x16b = _mm_add_epi16(row_base_8x16b, nt_8x16b);
+
+        wt_left_8x16b = wt_left_init_8x16b;
+        wt_right_8x16b = wt_right_init_8x16b;
+
+        for(col = 0; col < 2 * nt; col += 8)
+        {
+            __m128i top_8x16b, sum_8x16b;
+
+            /* Load only the 8 bytes (4 sample pairs) consumed here and */
+            /* widen them to 16 bit                                     */
+            top_8x16b = _mm_loadl_epi64((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));
+            top_8x16b = _mm_cvtepu8_epi16(top_8x16b);
+
+            /* (nt - 1 - row) * ref[two_nt + 1 + col] */
+            sum_8x16b = _mm_mullo_epi16(top_8x16b, row_wt_8x16b);
+            sum_8x16b = _mm_add_epi16(row_base_8x16b, sum_8x16b);
+
+            /* (col + 1) * ref[three_nt + 1] */
+            sum_8x16b = _mm_add_epi16(sum_8x16b,
+                                      _mm_mullo_epi16(top_right_8x16b, wt_right_8x16b));
+
+            /* (nt - 1 - col) * ref[two_nt - 1 - row] */
+            sum_8x16b = _mm_add_epi16(sum_8x16b,
+                                      _mm_mullo_epi16(left_8x16b, wt_left_8x16b));
+
+            /* Normalize, narrow back to 8 bit and store 4 sample pairs */
+            sum_8x16b = _mm_srli_epi16(sum_8x16b, (log2nt + 1));
+            sum_8x16b = _mm_packus_epi16(sum_8x16b, zero_8x16b);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), sum_8x16b);
+
+            /* Advance the column weights by 4 columns */
+            wt_left_8x16b = _mm_sub_epi16(wt_left_8x16b, four_8x16b);
+            wt_right_8x16b = _mm_add_epi16(wt_right_8x16b, four_8x16b);
+        } /* inner loop ends here */
+    }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  DC intra prediction for a chroma block whose U and V samples are stored
+*  interleaved. Reference samples are read from 'pu1_ref' and the predicted
+*  block is written to 'pu1_dst'. Refer to section 8.4.4.2.5 in the standard
+*
+* @par Description:
+*  For nt = 4, 8 or 16 the bytes pu1_ref[2 * nt] .. pu1_ref[6 * nt - 1] are
+*  summed per component. The pair at pu1_ref[6 * nt] is then added and the
+*  pair at pu1_ref[4 * nt] is subtracted, and each component is computed as
+*  (sum + nt) >> (log2nt + 1). Every destination row is filled with the
+*  resulting (U, V) byte pair. For any other nt the vector sum is skipped
+*  (it stays 0) and the rows are filled through the 32-bytes-per-row path.
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the interleaved reference samples
+*
+* @param[in] src_strd
+*  integer source stride (unused)
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] dst_strd
+*  integer destination stride, in bytes
+*
+* @param[in] nt
+*  integer Transform Block size (Chroma)
+*
+* @param[in] mode
+*  integer intraprediction mode (unused)
+*
+* @returns
+*  None
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_dc_sse42(UWORD8 *pu1_ref,
+                                      WORD32 src_strd,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 dst_strd,
+                                      WORD32 nt,
+                                      WORD32 mode)
+{
+    WORD32 sum_u = 0;
+    WORD32 sum_v = 0;
+    WORD32 dc_u, dc_v, dc_pair;
+    WORD32 shift;
+    WORD32 row, r;
+    __m128i fill_8x16b, shuffle_mask;
+    __m128i zero_8x16b = _mm_set1_epi32(0);
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    /* log2nt + 1; any size other than 16, 8 or 4 uses log2nt = 5 */
+    if(16 == nt)
+    {
+        shift = 4 + 1;
+    }
+    else if(8 == nt)
+    {
+        shift = 3 + 1;
+    }
+    else if(4 == nt)
+    {
+        shift = 2 + 1;
+    }
+    else
+    {
+        shift = 5 + 1;
+    }
+
+    /* NOTE(review): the table is defined outside this file; presumably it */
+    /* regroups the 16-bit lanes so that U sums and V sums end up in       */
+    /* separate halves before the horizontal adds - confirm against table  */
+    shuffle_mask = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);
+
+    /* Calculate DC value for the transform block */
+    if((16 == nt) || (8 == nt) || (4 == nt))
+    {
+        WORD32 ofs;
+        __m128i sum_8x16b = zero_8x16b;
+
+        /* 4 * nt reference bytes starting at pu1_ref[2 * nt], consumed in */
+        /* 16-byte chunks: 1, 2 or 4 chunks for nt = 4, 8 or 16            */
+        for(ofs = 0; ofs < 4 * nt; ofs += 16)
+        {
+            __m128i bytes_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + ofs));
+
+            /* Widen the low and the high 8 bytes to 16 bit and accumulate */
+            sum_8x16b = _mm_add_epi16(sum_8x16b, _mm_cvtepu8_epi16(bytes_16x8b));
+            sum_8x16b = _mm_add_epi16(sum_8x16b,
+                                      _mm_cvtepu8_epi16(_mm_srli_si128(bytes_16x8b, 8)));
+        }
+
+        /* Reduce the 8 partial sums: lane 0 -> U total, lane 1 -> V total */
+        sum_8x16b = _mm_shuffle_epi8(sum_8x16b, shuffle_mask);
+        sum_8x16b = _mm_hadd_epi16(sum_8x16b, zero_8x16b);
+        sum_8x16b = _mm_hadd_epi16(sum_8x16b, zero_8x16b);
+
+        sum_8x16b = _mm_cvtepi16_epi32(sum_8x16b);
+        sum_u = _mm_cvtsi128_si32(sum_8x16b);
+        sum_v = _mm_cvtsi128_si32(_mm_srli_si128(sum_8x16b, 4)); /* Next 32 bits */
+    }
+
+    /* The summed range contains the pair at 4 * nt but not the pair at */
+    /* 6 * nt: add the latter and take the former back out              */
+    sum_u = sum_u + pu1_ref[6 * nt] - pu1_ref[4 * nt];
+    sum_v = sum_v + pu1_ref[6 * nt + 1] - pu1_ref[4 * nt + 1];
+
+    dc_u = (sum_u + nt) >> shift;
+    dc_v = (sum_v + nt) >> shift;
+
+    /* One 16-bit lane holds the (U, V) byte pair: U in the low byte */
+    dc_pair = dc_u | (dc_v << 8);
+    fill_8x16b = _mm_set1_epi16(dc_pair);
+
+    /* Fill all the rows with the DC value */
+    if(4 == nt)
+    {
+        /* 8 bytes per row, 4 rows */
+        for(r = 0; r < 4; r++)
+        {
+            _mm_storel_epi64((__m128i *)(pu1_dst + (r * dst_strd)), fill_8x16b);
+        }
+    }
+    else if(8 == nt)
+    {
+        /* 16 bytes per row, 8 rows */
+        for(r = 0; r < 8; r++)
+        {
+            _mm_storeu_si128((__m128i *)(pu1_dst + (r * dst_strd)), fill_8x16b);
+        }
+    }
+    else /* nt == 16 */
+    {
+        /* 32 bytes per row, written in groups of 8 rows */
+        for(row = 0; row < nt; row += 8)
+        {
+            for(r = 0; r < 8; r++)
+            {
+                _mm_storeu_si128((__m128i *)(pu1_dst + (r * dst_strd)), fill_8x16b);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (r * dst_strd)), fill_8x16b);
+            }
+
+            pu1_dst += 8 * dst_strd;
+        }
+    }
+}