summaryrefslogtreecommitdiffstats
path: root/decoder/x86/ihevcd_it_rec_dc_sse42_intr.c
diff options
context:
space:
mode:
Diffstat (limited to 'decoder/x86/ihevcd_it_rec_dc_sse42_intr.c')
-rw-r--r--decoder/x86/ihevcd_it_rec_dc_sse42_intr.c401
1 files changed, 401 insertions, 0 deletions
diff --git a/decoder/x86/ihevcd_it_rec_dc_sse42_intr.c b/decoder/x86/ihevcd_it_rec_dc_sse42_intr.c
new file mode 100644
index 0000000..55fa21b
--- /dev/null
+++ b/decoder/x86/ihevcd_it_rec_dc_sse42_intr.c
@@ -0,0 +1,401 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_it_rec_dc_x86_intr.c
+*
+* @brief
+* Platform specific intrinsic implementation of certain functions
+*
+* @author
+* Ittiam
+* @par List of Functions:
+* - ihevcd_itrans_recon_dc
+* - ihevcd_fmt_conv_420sp_to_420p
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevcd_function_selector.h"
+
+#include <immintrin.h>
+
+
+void ihevcd_itrans_recon_dc_luma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
+ WORD32 log2_trans_size, WORD16 i2_coeff_value)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_5;
+ __m128i m_temp_reg_6;
+ __m128i m_temp_reg_7;
+ __m128i m_temp_reg_8;
+ __m128i m_temp_reg_9;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_20, zero_8x16b;
+ __m128i *pi4_dst = (__m128i *)pu1_dst;
+
+
+ //WORD32 row,col;
+ WORD32 add, shift;
+ WORD32 dc_value, quant_out;
+ WORD32 trans_size;
+
+
+
+
+ trans_size = (1 << log2_trans_size);
+
+ quant_out = i2_coeff_value;
+
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+ /*Replicate the DC value within 16 bits in 128 bit register*/
+ m_temp_reg_20 = _mm_set1_epi16(dc_value);
+ zero_8x16b = _mm_setzero_si128();
+
+ if(trans_size == 4)
+ {
+ WORD32 *pi4_dst = (WORD32 *)pu1_dst;
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_0, m_temp_reg_1);
+ m_temp_reg_5 = _mm_unpacklo_epi32(m_temp_reg_2, m_temp_reg_3);
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
+
+ m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+
+ m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
+
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_8);
+ m_temp_reg_1 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_2 = _mm_srli_si128(m_temp_reg_8, 8);
+ m_temp_reg_3 = _mm_srli_si128(m_temp_reg_8, 12);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+ }
+ else
+ {
+ WORD32 i, j;
+
+ for(i = 1; i <= trans_size; i += 4)
+ {
+ for(j = 1; j <= trans_size; j += 8)
+ {
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_0, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_1, zero_8x16b);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_2, zero_8x16b);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_3, zero_8x16b);
+
+ m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+ m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
+ m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
+
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ m_temp_reg_12 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+
+ pi4_dst = (__m128i *)(pu1_dst + dst_strd);
+
+ m_temp_reg_13 = _mm_srli_si128(m_temp_reg_12, 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_13);
+
+ pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
+
+ m_temp_reg_14 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_14);
+
+ pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
+
+ m_temp_reg_15 = _mm_srli_si128(m_temp_reg_14, 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_15);
+
+ pu1_pred += 8;
+ pu1_dst += 8;
+ }
+ pu1_pred += 4 * pred_strd - trans_size;
+ pu1_dst += 4 * dst_strd - trans_size;
+ }
+ }
+
+
+}
+
+void ihevcd_itrans_recon_dc_chroma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
+ WORD32 log2_trans_size, WORD16 i2_coeff_value)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_5;
+ __m128i m_temp_reg_6;
+ __m128i m_temp_reg_7;
+ __m128i m_temp_reg_8;
+ __m128i m_temp_reg_9;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_20, zero_8x16b;
+ __m128i *pi4_dst = (__m128i *)pu1_dst;
+
+
+ //WORD32 row,col;
+ WORD32 add, shift;
+ WORD32 dc_value, quant_out;
+ WORD32 trans_size;
+
+
+ WORD32 shuffle_mask_4x4 = 0x06040200;
+ WORD32 unchanged_mask_4x4 = 0x07050301;
+ LWORD64 shuffle_mask = 0x0E0C0A0806040200LL;
+ LWORD64 unchanged_mask = 0x0F0D0B0907050301LL;
+
+ trans_size = (1 << log2_trans_size);
+
+ quant_out = i2_coeff_value;
+
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+ /*Replicate the DC value within 16 bits in 128 bit register*/
+ m_temp_reg_20 = _mm_set1_epi16(dc_value);
+ zero_8x16b = _mm_setzero_si128();
+
+ if(trans_size == 4)
+ {
+ __m128i chroma_shuffle_mask_16x8b;
+ __m128i chroma_unchanged_mask_16x8b;
+ chroma_shuffle_mask_16x8b = _mm_cvtsi32_si128(shuffle_mask_4x4);
+ chroma_unchanged_mask_16x8b = _mm_cvtsi32_si128(unchanged_mask_4x4);
+
+ /*Load the prediction data*/
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+ m_temp_reg_10 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
+ m_temp_reg_11 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
+ m_temp_reg_12 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
+ m_temp_reg_13 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
+
+ m_temp_reg_14 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_15 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_14, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_15, zero_8x16b);
+
+ m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+
+ /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_dst);
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd));
+
+ m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
+ m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
+ m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
+ m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
+
+
+ m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
+ m_temp_reg_9 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_0);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_10 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_1);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_11 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_2);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_3);
+
+ /*Store the result in the destination*/
+ _mm_storel_epi64(pi4_dst, m_temp_reg_9);
+ pu1_dst += dst_strd;
+ pi4_dst = (__m128i *)(pu1_dst);
+
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_10);
+ pu1_dst += dst_strd;
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_11);
+ pu1_dst += dst_strd;
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+ }
+ else
+ {
+ WORD32 i, j;
+ __m128i chroma_shuffle_mask_16x8b;
+ __m128i chroma_unchanged_mask_16x8b;
+ chroma_shuffle_mask_16x8b = _mm_loadl_epi64((__m128i *)(&shuffle_mask));
+ chroma_unchanged_mask_16x8b =
+ _mm_loadl_epi64((__m128i *)(&unchanged_mask));
+
+ for(i = 0; i < trans_size; i += 4)
+ {
+ for(j = 0; j < trans_size; j += 8)
+ {
+
+ m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_pred);
+ m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_pred + 3 * pred_strd));
+
+ /*Retain only one chroma component*/
+ m_temp_reg_4 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
+ m_temp_reg_5 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
+ m_temp_reg_6 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
+ m_temp_reg_7 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, zero_8x16b);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, zero_8x16b);
+
+ m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+ m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
+ m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
+
+
+ /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/
+ m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_dst);
+ m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_dst + dst_strd));
+ m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_dst + 2 * dst_strd));
+ m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_dst + 3 * dst_strd));
+
+ m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
+ m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
+ m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
+ m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
+
+ m_temp_reg_4 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
+ m_temp_reg_5 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
+
+ m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_0);
+ m_temp_reg_4 = _mm_srli_si128(m_temp_reg_4, 8);
+ m_temp_reg_13 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_1);
+
+ m_temp_reg_14 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_2);
+ m_temp_reg_5 = _mm_srli_si128(m_temp_reg_5, 8);
+ m_temp_reg_15 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_3);
+
+ /*Store the result in the destination*/
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_12, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_8);
+
+ pi4_dst = (__m128i *)(pu1_dst + dst_strd);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_13);
+ m_temp_reg_9 = _mm_srli_si128(m_temp_reg_13, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + dst_strd + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_9);
+
+ pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_14);
+ m_temp_reg_10 = _mm_srli_si128(m_temp_reg_14, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_10);
+
+ pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_15);
+ m_temp_reg_11 = _mm_srli_si128(m_temp_reg_15, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_11);
+
+ pu1_pred += 16;
+ pu1_dst += 16;
+ }
+
+ pu1_pred += 4 * pred_strd - 2 * trans_size;
+ pu1_dst += 4 * dst_strd - 2 * trans_size;
+ }
+ }
+
+
+}