author     Naveen Kumar Ponnusamy <naveenkumar.p@ittiam.com>   2015-05-21 15:27:10 +0530
committer  Marco Nelissen <marcone@google.com>                 2015-06-25 08:25:42 -0700
commit     796e3c8de825c078c77b9fd83abca8c7f79d1127 (patch)
tree       182fc97ccdeefe8b11e0a0f694a6d82fe5f3f8eb /common
parent     cf91c87b25ad49fc7e307932754d188e3ba2a479 (diff)
download   android_external_libavc-796e3c8de825c078c77b9fd83abca8c7f79d1127.tar.gz
           android_external_libavc-796e3c8de825c078c77b9fd83abca8c7f79d1127.tar.bz2
           android_external_libavc-796e3c8de825c078c77b9fd83abca8c7f79d1127.zip
SSSE3/SSE4 Intrinsics Optimizations
Change-Id: I55c74d9b2a0b323886d8a80eaad899b187c22cd7
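The bulk of this change swaps `_mm_maskmoveu_si128` (the MASKMOVDQU instruction: a byte-masked, cache-bypassing store) for a plain 32-bit store of the register's low lane obtained with `_mm_cvtsi128_si32`, removing both the mask setup and the non-temporal store on the 4-pixel-wide paths. The following is a minimal standalone sketch of the before/after pattern; the buffers and pixel values are invented for illustration and are not code from this repository:

```c
#include <emmintrin.h> /* SSE2 covers every intrinsic used here */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t dst_old[16] = {0}, dst_new[16] = {0};
    __m128i row = _mm_cvtsi32_si128(0x04030201); /* low 4 bytes: 1,2,3,4 */

    /* before: byte-masked store of the low four bytes (MASKMOVDQU) */
    __m128i mask_low32b = _mm_srli_si128(_mm_set1_epi8((char)0xff), 12);
    _mm_maskmoveu_si128(row, mask_low32b, (char *)dst_old);

    /* after: move the low 32 bits to a GPR and store them normally,
       mirroring the WORD32-pointer stores in the patch */
    *((int32_t *)dst_new) = _mm_cvtsi128_si32(row);

    printf("%d %d\n", dst_old[3], dst_new[3]); /* prints: 4 4 */
    return 0;
}
```

Since MASKMOVDQU writes around the cache, an ordinary store is a better fit for small prediction/reconstruction blocks that are read back almost immediately, and it needs no mask register at all.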
Diffstat (limited to 'common')
-rw-r--r--  common/x86/ih264_chroma_intra_pred_filters_ssse3.c |  36
-rw-r--r--  common/x86/ih264_inter_pred_filters_ssse3.c        | 148
-rw-r--r--  common/x86/ih264_iquant_itrans_recon_dc_ssse3.c    |  31
-rw-r--r--  common/x86/ih264_iquant_itrans_recon_sse42.c       |  31
-rw-r--r--  common/x86/ih264_iquant_itrans_recon_ssse3.c       |   4
-rw-r--r--  common/x86/ih264_luma_intra_pred_filters_ssse3.c   | 288
-rw-r--r--  common/x86/ih264_padding_ssse3.c                   |  26
-rw-r--r--  common/x86/ih264_weighted_pred_sse42.c             |  84
8 files changed, 223 insertions, 425 deletions
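The two `iquant_itrans_recon_chroma` files below change a second pattern: instead of masked-storing only the U (even) bytes of an interleaved UV row, the new code loads the 8-byte row, ANDs it with `0xFF00` per 16-bit lane to keep the V bytes, adds the new values sitting in the even byte positions, and stores the whole row back. A standalone sketch of that read-modify-write under the same interleaving assumption (all values invented for the demo):

```c
#include <emmintrin.h> /* SSE2 covers every intrinsic used here */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* one interleaved chroma row: U V U V U V U V; only U changes */
    uint8_t out[8] = {9, 90, 9, 91, 9, 92, 9, 93};
    /* new U values, one per 16-bit lane, i.e. u 0 u 0 ... as bytes */
    __m128i pred = _mm_set_epi16(0, 0, 0, 0, 40, 30, 20, 10);

    __m128i row  = _mm_loadl_epi64((__m128i *)out);
    __m128i keep = _mm_set1_epi16((short)0xFF00); /* keep V bytes */

    row = _mm_and_si128(row, keep); /* zero the U bytes, keep V   */
    row = _mm_add_epi8(row, pred);  /* drop the new U values in   */
    _mm_storel_epi64((__m128i *)out, row);

    printf("U: %d %d %d %d  V: %d %d %d %d\n",
           out[0], out[2], out[4], out[6],  /* 10 20 30 40 */
           out[1], out[3], out[5], out[7]); /* 90 91 92 93 */
    return 0;
}
```

The rewrite touches the V bytes only to store them back unchanged, which is safe within a single thread and avoids MASKMOVDQU entirely.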
diff --git a/common/x86/ih264_chroma_intra_pred_filters_ssse3.c b/common/x86/ih264_chroma_intra_pred_filters_ssse3.c
index 45101a4..d43ce20 100644
--- a/common/x86/ih264_chroma_intra_pred_filters_ssse3.c
+++ b/common/x86/ih264_chroma_intra_pred_filters_ssse3.c
@@ -103,47 +103,35 @@ void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
     UWORD8 *pu1_left; /* Pointer to start of top predictors */
     WORD32 dst_strd2;
 
-    __m128i left_16x8b, left_sh_16x8b;
     __m128i row1_16x8b, row2_16x8b;
-    __m128i const_14_15_16x8b;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
     pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;
 
-    left_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 14));
-
-    const_14_15_16x8b = _mm_set1_epi16(0x0f0e);
     dst_strd2 = dst_strd << 1;
-    left_sh_16x8b = _mm_slli_si128(left_16x8b, 2);
 
-    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
-    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left)));
+    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 2)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
 
-    left_16x8b = _mm_slli_si128(left_16x8b, 4);
-    left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
     pu1_dst += dst_strd2;
 
-    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
-    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 4)));
+    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 6)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
 
-    left_16x8b = _mm_slli_si128(left_16x8b, 4);
-    left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
     pu1_dst += dst_strd2;
 
-    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
-    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 8)));
+    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 10)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
 
-    left_16x8b = _mm_slli_si128(left_16x8b, 4);
-    left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
     pu1_dst += dst_strd2;
 
-    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
-    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 12)));
+    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 14)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
 }
@@ -273,7 +261,6 @@ void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
     //calculating a, b and c
     {
         WORD32 h_u, h_v, v_u, v_v;
-        WORD32 temp1, temp2;
 
         __m128i h_val1_16x8b, h_val2_16x8b;
         __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
@@ -302,13 +289,10 @@
         h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
         v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);
 
-        temp1 = _mm_extract_epi16(h_val1_16x8b, 3);
-        temp2 = _mm_extract_epi16(v_val1_16x8b, 3);
-
         hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);
 
-        a_u = ((temp1 & 0xff) + (temp2 & 0xff)) << 4;
-        a_v = ((temp1 >> 8) + (temp2 >> 8)) << 4;
+        a_u = (pu1_left[7 * (-2)] + pu1_top[14]) << 4;
+        a_v = (pu1_left[7 * (-2) + 1] + pu1_top[15]) << 4;
 
         h_u = _mm_extract_epi16(hv_val_4x32b, 0);
         h_v = _mm_extract_epi16(hv_val_4x32b, 2);
diff --git a/common/x86/ih264_inter_pred_filters_ssse3.c b/common/x86/ih264_inter_pred_filters_ssse3.c
index 6d318c9..c8537da 100644
--- a/common/x86/ih264_inter_pred_filters_ssse3.c
+++ b/common/x86/ih264_inter_pred_filters_ssse3.c
@@ -111,23 +111,12 @@ void ih264_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
 
     if(wd == 4)
     {
-        __m128i mask_full_128b, mask_low_32b;
-
-        mask_full_128b = _mm_set1_epi8(0xff);
-        mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-        // mask for first four bytes
-
         do
         {
-            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
-            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
-            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2));
-            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3));
-
-            _mm_maskmoveu_si128(y_0_16x8b, mask_low_32b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y_1_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-            _mm_maskmoveu_si128(y_2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-            _mm_maskmoveu_si128(y_3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+            *((WORD32 *)(pu1_dst)) = *((WORD32 *)(pu1_src));
+            *((WORD32 *)(pu1_dst + dst_strd)) = *((WORD32 *)(pu1_src + src_strd));
+            *((WORD32 *)(pu1_dst + dst_strd2)) = *((WORD32 *)(pu1_src + src_strd2));
+            *((WORD32 *)(pu1_dst + dst_strd3)) = *((WORD32 *)(pu1_src + src_strd3));
 
             ht -= 4;
             pu1_src += src_strd4;
@@ -255,11 +244,6 @@ void ih264_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
     __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
     __m128i res_r0r1_16x8b;
 
-    __m128i mask_full_16x8b, mask_low32b;
-
-    mask_full_16x8b = _mm_set1_epi8(0xff);
-    mask_low32b = _mm_srli_si128(mask_full_16x8b, 12); // mask for first four bytes
-
     //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
     //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
 
@@ -307,9 +291,9 @@
             res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);
 
-            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
             res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
-            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
 
             ht -= 2;
             pu1_src += src_strd << 1;
@@ -525,10 +509,6 @@ void ih264_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
 
     if(wd == 4)
     {
-        __m128i mask_low32b;
-
-        mask_low32b = _mm_set1_epi8(0xff);
-
         //Epilogue: Load all the pred rows except sixth and seventh row
         //          for the first and second row processing.
         src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
@@ -542,8 +522,6 @@
         src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
         pu1_src += src_strd;
 
-        mask_low32b = _mm_srli_si128(mask_low32b, 12); // mask for first four bytes
-
         src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
         src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
         src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
@@ -572,9 +550,9 @@
             res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
             res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
 
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
             res_16x8b = _mm_srli_si128(res_16x8b, 4);
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
 
             src_r0_16x8b = src_r2_16x8b;
             src_r1_16x8b = src_r3_16x8b;
@@ -893,15 +871,12 @@ void ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3(UWORD8 *pu1_src,
         __m128i res_8x16b, res_16x8b;
 
         __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
-        __m128i const_val512_4x32b, mask_low32b;
-
-        mask_low32b = _mm_set1_epi8(0xff);
+        __m128i const_val512_4x32b;
 
         coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
         coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
         coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
 
-        mask_low32b = _mm_srli_si128(mask_low32b, 12);
         const_val512_4x32b = _mm_set1_epi32(512);
 
         src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp));
@@ -947,9 +922,9 @@
            res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
            res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
 
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
             res_16x8b = _mm_srli_si128(res_16x8b, 4);
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
 
             src_r0_8x16b = src_r2_8x16b;
             src_r1_8x16b = src_r3_8x16b;
@@ -1551,11 +1526,6 @@ void ih264_inter_pred_luma_horz_qpel_ssse3(UWORD8 *pu1_src,
     __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
     __m128i res_r0r1_16x8b;
 
-    __m128i mask_full_16x8b, mask_low32b;
-
-    mask_full_16x8b = _mm_set1_epi8(0xff);
-    mask_low32b = _mm_srli_si128(mask_full_16x8b, 12); // mask for first four bytes
-
    //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
    //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
 
@@ -1607,9 +1577,9 @@
             res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);
             res_r0r1_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_r0r1_16x8b); //computing q-pel
 
-            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
             res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
-            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
 
             ht -= 2;
             pu1_src += src_strd << 1;
@@ -1849,10 +1819,6 @@ void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
 
     if(wd == 4)
     {
-        __m128i mask_low32b;
-
-        mask_low32b = _mm_set1_epi8(0xff);
-
         //Epilogue: Load all the pred rows except sixth and seventh row
         //          for the first and second row processing.
         src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
@@ -1866,8 +1832,6 @@
         src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
         pu1_src += src_strd;
 
-        mask_low32b = _mm_srli_si128(mask_low32b, 12); // mask for first four bytes
-
         src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
         src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
         src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
@@ -1904,9 +1868,9 @@
             res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
 
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
             res_16x8b = _mm_srli_si128(res_16x8b, 4);
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
 
             src_r0_16x8b = src_r2_16x8b;
             src_r1_16x8b = src_r3_16x8b;
@@ -2257,11 +2221,6 @@ void ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3(UWORD8 *pu1_src,
     __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
     __m128i res_r0r1_16x8b;
 
-    __m128i mask_low32b;
-
-    mask_low32b = _mm_set1_epi8(0xff);
-    mask_low32b = _mm_srli_si128(mask_low32b, 12);
-
     //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
     //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
 
@@ -2313,9 +2272,9 @@
             res_r0r1_16x8b = _mm_avg_epu8(res_r0r1_16x8b,src_r0r1_vpel_16x8b);
 
-            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
             res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
-            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
 
             ht -= 2;
             pu1_pred_horiz += src_strd << 1;
@@ -2852,16 +2811,11 @@ void ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3(UWORD8 *pu1_src,
         __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
         __m128i const_val512_4x32b, const_val16_8x16b;
 
-        __m128i mask_low32b;
-
-        mask_low32b = _mm_set1_epi8(0xff);
 
         coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
         coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
         coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
 
-        mask_low32b = _mm_srli_si128(mask_low32b, 12);
-
         const_val512_4x32b = _mm_set1_epi32(512);
         const_val16_8x16b = _mm_set1_epi16(16);
 
@@ -2897,7 +2851,7 @@
             res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
 
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
 
             ht--;
             pi2_temp2 = pi2_temp2 + 4 + 5;
@@ -3424,12 +3378,9 @@ void ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3(UWORD8 *pu1_src,
         __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
         __m128i const_val512_4x32b, const_val16_8x16b;
 
-        __m128i mask_low32b;
 
-        mask_low32b = _mm_set1_epi8(0xff);
         const_val512_4x32b = _mm_set1_epi32(512);
         const_val16_8x16b = _mm_set1_epi16(16);
-        mask_low32b = _mm_srli_si128(mask_low32b, 12);
 
         coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
         coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
@@ -3483,9 +3434,9 @@
             res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
 
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
             res_16x8b = _mm_srli_si128(res_16x8b, 4);
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
 
             src_r0_8x16b = src_r2_8x16b;
             src_r1_8x16b = src_r3_8x16b;
@@ -4106,65 +4057,6 @@ void ih264_inter_pred_chroma_ssse3(UWORD8 *pu1_src,
         }
         while(ht > 0);
 
-        /*
-        WORD32 AB, CD;
-
-        __m128i src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
-        __m128i src_r1r2_16x8b, src_r2r3_16x8b;
-        __m128i res_AB_8x16b, res_CD_8x16b, res_8x16b, res_16x8b;
-        __m128i mask_low32b;
-
-        __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
-        __m128i const_shuff_16x8b;
-
-        AB = (B << 8) + A;
-        CD = (D << 8) + C;
-
-        coeffAB_16x8b = _mm_set1_epi16(AB);
-        coeffCD_16x8b = _mm_set1_epi16(CD);
-
-        round_add32_8x16b = _mm_set1_epi16(32);
-
-        mask_low32b = _mm_set1_epi8(0xff);
-        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); //u1[0] v1[0] u1[1] v1[1] u1[2] v1[2] u1[3] v1[3]
-        pu1_src += src_strd;
-
-        const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x0b090a08, 0x0d0b0c0a);
-        mask_low32b = _mm_srli_si128(mask_low32b, 12);
-
-        do
-        {
-            src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); //u2[0] v2[0] u2[1] v2[1] u1[2] v2[2] u2[3] v2[3]
-            src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); //u3[0] v3[0] u3[1] v3[1] u3[2] v3[2] u3[3] v3[3]
-
-            src_r1r2_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
-            src_r2r3_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
-
-            src_r1r2_16x8b = _mm_shuffle_epi8(src_r1r2_16x8b, const_shuff_16x8b); //u1[0] u1[1] v1[0] v1[1] u1[1] u1[2] v1[1] v1[2]
-                                                                                  //u2[0] u2[1] v2[0] v2[1] u2[1] u2[2] v2[1] v2[2]
-            src_r2r3_16x8b = _mm_shuffle_epi8(src_r2r3_16x8b, const_shuff_16x8b); //u2[0] u2[1] v2[0] v2[1] u2[1] u2[2] v2[1] v2[2]
-                                                                                  //u3[0] u3[1] v3[0] v3[1] u3[1] u3[2] v3[1] v3[2]
-            res_AB_8x16b = _mm_maddubs_epi16(src_r1r2_16x8b, coeffAB_16x8b);
-            res_CD_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeffCD_16x8b);
-
-            res_8x16b = _mm_add_epi16(res_AB_8x16b, round_add32_8x16b);
-            res_8x16b = _mm_add_epi16(res_8x16b, res_CD_8x16b);
-            res_8x16b = _mm_srai_epi16(res_8x16b, 6);
-            res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
-
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)pu1_dst);
-
-            ht -= 2;
-            pu1_src += src_strd << 1;
-            res_16x8b = _mm_srli_si128(res_16x8b, 4);
-            src_r1_16x8b = src_r3_16x8b;
-
-            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst + dst_strd));
-
-            pu1_dst += dst_strd << 1;
-        }
-        while(ht > 0);
-        */
     }
     else if(wd == 4)
     {
diff --git a/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
index 565cc75..83a23ac 100644
--- a/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
+++ b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
@@ -30,8 +30,8 @@
  *  Mohit [100664]
  *
  * @par List of Functions:
- *  - ihevc_iquant_itrans_recon_4x4_dc_ssse3()
- *  - ihevc_iquant_itrans_recon_8x8_dc_ssse3()
+ *  - ih264_iquant_itrans_recon_4x4_dc_ssse3()
+ *  - ih264_iquant_itrans_recon_8x8_dc_ssse3()
  *
  * @remarks
  *  None
@@ -397,6 +397,7 @@ void ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3(WORD16 *pi2_src,
     __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
     __m128i chroma_mask = _mm_set1_epi16 (0xFF);
     __m128i value_add = _mm_set1_epi16(i_macro);
+    __m128i out_r0, out_r1, out_r2, out_r3;
 
     UNUSED (pi2_src);
     UNUSED (pu2_iscal_mat);
@@ -438,12 +439,26 @@
     pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b); //p20 p21 p22 p23 -- all 16 bits
     pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b); //p30 p31 p32 p33 -- all 16 bits
 
-    chroma_mask = _mm_unpacklo_epi64(chroma_mask, zero_8x16b); //1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 -- 8 bits
-
-    _mm_maskmoveu_si128(pred_r0, chroma_mask, (char *)(&pu1_out[0]));
-    _mm_maskmoveu_si128(pred_r1, chroma_mask, (char *)(&pu1_out[out_strd]));
-    _mm_maskmoveu_si128(pred_r2, chroma_mask, (char *)(&pu1_out[2*out_strd]));
-    _mm_maskmoveu_si128(pred_r3, chroma_mask, (char *)(&pu1_out[3*out_strd]));
+    chroma_mask = _mm_set1_epi16 (0xFF00);
+    out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
+    out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[out_strd]));
+    out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * out_strd]));
+    out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * out_strd]));
+
+    out_r0 = _mm_and_si128(out_r0, chroma_mask);
+    out_r1 = _mm_and_si128(out_r1, chroma_mask);
+    out_r2 = _mm_and_si128(out_r2, chroma_mask);
+    out_r3 = _mm_and_si128(out_r3, chroma_mask);
+
+    out_r0 = _mm_add_epi8(out_r0, pred_r0);
+    out_r1 = _mm_add_epi8(out_r1, pred_r1);
+    out_r2 = _mm_add_epi8(out_r2, pred_r2);
+    out_r3 = _mm_add_epi8(out_r3, pred_r3);
+
+    _mm_storel_epi64((__m128i *)(&pu1_out[0]), out_r0);
+    _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), out_r1);
+    _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), out_r2);
+    _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), out_r3);
 }
diff --git a/common/x86/ih264_iquant_itrans_recon_sse42.c b/common/x86/ih264_iquant_itrans_recon_sse42.c
index 6399b65..f27111f 100644
--- a/common/x86/ih264_iquant_itrans_recon_sse42.c
+++ b/common/x86/ih264_iquant_itrans_recon_sse42.c
@@ -30,8 +30,8 @@
  *  Mohit [100664]
  *
  * @par List of Functions:
- *  - ihevc_iquant_itrans_recon_4x4_sse42()
- *  - ihevc_iquant_itrans_recon_chroma_4x4_sse42()
+ *  - ih264_iquant_itrans_recon_4x4_sse42()
+ *  - ih264_iquant_itrans_recon_chroma_4x4_sse42()
 *
 * @remarks
 *  None
@@ -370,6 +370,7 @@ void ih264_iquant_itrans_recon_chroma_4x4_sse42(WORD16 *pi2_src,
     __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
     __m128i value_32 = _mm_set1_epi32(32);
     __m128i chroma_mask = _mm_set1_epi16 (0xFF);
+    __m128i out_r0, out_r1, out_r2, out_r3;
 
     UNUSED (pi2_tmp);
 
     /*************************************************************/
@@ -548,10 +549,24 @@
     resq_r2 = _mm_cvtepu8_epi16(resq_r2); //p20 p21 p22 p23 -- all 16 bits
     resq_r3 = _mm_cvtepu8_epi16(resq_r3); //p30 p31 p32 p33 -- all 16 bits
 
-    chroma_mask = _mm_unpacklo_epi64(chroma_mask, zero_8x16b);
-
-    _mm_maskmoveu_si128(resq_r0, chroma_mask, (char *)(&pu1_out[0]));
-    _mm_maskmoveu_si128(resq_r1, chroma_mask, (char *)(&pu1_out[out_strd]));
-    _mm_maskmoveu_si128(resq_r2, chroma_mask, (char *)(&pu1_out[2*out_strd]));
-    _mm_maskmoveu_si128(resq_r3, chroma_mask, (char *)(&pu1_out[3*out_strd]));
+    chroma_mask = _mm_set1_epi16 (0xFF00);
+    out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
+    out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[out_strd]));
+    out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * out_strd]));
+    out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * out_strd]));
+
+    out_r0 = _mm_and_si128(out_r0, chroma_mask);
+    out_r1 = _mm_and_si128(out_r1, chroma_mask);
+    out_r2 = _mm_and_si128(out_r2, chroma_mask);
+    out_r3 = _mm_and_si128(out_r3, chroma_mask);
+
+    out_r0 = _mm_add_epi8(out_r0, resq_r0);
+    out_r1 = _mm_add_epi8(out_r1, resq_r1);
+    out_r2 = _mm_add_epi8(out_r2, resq_r2);
+    out_r3 = _mm_add_epi8(out_r3, resq_r3);
+
+    _mm_storel_epi64((__m128i *)(&pu1_out[0]), out_r0);
+    _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), out_r1);
+    _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), out_r2);
+    _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), out_r3);
 }
diff --git a/common/x86/ih264_iquant_itrans_recon_ssse3.c b/common/x86/ih264_iquant_itrans_recon_ssse3.c
index 388cafe..30f7e59 100644
--- a/common/x86/ih264_iquant_itrans_recon_ssse3.c
+++ b/common/x86/ih264_iquant_itrans_recon_ssse3.c
@@ -30,8 +30,8 @@
  *  Mohit [100664]
  *
  * @par List of Functions:
- *  - ihevc_iquant_itrans_recon_4x4_ssse3()
- *  - ihevc_iquant_itrans_recon_8x8_ssse3()
+ *  - ih264_iquant_itrans_recon_4x4_ssse3()
+ *  - ih264_iquant_itrans_recon_8x8_ssse3()
 *
 * @remarks
 *  None
diff --git a/common/x86/ih264_luma_intra_pred_filters_ssse3.c b/common/x86/ih264_luma_intra_pred_filters_ssse3.c
index 5a35372..a1721d5 100644
--- a/common/x86/ih264_luma_intra_pred_filters_ssse3.c
+++ b/common/x86/ih264_luma_intra_pred_filters_ssse3.c
@@ -122,28 +122,22 @@ void ih264_intra_pred_luma_4x4_mode_vert_ssse3(UWORD8 *pu1_src,
 {
     UWORD8 *pu1_top;
     WORD32 dst_strd2, dst_strd3;
-
-    __m128i top_16x8b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 i4_top;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
-
     pu1_top = pu1_src + BLK_SIZE + 1;
 
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
-    top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top);
+    i4_top = *((WORD32 *)pu1_top);
 
     dst_strd2 = dst_strd << 1;
     dst_strd3 = dst_strd + dst_strd2;
 
-    _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-    _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+    *((WORD32 *)(pu1_dst)) = i4_top;
+    *((WORD32 *)(pu1_dst + dst_strd)) = i4_top;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = i4_top;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = i4_top;
 }
 
 /**
@@ -185,39 +179,31 @@ void ih264_intra_pred_luma_4x4_mode_horz_ssse3(UWORD8 *pu1_src,
                                                WORD32 dst_strd,
                                                WORD32 ngbr_avail)
 {
-    UWORD8 *pu1_left; /* Pointer to start of left predictors */
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    WORD32 row1,row2,row3,row4;
+    UWORD8 val;
     WORD32 dst_strd2, dst_strd3;
-    WORD32 val1, val2;
-
-    __m128i left_16x8b;
-    __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
-    __m128i mask_full_128b, mask_low_32b;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
-
-    mask_full_128b = _mm_set1_epi8(0xff);
-
     pu1_left = pu1_src + BLK_SIZE - 1;
 
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-    left_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3));
-
-    val1 = _mm_extract_epi16(left_16x8b, 1);
-    val2 = _mm_extract_epi16(left_16x8b, 0);
-
-    row1_16x8b = _mm_set1_epi8(val1 >> 8);
-    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
-    row3_16x8b = _mm_set1_epi8(val2 >> 8);
-    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+    val = *pu1_left;
+    row1 = val + (val << 8) + (val << 16) + (val << 24);
+    val = *(pu1_left - 1);
+    row2 = val + (val << 8) + (val << 16) + (val << 24);
+    val = *(pu1_left - 2);
+    row3 = val + (val << 8) + (val << 16) + (val << 24);
+    val = *(pu1_left - 3);
+    row4 = val + (val << 8) + (val << 16) + (val << 24);
 
     dst_strd2 = dst_strd << 1;
     dst_strd3 = dst_strd + dst_strd2;
 
-    _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-    _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /**
@@ -259,72 +245,43 @@ void ih264_intra_pred_luma_4x4_mode_dc_ssse3(UWORD8 *pu1_src,
                                              WORD32 ngbr_avail)
 {
     UWORD8 u1_useleft; /* availability of left predictors (only for DC) */
-    UWORD8 u1_usetop; /* availability of top predictors (only for DC) */
-    UWORD8 *pu1_left, *pu1_top;
-    WORD32 dc_val, flag;
+    UWORD8 u1_usetop; /* availability of top predictors (only for DC) */
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
     WORD32 dst_strd2, dst_strd3;
-
-    __m128i mask_full_128b, mask_low_32b;
-    __m128i dcval_16x8b;
-
+    WORD32 val = 0;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
-
-    mask_full_128b = _mm_set1_epi8(0xff);
-
     u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
     u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
-
-    pu1_left = pu1_src + BLK_SIZE - 1;
     pu1_top = pu1_src + BLK_SIZE + 1;
+    pu1_left = pu1_src + BLK_SIZE - 1;
 
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
-    flag = u1_useleft + u1_usetop;
-
-    if(flag)
+    if(u1_useleft)
     {
-        WORD32 shft, ofst = 0;
-
-        __m128i left_16x8b, top_16x8b, val_16x8b, tmp_8x16b, zero_vector;
-
-        if(u1_useleft)
-        {
-            left_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3));
-            ofst += 2;
-        }
-        else
-            left_16x8b = _mm_setzero_si128();
-
-        zero_vector = _mm_setzero_si128();
-
-        if(u1_usetop)
-        {
-            top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top);
-            ofst += 2;
-        }
-        else
-            top_16x8b = _mm_setzero_si128();
-
-        shft = flag + 1;
-        val_16x8b = _mm_unpacklo_epi32(left_16x8b, top_16x8b);
-        tmp_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
-
-        dc_val = _mm_extract_epi16(tmp_8x16b, 0);
-        dc_val = (dc_val + ofst) >> shft;
+        val += *pu1_left--;
+        val += *pu1_left--;
+        val += *pu1_left--;
+        val += *pu1_left + 2;
     }
-    else
-        dc_val = 128;
+    if(u1_usetop)
+    {
+        val += *pu1_top + *(pu1_top + 1) + *(pu1_top + 2) + *(pu1_top + 3)
+               + 2;
+    }
+    /* Since 2 is added if either left/top pred is there,
+       val still being zero implies both preds are not there */
+    val = (val) ? (val >> (1 + u1_useleft + u1_usetop)) : 128;
+
+    val = val + (val << 8) + (val << 16) + (val << 24);
 
     dst_strd2 = dst_strd << 1;
     dst_strd3 = dst_strd + dst_strd2;
 
-    dcval_16x8b = _mm_set1_epi8(dc_val);
-
-    _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-    _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+    *((WORD32 *)(pu1_dst)) = val;
+    *((WORD32 *)(pu1_dst + dst_strd)) = val;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = val;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = val;
 }
 
 /**
@@ -371,7 +328,7 @@ void ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3(UWORD8 *pu1_src,
     __m128i top_16x8b, top_8x16b, top_sh_8x16b;
     __m128i res1_8x16b, res2_8x16b, res_16x8b;
     __m128i zero_vector, const_2_8x16b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
@@ -382,13 +339,11 @@
     zero_vector = _mm_setzero_si128();
     top_8x16b = _mm_unpacklo_epi8(top_16x8b, zero_vector); //t0 t1 t2 t3 t4 t5 t6 t7
 
-    mask_full_128b = _mm_set1_epi8(0xff);
     top_sh_8x16b = _mm_srli_si128(top_8x16b, 2); //t1 t2 t3 t4 t5 t6 t7 0
 
     const_2_8x16b = _mm_set1_epi16(2);
     top_sh_8x16b = _mm_shufflehi_epi16(top_sh_8x16b, 0xa4); //t1 t2 t3 t4 t5 t6 t7 t7
 
     res1_8x16b = _mm_add_epi16(top_8x16b, top_sh_8x16b);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
     res2_8x16b = _mm_srli_si128(res1_8x16b, 2);
 
     res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b);
@@ -399,13 +354,18 @@
     dst_strd3 = dst_strd + dst_strd2;
 
     res_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
-    _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)pu1_dst);
+    row1 = _mm_cvtsi128_si32(res_16x8b);
     res_16x8b = _mm_srli_si128(res_16x8b, 1);
-    _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+    row2 = _mm_cvtsi128_si32(res_16x8b);
     res_16x8b = _mm_srli_si128(res_16x8b, 1);
-    _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
+    row3 = _mm_cvtsi128_si32(res_16x8b);
     res_16x8b = _mm_srli_si128(res_16x8b, 1);
-    _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+    row4 = _mm_cvtsi128_si32(res_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /**
@@ -454,7 +414,7 @@ void ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3(UWORD8 *pu1_src,
     __m128i res1_8x16b, res2_8x16b;
     __m128i res1_16x8b, res2_16x8b;
     __m128i zero_vector, const_2_8x16b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
@@ -468,13 +428,11 @@
     top_left_8x16b = _mm_unpacklo_epi8(top_left_16x8b, zero_vector);
     top_left_sh_8x16b = _mm_unpacklo_epi8(top_left_sh_16x8b, zero_vector);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
     res1_8x16b = _mm_add_epi16(top_left_8x16b, top_left_sh_8x16b); //l3+l2 l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3...
 
     const_2_8x16b = _mm_set1_epi16(2);
     res2_8x16b = _mm_srli_si128(res1_8x16b, 2); //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3...
 
     res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
     res1_8x16b = _mm_add_epi16(res2_8x16b, res1_8x16b); //l3+2*l2+l1+2 l2+2*l1+l0+2...
 
     res1_8x16b = _mm_srai_epi16(res1_8x16b, 2);
     res1_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
@@ -483,12 +441,18 @@
     dst_strd3 = dst_strd + dst_strd2;
 
     res2_16x8b = _mm_srli_si128(res1_16x8b, 3);
-    _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)pu1_dst);
+
+    row1 = _mm_cvtsi128_si32(res2_16x8b);
     res2_16x8b = _mm_srli_si128(res1_16x8b, 2);
-    _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+    row2 = _mm_cvtsi128_si32(res2_16x8b);
     res2_16x8b = _mm_srli_si128(res1_16x8b, 1);
-    _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-    _mm_maskmoveu_si128(res1_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+    row3 = _mm_cvtsi128_si32(res2_16x8b);
+    row4 = _mm_cvtsi128_si32(res1_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /**
@@ -537,14 +501,11 @@ void ih264_intra_pred_luma_4x4_mode_vert_r_ssse3(UWORD8 *pu1_src,
     __m128i w121_a1_8x16b, w121_a2_8x16b, w121_sh_8x16b;
     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
     __m128i zero_vector, const_2_8x16b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
     pu1_left = pu1_src + BLK_SIZE - 1;
 
     val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 2));
@@ -575,10 +536,15 @@
     dst_strd2 = dst_strd << 1;
     dst_strd3 = dst_strd + dst_strd2;
 
-    _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-    _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+    row1 = _mm_cvtsi128_si32(row1_16x8b);
+    row2 = _mm_cvtsi128_si32(row2_16x8b);
+    row3 = _mm_cvtsi128_si32(row3_16x8b);
+    row4 = _mm_cvtsi128_si32(row4_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /*
@@ -629,14 +595,11 @@ void ih264_intra_pred_luma_4x4_mode_horz_d_ssse3(UWORD8 *pu1_src,
     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
     __m128i zero_vector, const_2_8x16b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
     pu1_left = pu1_src + BLK_SIZE - 1;
 
     val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3));
@@ -669,10 +632,15 @@
     row2_16x8b = _mm_srli_si128(row4_16x8b, 4);
     row3_16x8b = _mm_srli_si128(row4_16x8b, 2);
 
-    _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-    _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+    row1 = _mm_cvtsi128_si32(row1_16x8b);
+    row2 = _mm_cvtsi128_si32(row2_16x8b);
+    row3 = _mm_cvtsi128_si32(row3_16x8b);
+    row4 = _mm_cvtsi128_si32(row4_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /**
@@ -721,14 +689,11 @@ void ih264_intra_pred_luma_4x4_mode_vert_l_ssse3(UWORD8 *pu1_src,
     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
     __m128i zero_vector, const_2_8x16b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
     pu1_top = pu1_src +BLK_SIZE + 1;
 
     val_16x8b = _mm_loadl_epi64((__m128i *)pu1_top);
@@ -756,10 +721,15 @@
     row3_16x8b = _mm_srli_si128(row1_16x8b, 1);
     row4_16x8b = _mm_srli_si128(row2_16x8b, 1);
 
-    _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-    _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+    row1 = _mm_cvtsi128_si32(row1_16x8b);
+    row2 = _mm_cvtsi128_si32(row2_16x8b);
+    row3 = _mm_cvtsi128_si32(row3_16x8b);
+    row4 = _mm_cvtsi128_si32(row4_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /**
@@ -809,14 +779,11 @@ void ih264_intra_pred_luma_4x4_mode_horz_u_ssse3(UWORD8 *pu1_src,
     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
     __m128i zero_vector, const_2_8x16b, rev_16x8b;
-    __m128i mask_full_128b, mask_low_32b;
+    WORD32 row1,row2,row3,row4;
 
     UNUSED(src_strd);
     UNUSED(ngbr_avail);
 
-    mask_full_128b = _mm_set1_epi8(0xff);
-    mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
-
     pu1_left = pu1_src + BLK_SIZE - 1;
 
     zero_vector = _mm_setzero_si128();
@@ -851,10 +818,15 @@
     row3_16x8b = _mm_srli_si128(row1_16x8b, 4);
     row4_16x8b = _mm_srli_si128(row1_16x8b, 6);
 
-    _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
-    _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
-    _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
-    _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+    row1 = _mm_cvtsi128_si32(row1_16x8b);
+    row2 = _mm_cvtsi128_si32(row2_16x8b);
+    row3 = _mm_cvtsi128_si32(row3_16x8b);
+    row4 = _mm_cvtsi128_si32(row4_16x8b);
+
+    *((WORD32 *)(pu1_dst)) = row1;
+    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
+    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
+    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
 }
 
 /******************* 8x8 Modes *******************/
@@ -1814,9 +1786,7 @@ void ih264_intra_pred_luma_16x16_mode_horz_ssse3(UWORD8 *pu1_src,
 {
     UWORD8 *pu1_left;
     WORD32 dst_strd2, dst_strd3, dst_strd4;
-    WORD32 val1, val2;
 
-    __m128i val_16x8b;
     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
 
     UNUSED(src_strd);
@@ -1826,60 +1796,46 @@
 
     dst_strd4 = dst_strd << 2;
 
-    val_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 15));
-
     dst_strd2 = dst_strd << 1;
     dst_strd3 = dst_strd4 - dst_strd;
 
-    val1 = _mm_extract_epi16(val_16x8b, 7);
-    val2 = _mm_extract_epi16(val_16x8b, 6);
-
-    row1_16x8b = _mm_set1_epi8(val1 >> 8);
-    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
-    row3_16x8b = _mm_set1_epi8(val2 >> 8);
-    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+    row1_16x8b = _mm_set1_epi8(*(pu1_left));
+    row2_16x8b = _mm_set1_epi8(*(pu1_left - 1));
+    row3_16x8b = _mm_set1_epi8(*(pu1_left - 2));
+    row4_16x8b = _mm_set1_epi8(*(pu1_left - 3));
 
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
 
-    val1 = _mm_extract_epi16(val_16x8b, 5);
-    val2 = _mm_extract_epi16(val_16x8b, 4);
-
     pu1_dst += dst_strd4;
 
-    row1_16x8b = _mm_set1_epi8(val1 >> 8);
-    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
-    row3_16x8b = _mm_set1_epi8(val2 >> 8);
-    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+    row1_16x8b = _mm_set1_epi8(*(pu1_left - 4));
+    row2_16x8b = _mm_set1_epi8(*(pu1_left - 5));
+    row3_16x8b = _mm_set1_epi8(*(pu1_left - 6));
+    row4_16x8b = _mm_set1_epi8(*(pu1_left - 7));
 
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
 
-    val1 = _mm_extract_epi16(val_16x8b, 3);
-    val2 = _mm_extract_epi16(val_16x8b, 2);
-
     pu1_dst += dst_strd4;
 
-    row1_16x8b = _mm_set1_epi8(val1 >> 8);
-    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
-    row3_16x8b = _mm_set1_epi8(val2 >> 8);
-    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+    row1_16x8b = _mm_set1_epi8(*(pu1_left - 8));
+    row2_16x8b = _mm_set1_epi8(*(pu1_left - 9));
+    row3_16x8b = _mm_set1_epi8(*(pu1_left - 10));
+    row4_16x8b = _mm_set1_epi8(*(pu1_left - 11));
 
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
 
-    val1 = _mm_extract_epi16(val_16x8b, 1);
-    val2 = _mm_extract_epi16(val_16x8b, 0);
-
     pu1_dst += dst_strd4;
 
-    row1_16x8b = _mm_set1_epi8(val1 >> 8);
-    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
-    row3_16x8b = _mm_set1_epi8(val2 >> 8);
-    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+    row1_16x8b = _mm_set1_epi8(*(pu1_left - 12));
+    row2_16x8b = _mm_set1_epi8(*(pu1_left - 13));
+    row3_16x8b = _mm_set1_epi8(*(pu1_left - 14));
+    row4_16x8b = _mm_set1_epi8(*(pu1_left - 15));
 
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
diff --git a/common/x86/ih264_padding_ssse3.c b/common/x86/ih264_padding_ssse3.c
index 6dadd39..43ded8e 100644
--- a/common/x86/ih264_padding_ssse3.c
+++ b/common/x86/ih264_padding_ssse3.c
@@ -97,9 +97,6 @@ void ih264_pad_left_luma_ssse3(UWORD8 *pu1_src,
     WORD32 row;
     WORD32 i;
     UWORD8 *pu1_dst;
-    __m128i const0_16x8b;
-
-    const0_16x8b = _mm_setzero_si128();
 
     ASSERT(pad_size % 8 == 0);
 
@@ -107,9 +104,8 @@
     {
         __m128i src_temp0_16x8b;
 
-        src_temp0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
         pu1_dst = pu1_src - pad_size;
-        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+        src_temp0_16x8b = _mm_set1_epi8(*pu1_src);
         for(i = 0; i < pad_size; i += 8)
         {
             _mm_storel_epi64((__m128i *)(pu1_dst + i), src_temp0_16x8b);
@@ -168,20 +164,14 @@ void ih264_pad_left_chroma_ssse3(UWORD8 *pu1_src,
     WORD32 row;
     WORD32 col;
     UWORD8 *pu1_dst;
-    __m128i const0_16x8b, const1_16x8b;
-    const0_16x8b = _mm_setzero_si128();
-    const1_16x8b = _mm_set1_epi8(1);
-    const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b);
 
     ASSERT(pad_size % 8 == 0);
 
     for(row = 0; row < ht; row++)
     {
         __m128i src_temp0_16x8b;
 
-        src_temp0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
         pu1_dst = pu1_src - pad_size;
-        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
-
+        src_temp0_16x8b = _mm_set1_epi16(*((UWORD16 *)pu1_src));
         for(col = 0; col < pad_size; col += 8)
         {
             _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
@@ -240,7 +230,6 @@ void ih264_pad_right_luma_ssse3(UWORD8 *pu1_src,
     WORD32 row;
     WORD32 col;
     UWORD8 *pu1_dst;
-    __m128i const0_16x8b;
 
     ASSERT(pad_size % 8 == 0);
 
@@ -248,10 +237,8 @@
     {
         __m128i src_temp0_16x8b;
 
-        src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 1));
-        const0_16x8b = _mm_setzero_si128();
         pu1_dst = pu1_src;
-        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+        src_temp0_16x8b = _mm_set1_epi8(*(pu1_src - 1));
         for(col = 0; col < pad_size; col += 8)
         {
             _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
@@ -310,10 +297,6 @@ void ih264_pad_right_chroma_ssse3(UWORD8 *pu1_src,
     WORD32 row;
     WORD32 col;
     UWORD8 *pu1_dst;
-    __m128i const0_16x8b, const1_16x8b;
-    const0_16x8b = _mm_setzero_si128();
-    const1_16x8b = _mm_set1_epi8(1);
-    const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b);
 
     ASSERT(pad_size % 8 == 0);
 
@@ -321,9 +304,8 @@
     {
         __m128i src_temp0_16x8b;
 
-        src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2));
         pu1_dst = pu1_src;
-        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+        src_temp0_16x8b = _mm_set1_epi16(*((UWORD16 *)(pu1_src - 2)));
         for(col = 0; col < pad_size; col += 8)
         {
             _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
diff --git a/common/x86/ih264_weighted_pred_sse42.c b/common/x86/ih264_weighted_pred_sse42.c
index b1684b7..48f1f54 100644
--- a/common/x86/ih264_weighted_pred_sse42.c
+++ b/common/x86/ih264_weighted_pred_sse42.c
@@ -96,12 +96,6 @@ void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
 
     if(wd == 4)
     {
-        __m128i mask_full_16x8b, mask_ll4B_16x8b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
-        // mask for first four bytes
-
         do
         {
             y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
@@ -121,13 +115,10 @@
             y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
             y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
 
-            _mm_maskmoveu_si128(y0_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y0_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
-            _mm_maskmoveu_si128(y0_2_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + (dst_strd << 1)));
-            _mm_maskmoveu_si128(y0_3_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd * 3));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b);
+            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b);
 
             ht -= 4;
             pu1_src1 += src_strd1 << 2;
@@ -268,12 +259,6 @@ void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
 
     if(wd == 2)
     {
-        __m128i mask_full_16x8b, mask_ll4B_16x8b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
-        // mask for first four bytes
-
         do
         {
             uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
@@ -285,9 +270,8 @@
             uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
             uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
 
-            _mm_maskmoveu_si128(uv0_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(uv0_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b);
 
             ht -= 2;
             pu1_src1 += src_strd1 << 1;
@@ -419,12 +403,6 @@ void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
     {
         __m128i y_0_8x16b, y_2_8x16b;
 
-        __m128i mask_full_16x8b, mask_ll4B_16x8b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
-        // mask for first four bytes
-
         do
         {
             y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
@@ -455,13 +433,10 @@
             y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
             y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);
 
-            _mm_maskmoveu_si128(y_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
-            _mm_maskmoveu_si128(y_2_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + (dst_strd << 1)));
-            _mm_maskmoveu_si128(y_3_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd * 3));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
+            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b);
 
             ht -= 4;
             pu1_src += src_strd << 2;
@@ -660,12 +635,6 @@ void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
     {
         __m128i y_0_8x16b;
 
-        __m128i mask_full_16x8b, mask_ll4B_16x8b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
-        // mask for first four bytes
-
         do
         {
             y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
@@ -686,9 +655,8 @@
             y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
             y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
 
-            _mm_maskmoveu_si128(y_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
 
             ht -= 2;
             pu1_src += src_strd << 1;
@@ -890,12 +858,6 @@ void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
         __m128i y1_0_8x16b, y1_2_8x16b;
         __m128i y2_0_8x16b, y2_2_8x16b;
 
-        __m128i mask_ll4B_16x8b;
-
-        mask_ll4B_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_ll4B_16x8b, 12);
-        // mask for first four bytes
-
         do
         {
             y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
@@ -942,13 +904,11 @@
             y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
             y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);
 
-            _mm_maskmoveu_si128(y1_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y1_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
-            _mm_maskmoveu_si128(y1_2_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + (dst_strd << 1)));
-            _mm_maskmoveu_si128(y1_3_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd * 3));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
+            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);
+
 
             ht -= 4;
             pu1_src1 += src_strd1 << 2;
@@ -1187,11 +1147,6 @@ void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
     {
         __m128i y1_0_8x16b, y2_0_8x16b;
 
-        __m128i mask_full_16x8b, mask_ll4B_16x8b;
-
-        mask_full_16x8b = _mm_set1_epi8(0xff);
-        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
-
         do
         {
             y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
@@ -1218,9 +1173,8 @@
             y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
             y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
 
-            _mm_maskmoveu_si128(y1_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
-            _mm_maskmoveu_si128(y1_1_16x8b, mask_ll4B_16x8b,
-                                (char*)(pu1_dst + dst_strd));
+            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
+            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
 
             ht -= 2;
             pu1_src1 += src_strd1 << 1;
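The padding and horizontal intra-prediction hunks above apply one more simplification: a 16-byte `_mm_loadu_si128` followed by an `_mm_shuffle_epi8` broadcast becomes `_mm_set1_epi8`/`_mm_set1_epi16` of a scalar read, so only the single edge pixel (or UV pair) is actually loaded and no shuffle control vector is needed. A minimal standalone sketch of that substitution, again with invented data:

```c
#include <tmmintrin.h> /* SSSE3 for _mm_shuffle_epi8; SSE2 for the rest */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t src[16] = {7}; /* src[0] = 7 is the edge pixel to replicate */
    uint8_t pad[16];

    /* before: wide 16-byte load, then shuffle with an all-zero
       control vector to splat byte 0 across the register */
    __m128i v_old = _mm_loadu_si128((__m128i *)src);
    v_old = _mm_shuffle_epi8(v_old, _mm_setzero_si128());

    /* after: read one byte and broadcast it; no 16-byte read needed */
    __m128i v_new = _mm_set1_epi8((char)*src);

    _mm_storeu_si128((__m128i *)pad, v_new);
    printf("%d %d %d\n", pad[0], pad[15],
           _mm_cvtsi128_si32(v_old) & 0xff); /* prints: 7 7 7 */
    return 0;
}
```

Besides dropping an instruction and a constant, the scalar read avoids requiring 16 readable bytes at the row edge when only one pixel is wanted.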