author     Ray Essick <essick@google.com>    2019-03-29 15:30:55 -0700
committer  Ray Essick <essick@google.com>    2019-05-03 21:28:43 +0000
commit     ec6586dd308c18c15b581e3579894b4204c834bc (patch)
tree       97c131f6ce3576d63a07f047e11bc004ed19f117 /libaom/av1/common/x86
parent     b2a64d5cd5a1ee0c01456cbeb86c45a72eca9618 (diff)
libaom: Pull from upstream
Current HEAD: 978ab9e6cd19904cdd54b69a4c30b10c747eb55a

git log from upstream:
978ab9e6c AV1 levels: add min frame width and height
62133bf3e AV1 levels: add max superres tile width
dee839cea AV1 levels: add max tile rate
1ca3b2652 Correct ref frame buffer in scaled subpixel simple_motion_search
ceb16a2e6 Introduce early exit for partition none
5bdd95475 Implement av1_get_seq_level_idx()
d06d2d5d3 Refactor check_level_constraints()
352263271 Move some data from AV1LevelSpec to AV1LevelStats
bfe92612d AV1 levels: add check for min compression ratio
87a8394ac FIRSTPASS_STATS: Add comments for struct members.
a1cf38d09 Disable two pass partition search on lowres and midres
ecf5a3c12 Level test: add testcase for target level index 19
37fa0e848 AV1 levels: add header, display and decode rate
eff7d3079 Remove unused parameters in tpl experiment
13cccf2db Update border for ref buffer to allow scaled pred
fa946afbf Temp fix for ctrl based resize setting
81a59f162 Add data structure to store frame info.
98bb9d649 GF length reduction: respect min_gf_interval.
42f22cce2 Speed feature for adaptive-tx-search
987055e30 Update level info when show_existing_frame is ture
<...>

Bug: 124137416
Test: video playback
Change-Id: I710b863d81cc663c8e286732f32e9b56ab35a5a0
Diffstat (limited to 'libaom/av1/common/x86')
-rw-r--r--  libaom/av1/common/x86/av1_convolve_scale_sse4.c       8
-rw-r--r--  libaom/av1/common/x86/av1_inv_txfm_ssse3.c           14
-rw-r--r--  libaom/av1/common/x86/av1_inv_txfm_ssse3.h            4
-rw-r--r--  libaom/av1/common/x86/av1_txfm_sse4.c                 2
-rw-r--r--  libaom/av1/common/x86/convolve_2d_avx2.c            140
-rw-r--r--  libaom/av1/common/x86/convolve_2d_sse2.c             31
-rw-r--r--  libaom/av1/common/x86/convolve_avx2.c               516
-rw-r--r--  libaom/av1/common/x86/highbd_convolve_2d_avx2.c       4
-rw-r--r--  libaom/av1/common/x86/highbd_convolve_2d_sse4.c      37
-rw-r--r--  libaom/av1/common/x86/highbd_inv_txfm_avx2.c        236
-rw-r--r--  libaom/av1/common/x86/highbd_inv_txfm_sse4.c        809
-rw-r--r--  libaom/av1/common/x86/highbd_jnt_convolve_avx2.c     83
-rw-r--r--  libaom/av1/common/x86/highbd_jnt_convolve_sse4.c     38
-rw-r--r--  libaom/av1/common/x86/highbd_warp_plane_sse4.c        4
-rw-r--r--  libaom/av1/common/x86/jnt_convolve_avx2.c           778
-rw-r--r--  libaom/av1/common/x86/jnt_convolve_sse2.c            56
-rw-r--r--  libaom/av1/common/x86/jnt_convolve_ssse3.c           15
-rw-r--r--  libaom/av1/common/x86/warp_plane_sse4.c               4
-rw-r--r--  libaom/av1/common/x86/wiener_convolve_avx2.c        398
19 files changed, 1819 insertions, 1358 deletions
diff --git a/libaom/av1/common/x86/av1_convolve_scale_sse4.c b/libaom/av1/common/x86/av1_convolve_scale_sse4.c
index d9fb537..8f44238 100644
--- a/libaom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/libaom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -175,7 +175,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
if (conv_params->is_compound) {
if (conv_params->do_average) {
const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
const __m128i shifted_32 =
@@ -207,7 +207,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
if (conv_params->is_compound) {
if (conv_params->do_average) {
int32_t tmp = dst16[y * dst16_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -408,7 +408,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
__m128i p_32 =
_mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
_mm_mullo_epi32(shifted, wt1));
shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
@@ -443,7 +443,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
if (conv_params->is_compound) {
if (conv_params->do_average) {
int32_t tmp = dst16[y * dst16_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
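
Every use_jnt_comp_avg -> use_dist_wtd_comp_avg change in this file (and in the convolve diffs below) guards the same distance-weighted compound average. As a minimal scalar sketch of what the weighted branch computes, assuming only the fwd_offset/bck_offset fields and DIST_PRECISION_BITS that appear in the hunks above; the function name and parameter names are illustrative, not part of the patch:

/* Hypothetical scalar form of the distance-weighted compound average.
 * Assumes <stdint.h> and libaom's DIST_PRECISION_BITS. */
static int32_t dist_wtd_comp_avg(int32_t ref_pred, int32_t cur_res,
                                 int fwd_offset, int bck_offset) {
  /* Weight the stored reference prediction and the current result, then
   * drop the weight precision again. */
  const int32_t tmp = ref_pred * fwd_offset + cur_res * bck_offset;
  return tmp >> DIST_PRECISION_BITS;
}

When use_dist_wtd_comp_avg is false, the else branches (elided in the hunks) fall back to an equal-weight average of the same two values.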
diff --git a/libaom/av1/common/x86/av1_inv_txfm_ssse3.c b/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
index 9841bf3..de0a561 100644
--- a/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -2920,8 +2920,18 @@ void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
if (!txfm_param->lossless) {
- av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
- txfm_param->tx_size, txfm_param->eob);
+ switch (txfm_param->tx_size) {
+ case TX_4X16:
+ case TX_16X4:
+ // TODO(http://crbug.com/aomedia/2350): the ssse3 versions cause test
+ // vector mismatches.
+ av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+ break;
+ default:
+ av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
+ txfm_param->tx_size, txfm_param->eob);
+ break;
+ }
} else {
av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
}
diff --git a/libaom/av1/common/x86/av1_inv_txfm_ssse3.h b/libaom/av1/common/x86/av1_inv_txfm_ssse3.h
index 66bd339..7d5055d 100644
--- a/libaom/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/libaom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -72,13 +72,13 @@ static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
}
// 1D itx types
-typedef enum ATTRIBUTE_PACKED {
+enum {
IDCT_1D,
IADST_1D,
IFLIPADST_1D = IADST_1D,
IIDENTITY_1D,
ITX_TYPES_1D,
-} ITX_TYPE_1D;
+} UENUM1BYTE(ITX_TYPE_1D);
static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
diff --git a/libaom/av1/common/x86/av1_txfm_sse4.c b/libaom/av1/common/x86/av1_txfm_sse4.c
index 90b9879..65ccd19 100644
--- a/libaom/av1/common/x86/av1_txfm_sse4.c
+++ b/libaom/av1/common/x86/av1_txfm_sse4.c
@@ -9,7 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse4.h"
diff --git a/libaom/av1/common/x86/convolve_2d_avx2.c b/libaom/av1/common/x86/convolve_2d_avx2.c
index 0acafd0..ae12a60 100644
--- a/libaom/av1/common/x86/convolve_2d_avx2.c
+++ b/libaom/av1/common/x86/convolve_2d_avx2.c
@@ -27,31 +27,15 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
-
- DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
- int im_h = h + filter_params_y->taps - 1;
int im_stride = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
+ int i, is_horiz_4tap = 0, is_vert_4tap = 0;
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
const int bits =
FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- __m256i filt[4], coeffs_h[4], coeffs_v[4];
-
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
-
const __m256i round_const_h = _mm256_set1_epi16(
((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
@@ -65,58 +49,96 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
((1 << (offset_bits - conv_params->round_1)) >> 1));
const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
- for (j = 0; j < w; j += 8) {
- for (i = 0; i < im_h; i += 2) {
- __m256i data = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+ __m256i filt[4], coeffs_h[4], coeffs_v[4];
+
+ filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
- // Load the next line
- if (i + 1 < im_h)
+ // Condition for checking valid horz_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0)))
+ is_horiz_4tap = 1;
+
+ // Condition for checking valid vert_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_v[0], coeffs_v[3]), 0)))
+ is_vert_4tap = 1;
+
+ // horz_filt as 4 tap and vert_filt as 8 tap
+ if (is_horiz_4tap) {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // horz-filter
+ for (int j = 0; j < w; j += 8) {
+ for (i = 0; i < (im_h - 2); i += 2) {
+ __m256i data = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+ // Load the next line
data = _mm256_inserti128_si256(
data,
_mm_loadu_si128(
(__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
1);
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt);
- __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
+ round_shift_h);
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ __m256i data_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+ __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt);
res =
_mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
-
_mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
- }
- /* Vertical filter */
- {
+ // vert filter
+ CONVOLVE_SR_VERTICAL_FILTER_8TAP;
+ }
+ } else if (is_vert_4tap) {
+ int im_h = h + 3;
+ const int fo_vert = 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ // horz_filter
+ CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
+ // vert_filter
+ __m256i s[6];
__m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
__m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
__m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
__m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
- __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
- __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
- __m256i s[8];
s[0] = _mm256_unpacklo_epi16(src_0, src_1);
s[1] = _mm256_unpacklo_epi16(src_2, src_3);
- s[2] = _mm256_unpacklo_epi16(src_4, src_5);
-
- s[4] = _mm256_unpackhi_epi16(src_0, src_1);
- s[5] = _mm256_unpackhi_epi16(src_2, src_3);
- s[6] = _mm256_unpackhi_epi16(src_4, src_5);
+ s[3] = _mm256_unpackhi_epi16(src_0, src_1);
+ s[4] = _mm256_unpackhi_epi16(src_2, src_3);
for (i = 0; i < h; i += 2) {
const int16_t *data = &im_block[i * im_stride];
- const __m256i s6 =
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
- const __m256i s7 =
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
- s[3] = _mm256_unpacklo_epi16(s6, s7);
- s[7] = _mm256_unpackhi_epi16(s6, s7);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ s[5] = _mm256_unpackhi_epi16(s4, s5);
- __m256i res_a = convolve(s, coeffs_v);
- __m256i res_b = convolve(s + 4, coeffs_v);
+ __m256i res_a = convolve_4tap(s, coeffs_v + 1);
+ __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1);
// Combine V round and 2F-H-V round into a single rounding
res_a =
@@ -154,13 +176,25 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
s[0] = s[1];
s[1] = s[2];
- s[2] = s[3];
-
+ s[3] = s[4];
s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
}
}
+ } else {
+ int j;
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (j = 0; j < w; j += 8) {
+ CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
+
+ CONVOLVE_SR_VERTICAL_FILTER_8TAP;
+ }
}
}
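
The new is_horiz_4tap / is_vert_4tap checks in av1_convolve_2d_sr_avx2 (and in convolve_avx2.c below) test whether the outer taps of the packed 8-tap filter are all zero, so a cheaper 4-tap path can be taken. A hypothetical scalar equivalent of that test, written against a plain 8-entry tap array instead of the packed AVX2 coeffs; the name is illustrative only:

/* Sketch: an 8-tap filter is effectively 4-tap when taps 0, 1, 6 and 7 are
 * all zero. Assumes <stdint.h>. */
static int is_effectively_4tap(const int16_t taps[8]) {
  return (taps[0] | taps[1] | taps[6] | taps[7]) == 0;
}

When the test passes, the 4-tap paths above index the packed coefficients from coeffs + 1 (taps 2..5) and shrink the filter origin offset to 1.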
@@ -195,20 +229,20 @@ void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
if (w == 2) {
do {
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
} while (h);
} else if (w == 4) {
do {
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
diff --git a/libaom/av1/common/x86/convolve_2d_sse2.c b/libaom/av1/common/x86/convolve_2d_sse2.c
index b1a62a4..369922b 100644
--- a/libaom/av1/common/x86/convolve_2d_sse2.c
+++ b/libaom/av1/common/x86/convolve_2d_sse2.c
@@ -255,20 +255,20 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
if (w == 2) {
do {
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
} while (h);
} else if (w == 4) {
do {
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
@@ -354,12 +354,11 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
}
}
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_sse2(
+ const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -371,7 +370,7 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
const int bits =
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const __m128i zero = _mm_setzero_si128();
const __m128i left_shift = _mm_cvtsi32_si128(bits);
int i, j;
@@ -411,14 +410,14 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
const __m128i data_ref_0_hi =
_mm_loadu_si128((__m128i *)(&dst[j + 8]));
- const __m128i comp_avg_res_lo =
- comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg);
const __m128i round_result_lo = convolve_rounding(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
- const __m128i comp_avg_res_hi =
- comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg);
const __m128i round_result_hi = convolve_rounding(
&comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
@@ -449,7 +448,7 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/convolve_avx2.c b/libaom/av1/common/x86/convolve_avx2.c
index 0e91ea9..21b9fe4 100644
--- a/libaom/av1/common/x86/convolve_avx2.c
+++ b/libaom/av1/common/x86/convolve_avx2.c
@@ -23,153 +23,239 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride;
-
+ int i, j, is_vert_4tap = 0;
// right shift is F-1 because we are already dividing
// filter co-efficients by 2
const int right_shift_bits = (FILTER_BITS - 1);
const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
const __m256i right_shift_const =
_mm256_set1_epi16((1 << right_shift_bits) >> 1);
- __m256i coeffs[4], s[8];
assert(conv_params->round_0 <= FILTER_BITS);
assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
- prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
-
(void)filter_params_x;
(void)subpel_x_q4;
(void)conv_params;
+ __m256i coeffs[4], s[8];
+ __m128i d[6];
- for (j = 0; j < w; j += 16) {
- const uint8_t *data = &src_ptr[j];
- __m256i src6;
-
- // Load lines a and b. Line a to lower 128, line b to upper 128
- const __m256i src_01a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- 0x20);
-
- const __m256i src_12a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- 0x20);
-
- const __m256i src_23a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- 0x20);
-
- const __m256i src_34a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- 0x20);
-
- const __m256i src_45a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
- const __m256i src_56a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- src6, 0x20);
-
- s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
- s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
- s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
- s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
- s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
- const __m256i src_67a = _mm256_permute2x128_si256(
- src6,
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- 0x20);
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
- const __m256i src_78a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- src6, 0x20);
+ // Condition for checking valid vert_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_vert_4tap = 1;
+
+ // vert_filt as 4 tap
+ if (is_vert_4tap) {
+ const int fo_vert = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
- s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
- s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
- const __m256i res_lo = convolve_lowbd(s, coeffs);
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
- /* rounding code */
- // shift by F - 1
- const __m256i res_16b_lo = _mm256_sra_epi16(
- _mm256_add_epi16(res_lo, right_shift_const), right_shift);
- // 8 bit conversion and saturation to uint8
- __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- if (w - j > 8) {
- const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+ s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
/* rounding code */
// shift by F - 1
- const __m256i res_16b_hi = _mm256_sra_epi16(
- _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
// 8 bit conversion and saturation to uint8
- __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
-
- __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
-
- const __m128i res_0 = _mm256_castsi256_si128(res_a);
- const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
-
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
- res_1);
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
- const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
- if (w - j > 4) {
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
res_1);
- } else if (w - j > 2) {
- xx_storel_32(&dst[i * dst_stride + j], res_0);
- xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
} else {
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
- __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
- *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
- *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
}
+ s[0] = s[1];
+ s[1] = s[2];
+
+ s[3] = s[4];
+ s[4] = s[5];
}
+ }
+ } else {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+ s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ const __m256i res_lo = convolve_lowbd(s, coeffs);
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
}
}
}
@@ -180,26 +266,14 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
- int i, j;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_horiz;
const int bits = FILTER_BITS - conv_params->round_0;
- __m256i filt[4], coeffs[4];
-
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
-
const __m256i round_0_const =
_mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
const __m128i round_shift = _mm_cvtsi32_si128(bits);
-
+ int i, is_horiz_4tap = 0;
(void)filter_params_y;
(void)subpel_y_q4;
@@ -208,51 +282,101 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
assert(conv_params->round_0 > 0);
- if (w <= 8) {
- for (i = 0; i < h; i += 2) {
- const __m256i data = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
- 0x20);
-
- __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
-
- res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
- round_0_shift);
-
- res_16b =
- _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift);
-
- /* rounding code */
- // 8 bit conversion and saturation to uint8
- __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
-
- const __m128i res_0 = _mm256_castsi256_si128(res_8b);
- const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
- if (w > 4) {
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
- } else if (w > 2) {
- xx_storel_32(&dst[i * dst_stride], res_0);
- xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
- } else {
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
- __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
- *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
- *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ __m256i coeffs[4], filt[4];
+ filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+
+ // Condition for checking valid horz_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_horiz_4tap = 1;
+
+ // horz_filt as 4 tap
+ if (is_horiz_4tap) {
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+
+ __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
}
}
} else {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18
- // 19 20 21 22 23
- const __m256i data = _mm256_inserti128_si256(
- _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
- 1);
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
__m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
@@ -266,11 +390,49 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
// 8 bit conversion and saturation to uint8
__m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
- // Store values into the destination buffer
- // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
- res_8b = _mm256_permute4x64_epi64(res_8b, 216);
- __m128i res = _mm256_castsi256_si128(res_8b);
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
}
}
}
diff --git a/libaom/av1/common/x86/highbd_convolve_2d_avx2.c b/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
index ae68f0b..357df12 100644
--- a/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -238,10 +238,10 @@ void av1_highbd_convolve_2d_copy_sr_avx2(
if (w == 2) {
do {
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
diff --git a/libaom/av1/common/x86/highbd_convolve_2d_sse4.c b/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
index 3f8dafb..3c1d5d1 100644
--- a/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -21,7 +21,7 @@
#include "aom_dsp/x86/convolve_sse4_1.h"
#include "av1/common/convolve.h"
-void av1_highbd_jnt_convolve_2d_copy_sse4_1(
+void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -37,7 +37,7 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m128i wt0 = _mm_set1_epi32(w0);
@@ -75,15 +75,17 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
const __m128i res_unsigned_lo =
_mm_add_epi32(res_32b_lo, offset_const);
- const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
const __m128i res_unsigned_hi =
_mm_add_epi32(res_32b_hi, offset_const);
- const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo = highbd_convolve_rounding_sse2(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -132,9 +134,9 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
_mm_add_epi32(res_32b_hi, offset_const);
const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo = highbd_convolve_rounding_sse2(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -166,7 +168,7 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
}
}
-void av1_highbd_jnt_convolve_2d_sse4_1(
+void av1_highbd_dist_wtd_convolve_2d_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -179,7 +181,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
int im_stride = MAX_SB_SIZE;
int i, j;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
@@ -359,8 +361,9 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);
- const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res =
+ highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result = highbd_convolve_rounding_sse2(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -391,10 +394,12 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);
- const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo =
highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
diff --git a/libaom/av1/common/x86/highbd_inv_txfm_avx2.c b/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
index 5418057..fe22465 100644
--- a/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -4309,213 +4309,17 @@ void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
stride, tx_type, tx_size, eob, bd);
break;
- default: assert(0); break;
- }
-}
-
-void av1_highbd_inv_txfm_add_16x16_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- default:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- const int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- // Assembly version doesn't support IDTX, so use C version for it.
- case IDTX:
- av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
-
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_16x32_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_32x16_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-void av1_highbd_inv_txfm_add_8x8_avx2(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
case IDTX:
- av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-void av1_highbd_inv_txfm_add_8x32_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_32x8_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-void av1_highbd_inv_txfm_add_16x8_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
case H_DCT:
- case V_ADST:
case H_ADST:
- case V_FLIPADST:
case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_8x16_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
case V_DCT:
- case H_DCT:
case V_ADST:
- case H_ADST:
case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
+ tx_size, eob, bd);
break;
+ default: assert(0); break;
}
}
void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
@@ -4523,33 +4327,12 @@ void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
- case TX_32X32:
- av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param);
- break;
- case TX_16X16:
- av1_highbd_inv_txfm_add_16x16_avx2(input, dest, stride, txfm_param);
- break;
- case TX_8X8:
- av1_highbd_inv_txfm_add_8x8_avx2(input, dest, stride, txfm_param);
- break;
case TX_4X8:
av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
break;
case TX_8X4:
av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
break;
- case TX_8X16:
- av1_highbd_inv_txfm_add_8x16_avx2(input, dest, stride, txfm_param);
- break;
- case TX_16X8:
- av1_highbd_inv_txfm_add_16x8_avx2(input, dest, stride, txfm_param);
- break;
- case TX_16X32:
- av1_highbd_inv_txfm_add_16x32_avx2(input, dest, stride, txfm_param);
- break;
- case TX_32X16:
- av1_highbd_inv_txfm_add_32x16_avx2(input, dest, stride, txfm_param);
- break;
case TX_4X4:
av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
break;
@@ -4559,21 +4342,10 @@ void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
case TX_4X16:
av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
break;
- case TX_8X32:
- av1_highbd_inv_txfm_add_8x32_avx2(input, dest, stride, txfm_param);
- break;
- case TX_32X8:
- av1_highbd_inv_txfm_add_32x8_avx2(input, dest, stride, txfm_param);
- break;
- case TX_64X64:
- case TX_32X64:
- case TX_64X32:
- case TX_16X64:
- case TX_64X16:
+ default:
av1_highbd_inv_txfm2d_add_universe_avx2(
input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
txfm_param->eob, txfm_param->bd);
break;
- default: assert(0 && "Invalid transform size"); break;
}
}
diff --git a/libaom/av1/common/x86/highbd_inv_txfm_sse4.c b/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
index 12c6350..8a8641d 100644
--- a/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -583,7 +583,66 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
_mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
_mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
}
+static void highbd_clamp_epi32_sse4_1(const __m128i *in, __m128i *out,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi, int size) {
+ __m128i a0, a1;
+ for (int i = 0; i < size; i += 4) {
+ a0 = _mm_max_epi32(in[i], *clamp_lo);
+ out[i] = _mm_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
+ out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
+
+ a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
+ out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
+ out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
+ }
+}
+static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ (void)out_shift;
+ __m128i v[4];
+ __m128i fact = _mm_set1_epi32(NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a0, a1;
+
+ a0 = _mm_mullo_epi32(in[0], fact);
+ a1 = _mm_mullo_epi32(in[1], fact);
+ a0 = _mm_add_epi32(a0, offset);
+ a1 = _mm_add_epi32(a1, offset);
+ out[0] = _mm_srai_epi32(a0, NewSqrt2Bits);
+ out[1] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+ a0 = _mm_mullo_epi32(in[2], fact);
+ a1 = _mm_mullo_epi32(in[3], fact);
+ a0 = _mm_add_epi32(a0, offset);
+ a1 = _mm_add_epi32(a1, offset);
+ out[2] = _mm_srai_epi32(a0, NewSqrt2Bits);
+ out[3] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
+ }
+
+ // Transpose for 4x4
+ v[0] = _mm_unpacklo_epi32(out[0], out[1]);
+ v[1] = _mm_unpackhi_epi32(out[0], out[1]);
+ v[2] = _mm_unpacklo_epi32(out[2], out[3]);
+ v[3] = _mm_unpackhi_epi32(out[2], out[3]);
+ out[0] = _mm_unpacklo_epi64(v[0], v[2]);
+ out[1] = _mm_unpackhi_epi64(v[0], v[2]);
+ out[2] = _mm_unpacklo_epi64(v[1], v[3]);
+ out[3] = _mm_unpackhi_epi64(v[1], v[3]);
+}
void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[4];
@@ -646,6 +705,48 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
+ case IDTX:
+ load_buffer_4x4(coeff, in);
+ iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_DCT:
+ load_buffer_4x4(coeff, in);
+ iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case H_DCT:
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_ADST:
+ load_buffer_4x4(coeff, in);
+ iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case H_ADST:
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x4(coeff, in);
+ iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+ break;
default: assert(0);
}
}
@@ -1116,6 +1217,61 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
&clamp_hi_out, out_shift);
}
}
+static void shift_sse4_1(const __m128i *in, __m128i *out,
+ const __m128i *clamp_lo, const __m128i *clamp_hi,
+ int shift, int size) {
+ __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
+ __m128i shift_vec = _mm_cvtsi32_si128(shift);
+ __m128i a0, a1;
+ for (int i = 0; i < size; i += 4) {
+ a0 = _mm_add_epi32(in[i], offset);
+ a1 = _mm_add_epi32(in[i + 1], offset);
+ a0 = _mm_sra_epi32(a0, shift_vec);
+ a1 = _mm_sra_epi32(a1, shift_vec);
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ out[i] = _mm_min_epi32(a0, *clamp_hi);
+ out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
+
+ a0 = _mm_add_epi32(in[i + 2], offset);
+ a1 = _mm_add_epi32(in[i + 3], offset);
+ a0 = _mm_sra_epi32(a0, shift_vec);
+ a1 = _mm_sra_epi32(a1, shift_vec);
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
+ out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
+ }
+}
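A scalar outline of shift_sse4_1 above (illustrative only, not part of the patch): each 32-bit lane gets a round-to-nearest right shift followed by a clamp to [clamp_lo, clamp_hi].

#include <stdint.h>

/* Rounding right-shift then clamp, matching the offset / _mm_sra_epi32 /
 * _mm_max_epi32 / _mm_min_epi32 sequence applied per lane above. */
static int32_t round_shift_clamp(int32_t x, int shift, int32_t clamp_lo,
                                 int32_t clamp_hi) {
  int32_t v = (x + ((1 << shift) >> 1)) >> shift;
  if (v < clamp_lo) v = clamp_lo;
  if (v > clamp_hi) v = clamp_hi;
  return v;
}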
+
+static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i v[8];
+ v[0] = _mm_add_epi32(in[0], in[0]);
+ v[1] = _mm_add_epi32(in[1], in[1]);
+ v[2] = _mm_add_epi32(in[2], in[2]);
+ v[3] = _mm_add_epi32(in[3], in[3]);
+ v[4] = _mm_add_epi32(in[4], in[4]);
+ v[5] = _mm_add_epi32(in[5], in[5]);
+ v[6] = _mm_add_epi32(in[6], in[6]);
+ v[7] = _mm_add_epi32(in[7], in[7]);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 8);
+ } else {
+ highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 8);
+ }
+}
static void round_shift_8x8(__m128i *in, int shift) {
round_shift_4x4(&in[0], shift);
@@ -3000,7 +3156,59 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
}
}
}
+static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i v[16];
+ __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a0, a1, a2, a3;
+
+ for (int i = 0; i < 16; i += 8) {
+ a0 = _mm_mullo_epi32(in[i], fact);
+ a1 = _mm_mullo_epi32(in[i + 1], fact);
+ a0 = _mm_add_epi32(a0, offset);
+ a1 = _mm_add_epi32(a1, offset);
+ v[i] = _mm_srai_epi32(a0, NewSqrt2Bits);
+ v[i + 1] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+ a2 = _mm_mullo_epi32(in[i + 2], fact);
+ a3 = _mm_mullo_epi32(in[i + 3], fact);
+ a2 = _mm_add_epi32(a2, offset);
+ a3 = _mm_add_epi32(a3, offset);
+ v[i + 2] = _mm_srai_epi32(a2, NewSqrt2Bits);
+ v[i + 3] = _mm_srai_epi32(a3, NewSqrt2Bits);
+
+ a0 = _mm_mullo_epi32(in[i + 4], fact);
+ a1 = _mm_mullo_epi32(in[i + 5], fact);
+ a0 = _mm_add_epi32(a0, offset);
+ a1 = _mm_add_epi32(a1, offset);
+ v[i + 4] = _mm_srai_epi32(a0, NewSqrt2Bits);
+ v[i + 5] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+ a2 = _mm_mullo_epi32(in[i + 6], fact);
+ a3 = _mm_mullo_epi32(in[i + 7], fact);
+ a2 = _mm_add_epi32(a2, offset);
+ a3 = _mm_add_epi32(a3, offset);
+ v[i + 6] = _mm_srai_epi32(a2, NewSqrt2Bits);
+ v[i + 7] = _mm_srai_epi32(a3, NewSqrt2Bits);
+ }
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 16);
+ } else {
+ highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 16);
+ }
+}
static INLINE void idct64_stage8_sse4_1(
__m128i *u, const __m128i *cospim32, const __m128i *cospi32,
const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
@@ -5020,207 +5228,23 @@ void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
const TX_TYPE tx_type = txfm_param->tx_type;
const int32_t *src = cast_to_int32(input);
switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
case IDTX:
- av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
case H_DCT:
- case V_ADST:
case H_ADST:
- case V_FLIPADST:
case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
case V_DCT:
- case H_DCT:
case V_ADST:
- case H_ADST:
case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default:
av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
txfm_param->tx_size,
txfm_param->eob, bd);
break;
- }
-}
-
-void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
- uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
default:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
- uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- // Assembly version doesn't support IDTX, so use C version for it.
- case IDTX:
- av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_16x32_sse4_1(const tran_low_t *input,
- uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_32x16_sse4_1(const tran_low_t *input,
- uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_8x32_sse4_1(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_32x8_sse4_1(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
break;
- default: assert(0);
}
}
-
void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
int stride,
const TxfmParam *txfm_param) {
@@ -5235,53 +5259,271 @@ void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
return;
}
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- }
+ av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
}
+static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i v[32];
+ for (int i = 0; i < 32; i += 16) {
+ v[i] = _mm_slli_epi32(in[i], 2);
+ v[i + 1] = _mm_slli_epi32(in[i + 1], 2);
+ v[i + 2] = _mm_slli_epi32(in[i + 2], 2);
+ v[i + 3] = _mm_slli_epi32(in[i + 3], 2);
+ v[i + 4] = _mm_slli_epi32(in[i + 4], 2);
+ v[i + 5] = _mm_slli_epi32(in[i + 5], 2);
+ v[i + 6] = _mm_slli_epi32(in[i + 6], 2);
+ v[i + 7] = _mm_slli_epi32(in[i + 7], 2);
+ v[i + 8] = _mm_slli_epi32(in[i + 8], 2);
+ v[i + 9] = _mm_slli_epi32(in[i + 9], 2);
+ v[i + 10] = _mm_slli_epi32(in[i + 10], 2);
+ v[i + 11] = _mm_slli_epi32(in[i + 11], 2);
+ v[i + 12] = _mm_slli_epi32(in[i + 12], 2);
+ v[i + 13] = _mm_slli_epi32(in[i + 13], 2);
+ v[i + 14] = _mm_slli_epi32(in[i + 14], 2);
+ v[i + 15] = _mm_slli_epi32(in[i + 15], 2);
+ }
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 32);
+ } else {
+ highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 32);
+ }
+}
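Taken together, the identity kernels added here scale an N-point 1D identity transform by sqrt(N/2): iidentity4 multiplies by NewSqrt2 (sqrt(2) in fixed point), iidentity8 doubles the input, iidentity16 multiplies by 2 * NewSqrt2 (2*sqrt(2)), and iidentity32 shifts left by 2 (a factor of 4).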
static const transform_1d_sse4_1
highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
{
{ idct4x4_sse4_1, NULL, NULL, NULL },
{ iadst4x4_sse4_1, NULL, NULL, NULL },
- { NULL, NULL, NULL, NULL },
+ { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
},
{ { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
{ iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
- { NULL, NULL, NULL, NULL } },
+ { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
{
{ idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
NULL },
{ iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
NULL },
- { NULL, NULL, NULL, NULL },
+ { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
},
{ { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
idct32x32_sse4_1 },
{ NULL, NULL, NULL, NULL },
- { NULL, NULL, NULL, NULL } },
+ { iidentity32_sse4_1, NULL, NULL, NULL } },
{ { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
idct64x64_sse4_1 },
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } }
};
+static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int buf_size_w_div4 = input_stride >> 2;
+ const int buf_size_h_div8 = (eoby + 8) >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
+ __m128i buf0[16];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
+ NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+ _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+ _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+ _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+ }
+ }
+ for (int i = 0; i < buf_size_w_div4; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
+ stride, ud_flip, txfm_size_row, bd);
+ }
+}
+static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int buf_size_w_div8 = input_stride >> 2;
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < (row_max >> 2); ++i) {
+ __m128i buf0[16];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(
+ buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
+static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[64 * 4];
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+ for (int i = 0; i < (row_max >> 2); ++i) {
+ __m128i buf0[32];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < (input_stride >> 2); ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
+ NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ for (int j = 0; j < (input_stride >> 2); ++j) {
+ _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+ _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+ _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+ _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+ }
+ }
+ for (int i = 0; i < (input_stride >> 2); i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, 0, txfm_size_row,
+ bd);
+ }
+ }
+}
static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
uint16_t *output,
int stride, TX_TYPE tx_type,
@@ -5613,6 +5855,24 @@ void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
bd);
break;
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ highbd_inv_txfm2d_add_h_identity_ssse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ highbd_inv_txfm2d_add_v_identity_ssse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ case IDTX:
+ highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
+ stride, tx_type, tx_size, eob, bd);
+ break;
default: assert(0); break;
}
}
@@ -5623,26 +5883,9 @@ void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest,
int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const TX_SIZE tx_size = txfm_param->tx_size;
- const int32_t *src = cast_to_int32(input);
int eob = txfm_param->eob;
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, tx_size, eob, bd);
- break;
- }
+ highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
}
void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5651,26 +5894,9 @@ void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const TX_SIZE tx_size = txfm_param->tx_size;
- const int32_t *src = cast_to_int32(input);
int eob = txfm_param->eob;
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, tx_size, eob, bd);
- break;
- }
+ highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
}
void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5679,26 +5905,9 @@ void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest,
int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const TX_SIZE tx_size = txfm_param->tx_size;
- const int32_t *src = cast_to_int32(input);
int eob = txfm_param->eob;
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest),
- stride, tx_type, tx_size, eob, bd);
- break;
- }
+ highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
}
void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5707,26 +5916,9 @@ void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest,
int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const TX_SIZE tx_size = txfm_param->tx_size;
- const int32_t *src = cast_to_int32(input);
int eob = txfm_param->eob;
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest),
- stride, tx_type, tx_size, eob, bd);
- break;
- }
+ highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
}
void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5734,57 +5926,16 @@ void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
- case TX_32X32:
- av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X16:
- av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_8X8:
- av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
- break;
case TX_4X8:
av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
break;
case TX_8X4:
av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
break;
- case TX_8X16:
- av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X8:
- av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X32:
- av1_highbd_inv_txfm_add_16x32_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_32X16:
- av1_highbd_inv_txfm_add_32x16_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_4X4:
- av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X4:
- av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_4X16:
- av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_8X32:
- av1_highbd_inv_txfm_add_8x32_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_32X8:
- av1_highbd_inv_txfm_add_32x8_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_64X64:
- case TX_32X64:
- case TX_64X32:
- case TX_16X64:
- case TX_64X16:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(
- input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
- txfm_param->eob, txfm_param->bd);
+ default:
+ // TODO(http://crbug.com/aomedia/2350): the remaining sse4_1 versions
+ // cause test vector mismatches.
+ av1_highbd_inv_txfm_add_c(input, dest, stride, txfm_param);
break;
- default: assert(0 && "Invalid transform size"); break;
}
}
diff --git a/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c b/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
index e298cf6..c5040c4 100644
--- a/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -22,7 +22,7 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-void av1_highbd_jnt_convolve_2d_copy_avx2(
+void av1_highbd_dist_wtd_convolve_2d_copy_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -38,7 +38,7 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m256i wt0 = _mm256_set1_epi32(w0);
@@ -78,15 +78,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
const __m256i res_unsigned_lo =
_mm256_add_epi32(res_32b_lo, offset_const);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
const __m256i res_unsigned_hi =
_mm256_add_epi32(res_32b_hi, offset_const);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo = highbd_convolve_rounding(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -135,8 +137,9 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
const __m256i res_unsigned_lo =
_mm256_add_epi32(res_32b, offset_const);
- const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -179,15 +182,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
const __m256i res_unsigned_lo =
_mm256_add_epi32(res_32b_lo, offset_const);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
const __m256i res_unsigned_hi =
_mm256_add_epi32(res_32b_hi, offset_const);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo =
highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
@@ -223,7 +228,7 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
}
}
-void av1_highbd_jnt_convolve_2d_avx2(
+void av1_highbd_dist_wtd_convolve_2d_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -244,7 +249,7 @@ void av1_highbd_jnt_convolve_2d_avx2(
__m256i s[8], coeffs_y[4], coeffs_x[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
@@ -364,8 +369,9 @@ void av1_highbd_jnt_convolve_2d_avx2(
const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
- const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -409,10 +415,12 @@ void av1_highbd_jnt_convolve_2d_avx2(
const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo =
highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
@@ -456,7 +464,7 @@ void av1_highbd_jnt_convolve_2d_avx2(
}
}
-void av1_highbd_jnt_convolve_x_avx2(
+void av1_highbd_dist_wtd_convolve_x_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -473,7 +481,7 @@ void av1_highbd_jnt_convolve_x_avx2(
__m256i s[4], coeffs_x[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m256i wt0 = _mm256_set1_epi32(w0);
@@ -548,7 +556,7 @@ void av1_highbd_jnt_convolve_x_avx2(
const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -588,10 +596,12 @@ void av1_highbd_jnt_convolve_x_avx2(
const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo = highbd_convolve_rounding(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -623,7 +633,7 @@ void av1_highbd_jnt_convolve_x_avx2(
}
}
-void av1_highbd_jnt_convolve_y_avx2(
+void av1_highbd_dist_wtd_convolve_y_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -640,7 +650,7 @@ void av1_highbd_jnt_convolve_y_avx2(
int i, j;
__m256i s[8], coeffs_y[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
@@ -753,8 +763,9 @@ void av1_highbd_jnt_convolve_y_avx2(
const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
- const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -799,10 +810,12 @@ void av1_highbd_jnt_convolve_y_avx2(
const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo =
highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
diff --git a/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c b/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
index 1a29985..7fea36a 100644
--- a/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -17,7 +17,7 @@
#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_sse4_1.h"
-void av1_highbd_jnt_convolve_y_sse4_1(
+void av1_highbd_dist_wtd_convolve_y_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -33,7 +33,7 @@ void av1_highbd_jnt_convolve_y_sse4_1(
assert(bits >= 0);
int i, j;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
@@ -121,10 +121,12 @@ void av1_highbd_jnt_convolve_y_sse4_1(
const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
- const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg);
- const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1(
- &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_1 =
+ highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_0 =
highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const,
@@ -186,16 +188,16 @@ void av1_highbd_jnt_convolve_y_sse4_1(
const __m128i comp_avg_res_lo_0 =
highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_lo_1 =
highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_hi_0 =
highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_hi_1 =
highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo_0 =
highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const,
@@ -257,7 +259,7 @@ void av1_highbd_jnt_convolve_y_sse4_1(
}
}
-void av1_highbd_jnt_convolve_x_sse4_1(
+void av1_highbd_dist_wtd_convolve_x_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -274,7 +276,7 @@ void av1_highbd_jnt_convolve_x_sse4_1(
__m128i s[4], coeffs_x[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m128i wt0 = _mm_set1_epi32(w0);
@@ -339,7 +341,7 @@ void av1_highbd_jnt_convolve_x_sse4_1(
const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i round_result = highbd_convolve_rounding_sse2(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -359,10 +361,12 @@ void av1_highbd_jnt_convolve_x_sse4_1(
const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
- const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo = highbd_convolve_rounding_sse2(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/highbd_warp_plane_sse4.c b/libaom/av1/common/x86/highbd_warp_plane_sse4.c
index 4bcab05..3765c5e 100644
--- a/libaom/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/libaom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -537,7 +537,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
__m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
__m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
_mm_mullo_epi32(res_lo, wt1));
res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
@@ -570,7 +570,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
(__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
__m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
_mm_mullo_epi32(res_hi, wt1));
res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
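The weighted blend guarded by use_dist_wtd_comp_avg above reduces to the following scalar form (illustrative sketch, not part of the patch); w0/w1 correspond to conv_params->fwd_offset/bck_offset and the shift to DIST_PRECISION_BITS:

#include <stdint.h>

/* Distance-weighted compound average: blend the stored prediction with the
 * current convolution result using integer weights, then renormalize, as in
 * the _mm_mullo_epi32 / _mm_add_epi32 / _mm_srai_epi32 sequence above. */
static int32_t dist_wtd_comp_avg(int32_t p0, int32_t p1, int32_t w0,
                                 int32_t w1, int dist_precision_bits) {
  return (p0 * w0 + p1 * w1) >> dist_precision_bits;
}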
diff --git a/libaom/av1/common/x86/jnt_convolve_avx2.c b/libaom/av1/common/x86/jnt_convolve_avx2.c
index 9f2e2b4..23cd6ab 100644
--- a/libaom/av1/common/x86/jnt_convolve_avx2.c
+++ b/libaom/av1/common/x86/jnt_convolve_avx2.c
@@ -35,22 +35,20 @@ static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
}
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
- int i, j;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_horiz;
+ int i, j, is_horiz_4tap = 0;
const int bits = FILTER_BITS - conv_params->round_1;
const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -58,18 +56,10 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int rounding_shift =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
- __m256i filt[4], coeffs[4];
assert(bits >= 0);
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
-
const __m256i round_const =
_mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
@@ -77,68 +67,136 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
(void)filter_params_y;
(void)subpel_y_q4;
- for (i = 0; i < h; i += 2) {
- const uint8_t *src_data = src_ptr + i * src_stride;
- CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
- for (j = 0; j < w; j += 8) {
- const __m256i data =
- load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+ __m256i filt[4], coeffs[4];
- __m256i res = convolve_lowbd_x(data, coeffs, filt);
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
- res = _mm256_slli_epi16(res, bits);
+  // Check whether the horizontal filter reduces to 4 taps (outer taps are zero)
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_horiz_4tap = 1;
- const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+  // 4-tap horizontal filter path
+ if (is_horiz_4tap) {
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ for (i = 0; i < h; i += 2) {
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+ for (j = 0; j < w; j += 8) {
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
- // Accumulate values into the destination buffer
- if (do_average) {
- const __m256i data_ref_0 =
- load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+ res = _mm256_slli_epi16(res, bits);
- const __m256i round_result = convolve_rounding(
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
- const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
- if (w > 4) {
- _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
- _mm_storel_epi64(
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
} else {
- *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
- *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
- _mm_cvtsi128_si32(res_1);
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
}
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ }
+ }
+ } else {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+ for (i = 0; i < h; i += 2) {
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+ for (j = 0; j < w; j += 8) {
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+
+ __m256i res = convolve_lowbd_x(data, coeffs, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+
+ res = _mm256_slli_epi16(res, bits);
+
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
- const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_1);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
}
}
}
}
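The is_horiz_4tap test above ORs the packed outer coefficient pairs (coeffs[0] and coeffs[3]) and checks the low 32 bits for zero; in scalar terms (illustrative only, not part of the patch, assuming coeffs[0] and coeffs[3] hold filter taps 0-1 and 6-7) it asks whether the 8-tap filter degenerates to its 4 central taps:

#include <stdint.h>

/* An 8-tap subpel filter whose outermost taps are all zero can be evaluated
 * with the cheaper 4-tap path taken above. */
static int filter_is_4tap(const int16_t taps[8]) {
  return (taps[0] | taps[1] | taps[6] | taps[7]) == 0;
}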
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ int i, j, is_vert_4tap = 0;
// +1 to compensate for dividing the filter coeffs by 2
const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
const __m256i round_const =
@@ -146,7 +204,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -168,195 +226,389 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
(void)filter_params_x;
(void)subpel_x_q4;
- for (j = 0; j < w; j += 16) {
- const uint8_t *data = &src_ptr[j];
- __m256i src6;
- // Load lines a and b. Line a to lower 128, line b to upper 128
- {
- __m256i src_ab[7];
- __m256i src_a[7];
- src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
- for (int kk = 0; kk < 6; ++kk) {
- data += src_stride;
- src_a[kk + 1] =
- _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
- src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+  // Check whether the vertical filter reduces to 4 taps (outer taps are zero)
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_vert_4tap = 1;
+
+ if (is_vert_4tap) {
+ const int fo_vert = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src4;
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[4];
+ __m256i src_a[5];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 4; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] =
+ _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src4 = src_a[4];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+
+ s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
}
- src6 = src_a[6];
- s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
- s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
- s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
- s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
- s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
- s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
- }
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[(i + 7) * src_stride + j];
- const __m256i src7 =
- _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
- const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[(i + 5) * src_stride + j];
+ const __m256i src5 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20);
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + src_stride)));
- const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
+ src4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20);
- s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
- s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
- __m256i res_lo = convolve_lowbd(s, coeffs);
+ __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
- res_lo = _mm256_add_epi16(res_lo, offset_const_1);
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
- const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
- const __m256i res_lo_0_shift =
- _mm256_slli_epi32(res_lo_0_32b, left_shift);
- const __m256i res_lo_0_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
- const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
- const __m256i res_lo_1_shift =
- _mm256_slli_epi32(res_lo_1_32b, left_shift);
- const __m256i res_lo_1_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
- const __m256i res_lo_round =
- _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
- const __m256i res_lo_unsigned =
- _mm256_add_epi16(res_lo_round, offset_const_2);
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
- if (w - j < 16) {
- if (do_average) {
- const __m256i data_ref_0 = load_line2_avx2(
- &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+ if (w - j < 16) {
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+ &wt, use_dist_wtd_comp_avg);
- const __m256i round_result = convolve_rounding(
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
- const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
- if (w - j > 4) {
- _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
- _mm_storel_epi64(
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
} else {
- *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
- _mm_cvtsi128_si32(res_0);
- *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
- _mm_cvtsi128_si32(res_1);
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
}
} else {
- const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
- const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_1);
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
+
+ const __m256i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi =
+ convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 =
+ _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+ res_hi_0);
+
+ const __m128i res_hi_1 =
+ _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+ res_hi_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[7];
+ __m256i src_a[7];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 6; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] =
+ _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
}
- } else {
- __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+ src6 = src_a[6];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+ s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+ s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+ s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+ }
- res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[(i + 7) * src_stride + j];
+ const __m256i src7 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
- const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
- const __m256i res_hi_0_shift =
- _mm256_slli_epi32(res_hi_0_32b, left_shift);
- const __m256i res_hi_0_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
- const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
- const __m256i res_hi_1_shift =
- _mm256_slli_epi32(res_hi_1_32b, left_shift);
- const __m256i res_hi_1_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
- const __m256i res_hi_round =
- _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+ __m256i res_lo = convolve_lowbd(s, coeffs);
- const __m256i res_hi_unsigned =
- _mm256_add_epi16(res_hi_round, offset_const_2);
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
- if (do_average) {
- const __m256i data_ref_0_lo = load_line2_avx2(
- &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
- const __m256i data_ref_0_hi =
- load_line2_avx2(&dst[i * dst_stride + j + 8],
- &dst[i * dst_stride + j + 8 + dst_stride]);
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
- const __m256i comp_avg_res_lo =
- comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
- const __m256i comp_avg_res_hi =
- comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
- const __m256i round_result_lo = convolve_rounding(
- &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ if (w - j < 16) {
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+ &wt, use_dist_wtd_comp_avg);
- const __m256i round_result_hi = convolve_rounding(
- &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
- const __m256i res_8 =
- _mm256_packus_epi16(round_result_lo, round_result_hi);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
- _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
- _mm_store_si128(
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
} else {
- const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+ __m256i res_hi = convolve_lowbd(s + 4, coeffs);
- const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_lo_1);
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
- const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0);
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
- const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1);
- _mm_store_si128(
- (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1);
+ const __m256i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi =
+ convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 =
+ _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+ res_hi_0);
+
+ const __m128i res_hi_1 =
+ _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+ res_hi_1);
+ }
}
- }
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
}
}
}
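
Note on the 4-tap fast path introduced above: assuming prepare_coeffs_lowbd packs the eight vertical taps pairwise as (f0,f1), (f2,f3), (f4,f5), (f6,f7) into coeffs[0..3] (the same pairing the wiener coeffs_h setup further down in this change uses), OR-ing coeffs[0] with coeffs[3] and extracting the low 32 bits simply asks whether both outer tap pairs are zero, i.e. whether the nominal 8-tap filter is really a 4-tap one. A minimal scalar sketch of that test, on a plain tap array rather than the packed registers:

#include <stdint.h>

// Sketch: an 8-tap subpel filter can take the 4-tap path when both outer
// tap pairs are zero; only taps 2..5 then contribute to the sum.
static int filter_is_4tap(const int16_t taps[8]) {
  return (taps[0] | taps[1] | taps[6] | taps[7]) == 0;
}

When the test passes, the kernel needs only one row of context above the block (fo_vert = 1 instead of taps / 2 - 1 = 3 for the 8-tap case) and evaluates convolve_lowbd_4tap on the inner coefficient pairs (coeffs + 1).
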
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
- int im_h = h + filter_params_y->taps - 1;
+
int im_stride = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ int i, is_horiz_4tap = 0, is_vert_4tap = 0;
const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -364,18 +616,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int rounding_shift =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
- __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
const __m256i round_const_h = _mm256_set1_epi16(
((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
@@ -385,9 +628,29 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
- for (j = 0; j < w; j += 8) {
- /* Horizontal filter */
- {
+ __m256i filt[4], coeffs_x[4], coeffs_y[4];
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ // Condition for checking valid horz_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
+ is_horiz_4tap = 1;
+
+ // Condition for checking valid vert_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
+ is_vert_4tap = 1;
+
+ if (is_horiz_4tap) {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
const uint8_t *src_h = src_ptr + j;
for (i = 0; i < im_h; i += 2) {
__m256i data =
@@ -396,49 +659,59 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
data = _mm256_inserti128_si256(
data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
src_h += (src_stride << 1);
- __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
round_shift_h);
_mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
}
+ DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
}
+ } else if (is_vert_4tap) {
+ int im_h = h + 3;
+ const int fo_vert = 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ const uint8_t *src_h = src_ptr + j;
+ DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
- /* Vertical filter */
- {
+ /* Vertical filter */
+ __m256i s[6];
__m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
__m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
__m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
__m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
- __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
- __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
s[0] = _mm256_unpacklo_epi16(s0, s1);
s[1] = _mm256_unpacklo_epi16(s2, s3);
- s[2] = _mm256_unpacklo_epi16(s4, s5);
- s[4] = _mm256_unpackhi_epi16(s0, s1);
- s[5] = _mm256_unpackhi_epi16(s2, s3);
- s[6] = _mm256_unpackhi_epi16(s4, s5);
+ s[3] = _mm256_unpackhi_epi16(s0, s1);
+ s[4] = _mm256_unpackhi_epi16(s2, s3);
for (i = 0; i < h; i += 2) {
const int16_t *data = &im_block[i * im_stride];
- const __m256i s6 =
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
- const __m256i s7 =
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
- s[3] = _mm256_unpacklo_epi16(s6, s7);
- s[7] = _mm256_unpackhi_epi16(s6, s7);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ s[5] = _mm256_unpackhi_epi16(s4, s5);
- const __m256i res_a = convolve(s, coeffs_y);
+ const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
const __m256i res_a_round = _mm256_sra_epi32(
_mm256_add_epi32(res_a, round_const_v), round_shift_v);
if (w - j > 4) {
- const __m256i res_b = convolve(s + 4, coeffs_y);
+ const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
const __m256i res_b_round = _mm256_sra_epi32(
_mm256_add_epi32(res_b, round_const_v), round_shift_v);
const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
@@ -448,8 +721,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i data_ref_0 =
load_line2_avx2(&dst[i * dst_stride + j],
&dst[i * dst_stride + j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+ &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -479,8 +752,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
load_line2_avx2(&dst[i * dst_stride + j],
&dst[i * dst_stride + j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+ &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -504,25 +777,36 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
res_1);
}
}
-
s[0] = s[1];
s[1] = s[2];
- s[2] = s[3];
-
+ s[3] = s[4];
s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
}
}
+ } else {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ const uint8_t *src_h = src_ptr + j;
+ DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+
+ DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+ }
}
}
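
For reference, the shape that av1_dist_wtd_convolve_2d_avx2 vectorises is an ordinary separable convolution: a horizontal pass into the im_block intermediate at round_0 precision, then a vertical pass at round_1 precision, after which the result is either stored to or blended with the compound buffer. A scalar sketch of that two-pass shape; the bit-depth offsets that keep the intermediates unsigned and the compound blend are omitted, and the buffer size is an assumption of the sketch:

#include <stdint.h>

// Sketch: the two-pass separable structure behind the AVX2 2-D kernel.
// fx/fy are the horizontal/vertical taps; round_0/round_1 mirror conv_params.
static void separable_convolve_2d(const uint8_t *src, int src_stride,
                                  int32_t *dst, int dst_stride, int w, int h,
                                  const int16_t *fx, const int16_t *fy,
                                  int taps, int round_0, int round_1) {
  int16_t im[(64 + 8) * 64];  // intermediate block; sketch assumes w, h <= 64
  const int im_h = h + taps - 1;
  const int fo = taps / 2 - 1;
  const uint8_t *s = src - fo * src_stride - fo;
  for (int i = 0; i < im_h; ++i)
    for (int j = 0; j < w; ++j) {
      int32_t sum = 0;
      for (int k = 0; k < taps; ++k) sum += fx[k] * s[i * src_stride + j + k];
      im[i * w + j] = (int16_t)((sum + (1 << (round_0 - 1))) >> round_0);
    }
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j) {
      int32_t sum = 0;
      for (int k = 0; k < taps; ++k) sum += fy[k] * im[(i + k) * w + j];
      dst[i * dst_stride + j] = (sum + (1 << (round_1 - 1))) >> round_1;
    }
}

The AVX2 version walks 8 columns at a time (im_stride = 8), keeps the vertical window in the s[] registers and rotates it two rows per iteration, and swaps in the 4-tap horizontal or vertical variant when the corresponding outer taps are zero, but the data flow is the same.
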
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_avx2(
+ const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -535,7 +819,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const __m256i wt = unpack_weights_avx2(conv_params);
const __m256i zero = _mm256_setzero_si256();
@@ -562,7 +846,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
_mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -600,7 +884,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
const __m256i data_ref_0 = load_line2_avx2(
&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
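
Every kernel in this file, and in the SSE2/SSSE3 variants below, finishes the same way: on the first reference the offset, still-unsigned intermediate is written straight to the CONV_BUF_TYPE buffer, and on the second pass (do_average) it is blended with the stored value by comp_avg() and mapped back to pixel range by convolve_rounding(). A scalar sketch of that blend, assuming the libaom convention that the two distance weights sum to 1 << DIST_PRECISION_BITS with DIST_PRECISION_BITS == 4:

#include <stdint.h>

// Sketch: scalar model of comp_avg() followed by convolve_rounding().
// `ref` is the value already in the compound buffer, `res` the new result;
// round_offset/rounding_shift correspond to offset_const/rounding_shift above.
static uint8_t dist_wtd_blend(int32_t ref, int32_t res, int fwd_offset,
                              int bck_offset, int use_dist_wtd_comp_avg,
                              int32_t round_offset, int rounding_shift) {
  const int32_t avg = use_dist_wtd_comp_avg
                          ? (ref * fwd_offset + res * bck_offset) >> 4
                          : (ref + res + 1) >> 1;  // plain rounded average
  const int32_t px =
      (avg - round_offset + ((1 << rounding_shift) >> 1)) >> rounding_shift;
  return (uint8_t)(px < 0 ? 0 : px > 255 ? 255 : px);
}

The renaming throughout this change is mechanical: use_jnt_comp_avg becomes use_dist_wtd_comp_avg and the av1_jnt_convolve_* entry points become av1_dist_wtd_convolve_*, with the arithmetic itself unchanged.
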
diff --git a/libaom/av1/common/x86/jnt_convolve_sse2.c b/libaom/av1/common/x86/jnt_convolve_sse2.c
index 7f5677b..641cd02 100644
--- a/libaom/av1/common/x86/jnt_convolve_sse2.c
+++ b/libaom/av1/common/x86/jnt_convolve_sse2.c
@@ -16,12 +16,12 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
const int dst_stride = conv_params->dst_stride;
@@ -37,7 +37,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i wt1 = _mm_set1_epi16(w1);
const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -77,7 +77,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -134,7 +134,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -150,12 +150,12 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
}
}
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
const int dst_stride = conv_params->dst_stride;
@@ -167,7 +167,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -225,7 +225,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -254,7 +254,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -331,7 +331,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -360,7 +360,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -384,12 +384,12 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
}
}
-void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
@@ -402,7 +402,7 @@ void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
const __m128i zero = _mm_setzero_si128();
@@ -594,7 +594,7 @@ void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/jnt_convolve_ssse3.c b/libaom/av1/common/x86/jnt_convolve_ssse3.c
index 8227727..9aeab29 100644
--- a/libaom/av1/common/x86/jnt_convolve_ssse3.c
+++ b/libaom/av1/common/x86/jnt_convolve_ssse3.c
@@ -16,12 +16,11 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_ssse3(
+ const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
@@ -34,7 +33,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
const __m128i zero = _mm_setzero_si128();
@@ -211,7 +210,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/warp_plane_sse4.c b/libaom/av1/common/x86/warp_plane_sse4.c
index b810cea..4532d17 100644
--- a/libaom/av1/common/x86/warp_plane_sse4.c
+++ b/libaom/av1/common/x86/warp_plane_sse4.c
@@ -577,7 +577,7 @@ static INLINE void store_vertical_filter_output(
__m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
const __m128i p_16 = _mm_loadl_epi64(p);
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
const __m128i shifted_32 =
@@ -610,7 +610,7 @@ static INLINE void store_vertical_filter_output(
(__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
const __m128i p4_16 = _mm_loadl_epi64(p4);
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
const __m128i shifted_32 =
diff --git a/libaom/av1/common/x86/wiener_convolve_avx2.c b/libaom/av1/common/x86/wiener_convolve_avx2.c
index 1f13e2f..87a6e12 100644
--- a/libaom/av1/common/x86/wiener_convolve_avx2.c
+++ b/libaom/av1/common/x86/wiener_convolve_avx2.c
@@ -17,7 +17,6 @@
#include "av1/common/convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
@@ -26,207 +25,236 @@
// on the left.
// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
-
-// Exploiting the range of wiener filter coefficients,
-// horizontal filtering can be done in 16 bit intermediate precision.
-// The details are as follows :
-// Consider the horizontal wiener filter coefficients of the following form :
-// [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0]
-// Subtracting 2^(FILTER_BITS) from the centre tap we get the following :
-// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0]
-// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3
-// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit
-// precision. Finally, after rounding the above result by round_0, we multiply
-// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the
-// horizontal filter output.
-
void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h,
const ConvolveParams *conv_params) {
+ const int bd = 8;
assert(x_step_q4 == 16 && y_step_q4 == 16);
assert(!(w & 7));
(void)x_step_q4;
(void)y_step_q4;
- DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
- int im_h = h + SUBPEL_TAPS - 2;
- int im_stride = 8;
- memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE);
- int i, j;
- const int center_tap = (SUBPEL_TAPS - 1) / 2;
+ DECLARE_ALIGNED(32, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 2;
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
- __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;
-
- assert(conv_params->round_0 > 0);
-
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
-
- filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);
-
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
- const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- coeffs_h[0] =
- _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
- // coeffs 2 3 2 3 2 3 2 3
- coeffs_h[1] =
- _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
- // coeffs 4 5 4 5 4 5 4 5
- coeffs_h[2] =
- _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
- // coeffs 6 7 6 7 6 7 6 7
- coeffs_h[3] =
- _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
-
- const __m256i round_const_h =
- _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
- const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i zero_128 = _mm_setzero_si128();
+ const __m256i zero_256 = _mm256_setzero_si256();
// Add an offset to account for the "add_src" part of the convolve function.
- const __m128i zero_128 = _mm_setzero_si128();
- const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
- const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
-
- const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
- // coeffs 2 3 2 3 2 3 2 3
- coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
- // coeffs 4 5 4 5 4 5 4 5
- coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
- // coeffs 6 7 6 7 6 7 6 7
- coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);
-
- const __m256i round_const_v =
- _mm256_set1_epi32((1 << (conv_params->round_1 - 1)));
- const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (j = 0; j < w; j += 8) {
- for (i = 0; i < im_h; i += 2) {
- __m256i data = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
-
- // Load the next line
- if (i + 1 < im_h)
- data = _mm256_inserti128_si256(
- data,
- _mm_loadu_si128(
- (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
- 1);
-
- __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
-
- res =
- _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
-
- __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);
-
- // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to
- // the result
- data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
- res = _mm256_add_epi16(res, data_0);
-
- _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+
+ const __m256i clamp_low = zero_256;
+ const __m256i clamp_high =
+ _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
+ /* Horizontal filter */
+ {
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const = _mm256_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (int i = 0; i < intermediate_height; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint8_t *data_ij = src_ptr + i * src_stride + j;
+
+ // Load 8-bit src data
+ const __m128i data_0 = xx_loadu_128(data_ij + 0);
+ const __m128i data_1 = xx_loadu_128(data_ij + 1);
+ const __m128i data_2 = xx_loadu_128(data_ij + 2);
+ const __m128i data_3 = xx_loadu_128(data_ij + 3);
+ const __m128i data_4 = xx_loadu_128(data_ij + 4);
+ const __m128i data_5 = xx_loadu_128(data_ij + 5);
+ const __m128i data_6 = xx_loadu_128(data_ij + 6);
+ const __m128i data_7 = xx_loadu_128(data_ij + 7);
+
+ // (Zero-)Extend 8-bit data to 16-bit data
+ const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
+ const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
+ const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
+ const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
+ const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
+ const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
+ const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
+ const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
+
+ // Multiply src data by filter coeffs and sum pairs
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ // Calculate scalar product for even- and odd-indices separately,
+ // increasing to 32-bit precision
+ const __m256i res_even_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+ const __m256i res_odd_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+
+ const __m256i res_even = _mm256_srai_epi32(
+ _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+ const __m256i res_odd = _mm256_srai_epi32(
+ _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+ // Reduce to 16-bit precision and pack even- and odd-index results
+ // back into one register. The _mm256_packs_epi32 intrinsic returns
+ // a register with the pixels ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+ const __m256i res_clamped =
+ _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
+
+ // Store in a temporary array
+ yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
+ }
}
+ }
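
The `offset` register used above realises the "add_src" part of the function name: assuming the filter_x/filter_y arrays arrive with the centre tap stored 2^FILTER_BITS short of unity gain (the residue form described in the comment block removed near the top of this file's diff), adding 1 << FILTER_BITS to coefficient 3 folds the "plus one copy of the source pixel" step into an ordinary unity-gain convolution. A scalar sketch of the identity, with FILTER_BITS = 7 as in libaom:

#include <stdint.h>

// Sketch: the "add_src" identity the `offset` register exploits.
// filter[0..6] are the 7 Wiener taps with the centre tap (filter[3]) stored
// without its 2^FILTER_BITS unity term; the caller guarantees x >= 3.
static int32_t wiener_add_src_scalar(const uint8_t *row, int x,
                                     const int16_t filter[8]) {
  int32_t sum = (int32_t)row[x] << 7;  // the "add one copy of the source" term
  for (int t = 0; t < 7; ++t) sum += filter[t] * row[x + t - 3];
  // Bumping filter[3] by (1 << FILTER_BITS) up front yields the same sum in a
  // single pass, which is exactly what adding `offset` to the coeffs does.
  return sum;
}

Folding the term into the tap is only comfortable here because the rewritten pass accumulates with _mm256_madd_epi16 into 32-bit lanes; the removed comment documents the 16-bit centre-pixel trick that this replaces.
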
- /* Vertical filter */
- {
- __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
- __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
- __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
- __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
- __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
- __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
-
- __m256i s[8];
- s[0] = _mm256_unpacklo_epi16(src_0, src_1);
- s[1] = _mm256_unpacklo_epi16(src_2, src_3);
- s[2] = _mm256_unpacklo_epi16(src_4, src_5);
-
- s[4] = _mm256_unpackhi_epi16(src_0, src_1);
- s[5] = _mm256_unpackhi_epi16(src_2, src_3);
- s[6] = _mm256_unpackhi_epi16(src_4, src_5);
-
- for (i = 0; i < h - 1; i += 2) {
- const int16_t *data = &im_block[i * im_stride];
-
- const __m256i s6 =
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
- const __m256i s7 =
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
-
- s[3] = _mm256_unpacklo_epi16(s6, s7);
- s[7] = _mm256_unpackhi_epi16(s6, s7);
-
- __m256i res_a = convolve(s, coeffs_v);
- __m256i res_b = convolve(s + 4, coeffs_v);
-
- const __m256i res_a_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_a, round_const_v), round_shift_v);
- const __m256i res_b_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_b, round_const_v), round_shift_v);
-
- /* rounding code */
- // 16 bit conversion
- const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
- // 8 bit conversion and saturation to uint8
- const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
-
- const __m128i res_0 = _mm256_castsi256_si128(res_8b);
- const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
-
- // Store values into the destination buffer
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
- __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
-
- _mm_storel_epi64(p_0, res_0);
- _mm_storel_epi64(p_1, res_1);
-
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
-
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
- }
- if (h - i) {
- s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
- s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
- s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);
-
- const int16_t *data = &im_block[i * im_stride];
- const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
- const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
-
- __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
- __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);
-
- s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
- __m256i convolveres = convolve(s, coeffs_v);
-
- const __m256i res_round = _mm256_sra_epi32(
- _mm256_add_epi32(convolveres, round_const_v), round_shift_v);
-
- /* rounding code */
- // 16 bit conversion
- __m128i reslo = _mm256_castsi256_si128(res_round);
- __m128i reshi = _mm256_extracti128_si256(res_round, 1);
- const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);
-
- // 8 bit conversion and saturation to uint8
- const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
- _mm_storel_epi64(p_0, res_8b);
+ /* Vertical filter */
+ {
+ // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
+ const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+ // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
+
+ // Load 16-bit data from the output of the horizontal filter in
+ // which the pixels are ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);
+ const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
+ const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
+ const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
+ const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
+ const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
+ const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
+ const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
+
+ // Filter the even-indices, increasing to 32-bit precision
+ const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
+ const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+ const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+ const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+ const __m256i res_even = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+ // Filter the odd-indices, increasing to 32-bit precision
+ const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+ const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+ const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+ const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ const __m256i res_odd = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+ // Pixels are currently in the following order:
+ // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+ // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
+ //
+ // Rearrange the pixels into the following order:
+ // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
+ // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+ const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+ const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ // Reduce to 16-bit precision and pack into the correct order:
+ // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+ const __m256i res_16bit =
+ _mm256_packs_epi32(res_lo_round, res_hi_round);
+
+ // Reduce to 8-bit precision. This messes up the order:
+ // [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
+ // [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
+ const __m256i res_8bit =
+ _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);
+
+ // Swap the two central 32-bit values to get the order:
+ // [ - - - - - - - - - - - - - - - - ]
+ // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
+ const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);
+
+ // Store the lower 128-bit lane in the dst array
+ xx_storeu_128(dst + i * dst_stride + j,
+ _mm256_castsi256_si128(res_8bit2));
}
}
}
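
Finally, the asymmetric-looking rounding constants in the two passes are a matched pair: the horizontal pass adds a bias of 1 << (bd + FILTER_BITS - 1) so the intermediate values stay non-negative (and, after clamping, fit the unsigned 16-bit temp[] array), and the vertical pass removes it through its own round_const. Assuming the Wiener restoration path configures round_0 + round_1 = 2 * FILTER_BITS and that the folded taps sum to 2^FILTER_BITS, the cancellation is exact:

\[
\frac{2^{\,bd+\mathrm{FILTER\_BITS}-1}}{2^{\,\mathrm{round}_0}}\cdot 2^{\,\mathrm{FILTER\_BITS}}
  \;=\; 2^{\,bd+2\,\mathrm{FILTER\_BITS}-1-\mathrm{round}_0}
  \;=\; 2^{\,bd+\mathrm{round}_1-1},
\]

so the \(-(1 \ll (bd + \mathrm{round}_1 - 1))\) term folded into the vertical round_const subtracts precisely the bias the vertical taps have accumulated, and after the final shift by round_1 only the true filtered pixel remains to be packed down to 8 bits.
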