libaom: Pull from upstream

Current HEAD: 978ab9e6cd19904cdd54b69a4c30b10c747eb55a git log from upstream: 978ab9e6c AV1 levels: add min frame width and height 62133bf3e AV1 levels: add max superres tile width dee839cea AV1 levels: add max tile rate 1ca3b2652 Correct ref frame buffer in scaled subpixel simple_motion_search ceb16a2e6 Introduce early exit for partition none 5bdd95475 Implement av1_get_seq_level_idx() d06d2d5d3 Refactor check_level_constraints() 352263271 Move some data from AV1LevelSpec to AV1LevelStats bfe92612d AV1 levels: add check for min compression ratio 87a8394ac FIRSTPASS_STATS: Add comments for struct members. a1cf38d09 Disable two pass partition search on lowres and midres ecf5a3c12 Level test: add testcase for target level index 19 37fa0e848 AV1 levels: add header, display and decode rate eff7d3079 Remove unused parameters in tpl experiment 13cccf2db Update border for ref buffer to allow scaled pred fa946afbf Temp fix for ctrl based resize setting 81a59f162 Add data structure to store frame info. 98bb9d649 GF length reduction: respect min_gf_interval. 42f22cce2 Speed feature for adaptive-tx-search 987055e30 Update level info when show_existing_frame is ture <...> Bug: 124137416 Test: video playback Change-Id: I710b863d81cc663c8e286732f32e9b56ab35a5a0
author: Ray Essick <essick@google.com> 2019-03-29 15:30:55 -0700
committer: Ray Essick <essick@google.com> 2019-05-03 21:28:43 +0000
commit: ec6586dd308c18c15b581e3579894b4204c834bc (patch)
tree: 97c131f6ce3576d63a07f047e11bc004ed19f117
parent: b2a64d5cd5a1ee0c01456cbeb86c45a72eca9618 (diff)
download: platform_external_libaom-ec6586dd308c18c15b581e3579894b4204c834bc.tar.gz
platform_external_libaom-ec6586dd308c18c15b581e3579894b4204c834bc.tar.bz2
platform_external_libaom-ec6586dd308c18c15b581e3579894b4204c834bc.zip
289 files changed, 37118 insertions, 16681 deletions
diff --git a/Android.bp b/Android.bp
index f375775..c722d3a 100644
--- a/Android.bp
+++ b/Android.bp
@@ -122,7 +122,6 @@ aom_av1_decoder_sources = [
 aom_av1_encoder_asm_sse2 = [
     "libaom/av1/encoder/x86/dct_sse2.asm",
     "libaom/av1/encoder/x86/error_sse2.asm",
-    "libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm",
 ]
 
 aom_av1_encoder_asm_ssse3_x86_64 = [
@@ -132,8 +131,11 @@ aom_av1_encoder_asm_ssse3_x86_64 = [
 aom_av1_encoder_intrin_avx2 = [
     "libaom/av1/encoder/x86/av1_quantize_avx2.c",
     "libaom/av1/encoder/x86/av1_highbd_quantize_avx2.c",
+    "libaom/av1/encoder/x86/corner_match_avx2.c",
     "libaom/av1/encoder/x86/error_intrin_avx2.c",
+    "libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c",
     "libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c",
+    "libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c",
     "libaom/av1/encoder/x86/wedge_utils_avx2.c",
     "libaom/av1/encoder/x86/encodetxb_avx2.c",
     "libaom/av1/encoder/x86/rdopt_avx2.c",
@@ -170,6 +172,8 @@ aom_av1_encoder_intrin_sse4_1 = [
     "libaom/av1/encoder/x86/encodetxb_sse4.c",
     "libaom/av1/encoder/x86/highbd_fwd_txfm_sse4.c",
     "libaom/av1/encoder/x86/rdopt_sse4.c",
+    "libaom/av1/encoder/x86/temporal_filter_sse4.c",
+    "libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c",
     "libaom/av1/encoder/x86/pickrst_sse4.c",
 ]
 
@@ -194,20 +198,25 @@ aom_av1_encoder_sources = [
     "libaom/av1/encoder/encodeframe.c",
     "libaom/av1/encoder/encodemb.c",
     "libaom/av1/encoder/encodemv.c",
+    "libaom/av1/encoder/encode_strategy.c",
     "libaom/av1/encoder/encoder.c",
     "libaom/av1/encoder/encodetxb.c",
     "libaom/av1/encoder/ethread.c",
     "libaom/av1/encoder/extend.c",
     "libaom/av1/encoder/firstpass.c",
     "libaom/av1/encoder/global_motion.c",
+    "libaom/av1/encoder/gop_structure.c",
     "libaom/av1/encoder/hash.c",
     "libaom/av1/encoder/hash_motion.c",
     "libaom/av1/encoder/hybrid_fwd_txfm.c",
+    "libaom/av1/encoder/level.c",
     "libaom/av1/encoder/lookahead.c",
     "libaom/av1/encoder/mbgraph.c",
     "libaom/av1/encoder/mcomp.c",
     "libaom/av1/encoder/ml.c",
     "libaom/av1/encoder/palette.c",
+    "libaom/av1/encoder/partition_strategy.c",
+    "libaom/av1/encoder/pass2_strategy.c",
     "libaom/av1/encoder/pickcdef.c",
     "libaom/av1/encoder/picklpf.c",
     "libaom/av1/encoder/pickrst.c",
@@ -220,7 +229,9 @@ aom_av1_encoder_sources = [
     "libaom/av1/encoder/speed_features.c",
     "libaom/av1/encoder/temporal_filter.c",
     "libaom/av1/encoder/tokenize.c",
+    "libaom/av1/encoder/tpl_model.c",
     "libaom/av1/encoder/wedge_utils.c",
+    "libaom/av1/encoder/var_based_part.c",
     "libaom/third_party/fastfeat/fast.c",
     "libaom/third_party/fastfeat/fast_9.c",
     "libaom/third_party/fastfeat/nonmax.c",
diff --git a/config/arm/config/aom_config.asm b/config/arm/config/aom_config.asm
index b8fcd42..50338c1 100644
--- a/config/arm/config/aom_config.asm
+++ b/config/arm/config/aom_config.asm
@@ -13,7 +13,8 @@ ARCH_MIPS equ 0
 ARCH_PPC equ 0
 ARCH_X86 equ 0
 ARCH_X86_64 equ 0
-CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_2PASS_PARTITION_SEARCH_LVL_END equ 3
+CONFIG_2PASS_PARTITION_SEARCH_LVL_START equ 1
 CONFIG_ACCOUNTING equ 0
 CONFIG_ANALYZER equ 0
 CONFIG_AV1_DECODER equ 1
@@ -21,7 +22,8 @@ CONFIG_AV1_ENCODER equ 0
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
+CONFIG_COLLECT_COMPONENT_TIMING equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
 CONFIG_DENOISE equ 1
@@ -29,11 +31,8 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
-CONFIG_FP_MB_STATS equ 0
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
-CONFIG_GLOBAL_MOTION_SEARCH equ 1
 CONFIG_GPROF equ 0
 CONFIG_INSPECTION equ 0
 CONFIG_INTERNAL_STATS equ 0
@@ -44,16 +43,15 @@ CONFIG_MAX_DECODE_PROFILE equ 0
 CONFIG_MISMATCH_DEBUG equ 0
 CONFIG_MULTITHREAD equ 1
 CONFIG_NORMAL_TILE_MODE equ 1
-CONFIG_ONE_PASS_SVM equ 0
 CONFIG_OS_SUPPORT equ 1
 CONFIG_PIC equ 0
 CONFIG_RD_DEBUG equ 0
-CONFIG_REDUCED_ENCODER_BORDER equ 0
 CONFIG_RUNTIME_CPU_DETECT equ 0
 CONFIG_SHARED equ 0
 CONFIG_SHARP_SETTINGS equ 0
 CONFIG_SIZE_LIMIT equ 1
 CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_SPEED_STATS equ 0
 CONFIG_STATIC equ 1
 CONFIG_WEBM_IO equ 1
 DECODE_HEIGHT_LIMIT equ 16384
diff --git a/config/arm/config/aom_config.h b/config/arm/config/aom_config.h
index 5418985..a3b86df 100644
--- a/config/arm/config/aom_config.h
+++ b/config/arm/config/aom_config.h
@@ -15,7 +15,8 @@
 #define ARCH_PPC 0
 #define ARCH_X86 0
 #define ARCH_X86_64 0
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 #define CONFIG_ACCOUNTING 0
 #define CONFIG_ANALYZER 0
 #define CONFIG_AV1_DECODER 1
@@ -23,7 +24,8 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -31,11 +33,8 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
-#define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
-#define CONFIG_GLOBAL_MOTION_SEARCH 1
 #define CONFIG_GPROF 0
 #define CONFIG_INSPECTION 0
 #define CONFIG_INTERNAL_STATS 0
@@ -46,16 +45,15 @@
 #define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_MULTITHREAD 1
 #define CONFIG_NORMAL_TILE_MODE 1
-#define CONFIG_ONE_PASS_SVM 0
 #define CONFIG_OS_SUPPORT 1
 #define CONFIG_PIC 0
 #define CONFIG_RD_DEBUG 0
-#define CONFIG_REDUCED_ENCODER_BORDER 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
 #define CONFIG_SHARED 0
 #define CONFIG_SHARP_SETTINGS 0
 #define CONFIG_SIZE_LIMIT 1
 #define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_SPEED_STATS 0
 #define CONFIG_STATIC 1
 #define CONFIG_WEBM_IO 1
 #define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/arm/config/aom_dsp_rtcd.h b/config/arm/config/aom_dsp_rtcd.h
index e3150f7..0b1a28a 100644
--- a/config/arm/config/aom_dsp_rtcd.h
+++ b/config/arm/config/aom_dsp_rtcd.h
@@ -1400,10 +1400,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define aom_v_predictor_8x8 aom_v_predictor_8x8_neon
 
-void av1_round_shift_array_c(int32_t *arr, int size, int bit);
-void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
-#define av1_round_shift_array av1_round_shift_array_neon
-
 void aom_dsp_rtcd(void);
 
 #include "config/aom_config.h"
diff --git a/config/arm/config/aom_scale_rtcd.h b/config/arm/config/aom_scale_rtcd.h
index 7260bd3..067ddb4 100644
--- a/config/arm/config/aom_scale_rtcd.h
+++ b/config/arm/config/aom_scale_rtcd.h
@@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes);
+#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
+
 void aom_scale_rtcd(void);
 
 #include "config/aom_config.h"
diff --git a/config/arm/config/av1_rtcd.h b/config/arm/config/av1_rtcd.h
index c58e511..6f42666 100644
--- a/config/arm/config/av1_rtcd.h
+++ b/config/arm/config/av1_rtcd.h
@@ -89,6 +89,22 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_convolve_y_sr av1_convolve_y_sr_neon
 
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_neon
+
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_neon
+
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_neon
+
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_neon
+
 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
 
@@ -140,6 +156,18 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *d
 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c
 
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
+
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
+
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
+
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
+
 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
 
@@ -152,27 +180,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
 void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
 
-void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c
-
-void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c
-
 void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
 
-void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c
-
-void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c
-
-void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c
-
-void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c
-
 void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
 
@@ -182,12 +192,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int
 void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
 
-void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c
-
-void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c
-
 void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
 
@@ -200,18 +204,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
 
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c
-
-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c
-
-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c
-
-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c
-
 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_highbd_warp_affine av1_highbd_warp_affine_c
 
@@ -279,21 +271,9 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con
 void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_inv_txfm_add av1_inv_txfm_add_neon
 
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d av1_jnt_convolve_2d_neon
-
-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_neon
-
-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_x av1_jnt_convolve_x_neon
-
-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_y av1_jnt_convolve_y_neon
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
+#define av1_round_shift_array av1_round_shift_array_neon
 
 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
diff --git a/config/arm64/config/aom_config.asm b/config/arm64/config/aom_config.asm
index b8fcd42..50338c1 100644
--- a/config/arm64/config/aom_config.asm
+++ b/config/arm64/config/aom_config.asm
@@ -13,7 +13,8 @@ ARCH_MIPS equ 0
 ARCH_PPC equ 0
 ARCH_X86 equ 0
 ARCH_X86_64 equ 0
-CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_2PASS_PARTITION_SEARCH_LVL_END equ 3
+CONFIG_2PASS_PARTITION_SEARCH_LVL_START equ 1
 CONFIG_ACCOUNTING equ 0
 CONFIG_ANALYZER equ 0
 CONFIG_AV1_DECODER equ 1
@@ -21,7 +22,8 @@ CONFIG_AV1_ENCODER equ 0
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
+CONFIG_COLLECT_COMPONENT_TIMING equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
 CONFIG_DENOISE equ 1
@@ -29,11 +31,8 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
-CONFIG_FP_MB_STATS equ 0
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
-CONFIG_GLOBAL_MOTION_SEARCH equ 1
 CONFIG_GPROF equ 0
 CONFIG_INSPECTION equ 0
 CONFIG_INTERNAL_STATS equ 0
@@ -44,16 +43,15 @@ CONFIG_MAX_DECODE_PROFILE equ 0
 CONFIG_MISMATCH_DEBUG equ 0
 CONFIG_MULTITHREAD equ 1
 CONFIG_NORMAL_TILE_MODE equ 1
-CONFIG_ONE_PASS_SVM equ 0
 CONFIG_OS_SUPPORT equ 1
 CONFIG_PIC equ 0
 CONFIG_RD_DEBUG equ 0
-CONFIG_REDUCED_ENCODER_BORDER equ 0
 CONFIG_RUNTIME_CPU_DETECT equ 0
 CONFIG_SHARED equ 0
 CONFIG_SHARP_SETTINGS equ 0
 CONFIG_SIZE_LIMIT equ 1
 CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_SPEED_STATS equ 0
 CONFIG_STATIC equ 1
 CONFIG_WEBM_IO equ 1
 DECODE_HEIGHT_LIMIT equ 16384
diff --git a/config/arm64/config/aom_config.h b/config/arm64/config/aom_config.h
index 5418985..a3b86df 100644
--- a/config/arm64/config/aom_config.h
+++ b/config/arm64/config/aom_config.h
@@ -15,7 +15,8 @@
 #define ARCH_PPC 0
 #define ARCH_X86 0
 #define ARCH_X86_64 0
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 #define CONFIG_ACCOUNTING 0
 #define CONFIG_ANALYZER 0
 #define CONFIG_AV1_DECODER 1
@@ -23,7 +24,8 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -31,11 +33,8 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
-#define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
-#define CONFIG_GLOBAL_MOTION_SEARCH 1
 #define CONFIG_GPROF 0
 #define CONFIG_INSPECTION 0
 #define CONFIG_INTERNAL_STATS 0
@@ -46,16 +45,15 @@
 #define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_MULTITHREAD 1
 #define CONFIG_NORMAL_TILE_MODE 1
-#define CONFIG_ONE_PASS_SVM 0
 #define CONFIG_OS_SUPPORT 1
 #define CONFIG_PIC 0
 #define CONFIG_RD_DEBUG 0
-#define CONFIG_REDUCED_ENCODER_BORDER 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
 #define CONFIG_SHARED 0
 #define CONFIG_SHARP_SETTINGS 0
 #define CONFIG_SIZE_LIMIT 1
 #define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_SPEED_STATS 0
 #define CONFIG_STATIC 1
 #define CONFIG_WEBM_IO 1
 #define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/arm64/config/aom_dsp_rtcd.h b/config/arm64/config/aom_dsp_rtcd.h
index e3150f7..0b1a28a 100644
--- a/config/arm64/config/aom_dsp_rtcd.h
+++ b/config/arm64/config/aom_dsp_rtcd.h
@@ -1400,10 +1400,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define aom_v_predictor_8x8 aom_v_predictor_8x8_neon
 
-void av1_round_shift_array_c(int32_t *arr, int size, int bit);
-void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
-#define av1_round_shift_array av1_round_shift_array_neon
-
 void aom_dsp_rtcd(void);
 
 #include "config/aom_config.h"
diff --git a/config/arm64/config/aom_scale_rtcd.h b/config/arm64/config/aom_scale_rtcd.h
index 7260bd3..067ddb4 100644
--- a/config/arm64/config/aom_scale_rtcd.h
+++ b/config/arm64/config/aom_scale_rtcd.h
@@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes);
+#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
+
 void aom_scale_rtcd(void);
 
 #include "config/aom_config.h"
diff --git a/config/arm64/config/av1_rtcd.h b/config/arm64/config/av1_rtcd.h
index c58e511..6f42666 100644
--- a/config/arm64/config/av1_rtcd.h
+++ b/config/arm64/config/av1_rtcd.h
@@ -89,6 +89,22 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_convolve_y_sr av1_convolve_y_sr_neon
 
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_neon
+
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_neon
+
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_neon
+
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_neon
+
 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
 
@@ -140,6 +156,18 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *d
 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c
 
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
+
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
+
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
+
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
+
 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
 
@@ -152,27 +180,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
 void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
 
-void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c
-
-void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c
-
 void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
 
-void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c
-
-void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c
-
-void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c
-
-void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c
-
 void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
 
@@ -182,12 +192,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int
 void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
 
-void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c
-
-void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c
-
 void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
 
@@ -200,18 +204,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
 
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c
-
-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c
-
-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c
-
-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c
-
 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_highbd_warp_affine av1_highbd_warp_affine_c
 
@@ -279,21 +271,9 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con
 void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_inv_txfm_add av1_inv_txfm_add_neon
 
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d av1_jnt_convolve_2d_neon
-
-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_neon
-
-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_x av1_jnt_convolve_x_neon
-
-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_y av1_jnt_convolve_y_neon
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
+#define av1_round_shift_array av1_round_shift_array_neon
 
 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
diff --git a/config/x86/config/aom_config.asm b/config/x86/config/aom_config.asm
index 4360c87..222e3bf 100644
--- a/config/x86/config/aom_config.asm
+++ b/config/x86/config/aom_config.asm
@@ -3,7 +3,7 @@
 %define ARCH_PPC 0
 %define ARCH_X86 1
 %define ARCH_X86_64 0
-%define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+%define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 %define CONFIG_ACCOUNTING 0
 %define CONFIG_ANALYZER 0
 %define CONFIG_AV1_DECODER 1
@@ -11,7 +11,8 @@
 %define CONFIG_BIG_ENDIAN 0
 %define CONFIG_BITSTREAM_DEBUG 0
 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-%define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+%define CONFIG_COLLECT_COMPONENT_TIMING 0
+%define CONFIG_COLLECT_PARTITION_STATS 0
 %define CONFIG_COLLECT_RD_STATS 0
 %define CONFIG_DEBUG 0
 %define CONFIG_DENOISE 1
@@ -19,11 +20,8 @@
 %define CONFIG_DIST_8X8 0
 %define CONFIG_ENTROPY_STATS 0
 %define CONFIG_FILEOPTIONS 1
-%define CONFIG_FIX_GF_LENGTH 1
-%define CONFIG_FP_MB_STATS 0
 %define CONFIG_GCC 1
 %define CONFIG_GCOV 0
-%define CONFIG_GLOBAL_MOTION_SEARCH 1
 %define CONFIG_GPROF 0
 %define CONFIG_INSPECTION 0
 %define CONFIG_INTERNAL_STATS 0
@@ -34,16 +32,15 @@
 %define CONFIG_MISMATCH_DEBUG 0
 %define CONFIG_MULTITHREAD 1
 %define CONFIG_NORMAL_TILE_MODE 1
-%define CONFIG_ONE_PASS_SVM 0
 %define CONFIG_OS_SUPPORT 1
 %define CONFIG_PIC 1
 %define CONFIG_RD_DEBUG 0
-%define CONFIG_REDUCED_ENCODER_BORDER 0
 %define CONFIG_RUNTIME_CPU_DETECT 0
 %define CONFIG_SHARED 0
 %define CONFIG_SHARP_SETTINGS 0
 %define CONFIG_SIZE_LIMIT 1
 %define CONFIG_SPATIAL_RESAMPLING 1
+%define CONFIG_SPEED_STATS 0
 %define CONFIG_STATIC 1
 %define CONFIG_WEBM_IO 1
 %define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/x86/config/aom_config.h b/config/x86/config/aom_config.h
index e162899..db2edbd 100644
--- a/config/x86/config/aom_config.h
+++ b/config/x86/config/aom_config.h
@@ -15,7 +15,8 @@
 #define ARCH_PPC 0
 #define ARCH_X86 1
 #define ARCH_X86_64 0
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 #define CONFIG_ACCOUNTING 0
 #define CONFIG_ANALYZER 0
 #define CONFIG_AV1_DECODER 1
@@ -23,7 +24,8 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -31,11 +33,8 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
-#define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
-#define CONFIG_GLOBAL_MOTION_SEARCH 1
 #define CONFIG_GPROF 0
 #define CONFIG_INSPECTION 0
 #define CONFIG_INTERNAL_STATS 0
@@ -46,16 +45,15 @@
 #define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_MULTITHREAD 1
 #define CONFIG_NORMAL_TILE_MODE 1
-#define CONFIG_ONE_PASS_SVM 0
 #define CONFIG_OS_SUPPORT 1
 #define CONFIG_PIC 1
 #define CONFIG_RD_DEBUG 0
-#define CONFIG_REDUCED_ENCODER_BORDER 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
 #define CONFIG_SHARED 0
 #define CONFIG_SHARP_SETTINGS 0
 #define CONFIG_SIZE_LIMIT 1
 #define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_SPEED_STATS 0
 #define CONFIG_STATIC 1
 #define CONFIG_WEBM_IO 1
 #define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/x86/config/aom_dsp_rtcd.h b/config/x86/config/aom_dsp_rtcd.h
index 8f11e0b..f84f313 100644
--- a/config/x86/config/aom_dsp_rtcd.h
+++ b/config/x86/config/aom_dsp_rtcd.h
@@ -1650,9 +1650,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2
 
-void av1_round_shift_array_c(int32_t *arr, int size, int bit);
-#define av1_round_shift_array av1_round_shift_array_c
-
 void aom_dsp_rtcd(void);
 
 #ifdef RTCD_C
diff --git a/config/x86/config/aom_scale_rtcd.h b/config/x86/config/aom_scale_rtcd.h
index b6e8149..65c184b 100644
--- a/config/x86/config/aom_scale_rtcd.h
+++ b/config/x86/config/aom_scale_rtcd.h
@@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes);
+#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
+
 void aom_scale_rtcd(void);
 
 #ifdef RTCD_C
diff --git a/config/x86/config/av1_rtcd.h b/config/x86/config/av1_rtcd.h
index c5d7794..f788933 100644
--- a/config/x86/config/av1_rtcd.h
+++ b/config/x86/config/av1_rtcd.h
@@ -88,6 +88,23 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
 void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_convolve_y_sr av1_convolve_y_sr_sse2
 
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_ssse3
+
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_sse2
+
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_sse2
+
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_sse2
+
 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
 
@@ -143,6 +160,18 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *d
 void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_ssse3
 
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
+
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
+
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
+
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
+
 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
 
@@ -155,27 +184,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
 void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
 
-void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c
-
-void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c
-
 void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
 
-void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c
-
-void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c
-
-void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c
-
-void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c
-
 void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
 
@@ -185,12 +196,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int
 void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
 
-void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c
-
-void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c
-
 void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
 
@@ -203,18 +208,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
 
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c
-
-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c
-
-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c
-
-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c
-
 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_highbd_warp_affine av1_highbd_warp_affine_c
 
@@ -283,22 +276,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con
 void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_inv_txfm_add av1_inv_txfm_add_ssse3
 
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d av1_jnt_convolve_2d_ssse3
-
-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_sse2
-
-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_x av1_jnt_convolve_x_sse2
-
-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_y av1_jnt_convolve_y_sse2
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+#define av1_round_shift_array av1_round_shift_array_c
 
 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
diff --git a/config/x86_64/config/aom_config.asm b/config/x86_64/config/aom_config.asm
index 986dc75..43e7f74 100644
--- a/config/x86_64/config/aom_config.asm
+++ b/config/x86_64/config/aom_config.asm
@@ -3,7 +3,7 @@
 %define ARCH_PPC 0
 %define ARCH_X86 0
 %define ARCH_X86_64 1
-%define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+%define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 %define CONFIG_ACCOUNTING 0
 %define CONFIG_ANALYZER 0
 %define CONFIG_AV1_DECODER 1
@@ -11,7 +11,8 @@
 %define CONFIG_BIG_ENDIAN 0
 %define CONFIG_BITSTREAM_DEBUG 0
 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-%define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+%define CONFIG_COLLECT_COMPONENT_TIMING 0
+%define CONFIG_COLLECT_PARTITION_STATS 0
 %define CONFIG_COLLECT_RD_STATS 0
 %define CONFIG_DEBUG 0
 %define CONFIG_DENOISE 1
@@ -19,11 +20,8 @@
 %define CONFIG_DIST_8X8 0
 %define CONFIG_ENTROPY_STATS 0
 %define CONFIG_FILEOPTIONS 1
-%define CONFIG_FIX_GF_LENGTH 1
-%define CONFIG_FP_MB_STATS 0
 %define CONFIG_GCC 1
 %define CONFIG_GCOV 0
-%define CONFIG_GLOBAL_MOTION_SEARCH 1
 %define CONFIG_GPROF 0
 %define CONFIG_INSPECTION 0
 %define CONFIG_INTERNAL_STATS 0
@@ -34,16 +32,15 @@
 %define CONFIG_MISMATCH_DEBUG 0
 %define CONFIG_MULTITHREAD 1
 %define CONFIG_NORMAL_TILE_MODE 1
-%define CONFIG_ONE_PASS_SVM 0
 %define CONFIG_OS_SUPPORT 1
 %define CONFIG_PIC 0
 %define CONFIG_RD_DEBUG 0
-%define CONFIG_REDUCED_ENCODER_BORDER 0
 %define CONFIG_RUNTIME_CPU_DETECT 0
 %define CONFIG_SHARED 0
 %define CONFIG_SHARP_SETTINGS 0
 %define CONFIG_SIZE_LIMIT 1
 %define CONFIG_SPATIAL_RESAMPLING 1
+%define CONFIG_SPEED_STATS 0
 %define CONFIG_STATIC 1
 %define CONFIG_WEBM_IO 1
 %define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/x86_64/config/aom_config.h b/config/x86_64/config/aom_config.h
index 0f32913..610e8ca 100644
--- a/config/x86_64/config/aom_config.h
+++ b/config/x86_64/config/aom_config.h
@@ -15,7 +15,8 @@
 #define ARCH_PPC 0
 #define ARCH_X86 0
 #define ARCH_X86_64 1
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
 #define CONFIG_ACCOUNTING 0
 #define CONFIG_ANALYZER 0
 #define CONFIG_AV1_DECODER 1
@@ -23,7 +24,8 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -31,11 +33,8 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
-#define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
-#define CONFIG_GLOBAL_MOTION_SEARCH 1
 #define CONFIG_GPROF 0
 #define CONFIG_INSPECTION 0
 #define CONFIG_INTERNAL_STATS 0
@@ -46,16 +45,15 @@
 #define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_MULTITHREAD 1
 #define CONFIG_NORMAL_TILE_MODE 1
-#define CONFIG_ONE_PASS_SVM 0
 #define CONFIG_OS_SUPPORT 1
 #define CONFIG_PIC 0
 #define CONFIG_RD_DEBUG 0
-#define CONFIG_REDUCED_ENCODER_BORDER 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
 #define CONFIG_SHARED 0
 #define CONFIG_SHARP_SETTINGS 0
 #define CONFIG_SIZE_LIMIT 1
 #define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_SPEED_STATS 0
 #define CONFIG_STATIC 1
 #define CONFIG_WEBM_IO 1
 #define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/x86_64/config/aom_dsp_rtcd.h b/config/x86_64/config/aom_dsp_rtcd.h
index 8f11e0b..f84f313 100644
--- a/config/x86_64/config/aom_dsp_rtcd.h
+++ b/config/x86_64/config/aom_dsp_rtcd.h
@@ -1650,9 +1650,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2
 
-void av1_round_shift_array_c(int32_t *arr, int size, int bit);
-#define av1_round_shift_array av1_round_shift_array_c
-
 void aom_dsp_rtcd(void);
 
 #ifdef RTCD_C
diff --git a/config/x86_64/config/aom_scale_rtcd.h b/config/x86_64/config/aom_scale_rtcd.h
index b6e8149..65c184b 100644
--- a/config/x86_64/config/aom_scale_rtcd.h
+++ b/config/x86_64/config/aom_scale_rtcd.h
@@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes);
+#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
+
 void aom_scale_rtcd(void);
 
 #ifdef RTCD_C
diff --git a/config/x86_64/config/av1_rtcd.h b/config/x86_64/config/av1_rtcd.h
index 043595d..84673ba 100644
--- a/config/x86_64/config/av1_rtcd.h
+++ b/config/x86_64/config/av1_rtcd.h
@@ -88,6 +88,23 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
 void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_convolve_y_sr av1_convolve_y_sr_sse2
 
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_ssse3
+
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_sse2
+
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_sse2
+
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_sse2
+
 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
 
@@ -146,6 +163,18 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *d
 void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_ssse3
 
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
+
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
+
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
+
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
+
 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
 
@@ -158,27 +187,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
 void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
 
-void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c
-
-void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c
-
 void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
 
-void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c
-
-void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c
-
-void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c
-
-void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c
-
 void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
 
@@ -188,12 +199,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int
 void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
 
-void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c
-
-void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c
-
 void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
 
@@ -206,18 +211,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
 
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c
-
-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c
-
-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c
-
-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c
-
 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_highbd_warp_affine av1_highbd_warp_affine_c
 
@@ -286,22 +279,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con
 void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_inv_txfm_add av1_inv_txfm_add_ssse3
 
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d av1_jnt_convolve_2d_ssse3
-
-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_sse2
-
-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_x av1_jnt_convolve_x_sse2
-
-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_y av1_jnt_convolve_y_sse2
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+#define av1_round_shift_array av1_round_shift_array_c
 
 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
diff --git a/libaom/CMakeLists.txt b/libaom/CMakeLists.txt
index f409892..2c35a0f 100644
--- a/libaom/CMakeLists.txt
+++ b/libaom/CMakeLists.txt
@@ -293,8 +293,11 @@ if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
 
     if(EMSCRIPTEN)
       add_preproc_definition(_POSIX_SOURCE)
-      append_link_flag_to_target("inspect" "-s TOTAL_MEMORY=402653184")
+      append_link_flag_to_target("inspect" "--emrun")
+      append_link_flag_to_target("inspect" "-s USE_PTHREADS=0")
+      append_link_flag_to_target("inspect" "-s WASM=1")
       append_link_flag_to_target("inspect" "-s MODULARIZE=1")
+      append_link_flag_to_target("inspect" "-s ALLOW_MEMORY_GROWTH=1")
       append_link_flag_to_target(
         "inspect" "-s \'EXTRA_EXPORTED_RUNTIME_METHODS=[\"UTF8ToString\"]\'")
       append_link_flag_to_target("inspect"
diff --git a/libaom/PATENTS b/libaom/PATENTS
index be491f5..493f616 100644
--- a/libaom/PATENTS
+++ b/libaom/PATENTS
@@ -57,10 +57,10 @@ Alliance for Open Media Patent License 1.0
 
 2. Definitions.
 
-2.1. Affiliate.  �Affiliate� means an entity that directly or indirectly
+2.1. Affiliate.  "Affiliate" means an entity that directly or indirectly
      Controls, is Controlled by, or is under common Control of that party.
 
-2.2. Control. �Control� means direct or indirect control of more than 50% of
+2.2. Control. "Control" means direct or indirect control of more than 50% of
      the voting power to elect directors of that corporation, or for any other
      entity, the power to direct management of such entity.
 
@@ -70,7 +70,7 @@ Alliance for Open Media Patent License 1.0
 2.4. Encoder.  "Encoder" means any encoder that produces a bitstream that can
      be decoded by a Decoder only to the extent it produces such a bitstream.
 
-2.5. Final Deliverable.  �Final Deliverable� means the final version of a
+2.5. Final Deliverable.  "Final Deliverable" means the final version of a
      deliverable approved by the Alliance for Open Media as a Final
      Deliverable.
 
@@ -79,9 +79,9 @@ Alliance for Open Media Patent License 1.0
      Implementation also includes components of an Implementation only to the
      extent they are used as part of an Implementation.
 
-2.7. License. �License� means this license.
+2.7. License. "License" means this license.
 
-2.8. Licensee. �Licensee� means any person or entity who exercises patent
+2.8. Licensee. "Licensee" means any person or entity who exercises patent
      rights granted under this License.
 
 2.9. Licensor.  "Licensor" means (i) any Licensee that makes, sells, offers
@@ -98,11 +98,11 @@ Alliance for Open Media Patent License 1.0
       as if the Specification was a W3C Recommendation; or (ii) are infringed
       by the Reference Implementation.
 
-2.11. Reference Implementation. �Reference Implementation� means an Encoder
+2.11. Reference Implementation. "Reference Implementation" means an Encoder
       and/or Decoder released by the Alliance for Open Media as a Final
       Deliverable.
 
-2.12. Specification. �Specification� means the specification designated by
+2.12. Specification. "Specification" means the specification designated by
       the Alliance for Open Media as a Final Deliverable for which this
       License was issued.
 
diff --git a/libaom/aom/aom_encoder.h b/libaom/aom/aom_encoder.h
index 777236f..f8a7cec 100644
--- a/libaom/aom/aom_encoder.h
+++ b/libaom/aom/aom_encoder.h
@@ -406,8 +406,7 @@ typedef struct aom_codec_enc_cfg {
    * upscaling after the encode/decode process. Taking control of upscaling and
    * using restoration filters should allow it to outperform normal resizing.
    *
-   * Mode 0 is SUPERRES_NONE, mode 1 is SUPERRES_FIXED, mode 2 is
-   * SUPERRES_RANDOM and mode 3 is SUPERRES_QTHRESH.
+   * Valid values are 0 to 4 as defined in enum SUPERRES_MODE.
    */
   unsigned int rc_superres_mode;
 
@@ -862,6 +861,11 @@ aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
  */
 aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
 
+/*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */
+#define AOM_USAGE_GOOD_QUALITY (0)
+/*!\brief usage parameter analogous to AV1 REALTIME mode. */
+#define AOM_USAGE_REALTIME (1)
+
 /*!\brief Encode a frame
  *
  * Encodes a video frame at the given "presentation time." The presentation
diff --git a/libaom/aom/aom_frame_buffer.h b/libaom/aom/aom_frame_buffer.h
index fba4322..a715645 100644
--- a/libaom/aom/aom_frame_buffer.h
+++ b/libaom/aom/aom_frame_buffer.h
@@ -53,9 +53,9 @@ typedef struct aom_codec_frame_buffer {
  * data. The callback is triggered when the decoder needs a frame buffer to
  * decode a compressed image into. This function may be called more than once
  * for every call to aom_codec_decode. The application may set fb->priv to
- * some data which will be passed back in the ximage and the release function
- * call. |fb| is guaranteed to not be NULL. On success the callback must
- * return 0. Any failure the callback must return a value less than 0.
+ * some data which will be passed back in the aom_image_t and the release
+ * function call. |fb| is guaranteed to not be NULL. On success the callback
+ * must return 0. Any failure the callback must return a value less than 0.
  *
  * \param[in] priv         Callback's private data
  * \param[in] new_size     Size in bytes needed by the buffer
diff --git a/libaom/aom/aomcx.h b/libaom/aom/aomcx.h
index 9aa77bb..da7498f 100644
--- a/libaom/aom/aomcx.h
+++ b/libaom/aom/aomcx.h
@@ -512,16 +512,25 @@ enum aome_enc_control_id {
    */
   AV1E_SET_RENDER_SIZE,
 
-  /*!\brief Codec control function to set target level.
-   *
-   * 255: off (default); 0: only keep level stats; 10: target for level 1.0;
-   * 11: target for level 1.1; ... 62: target for level 6.2
-   */
-  AV1E_SET_TARGET_LEVEL,
-
-  /*!\brief Codec control function to get bitstream level.
-   */
-  AV1E_GET_LEVEL,
+  /*!\brief Control to set target sequence level index for a certain operating
+   * point(OP).
+   * Possible values are in the form of "ABxy"(pad leading zeros if less than
+   * 4 digits).
+   *   AB: OP index.
+   *   xy: Target level index for the OP. Can be values 0~23(corresponding to
+   *   level 2.0 ~ 7.3) or 31(maximum level parameter, no level-based
+   *   constraints).
+   * E.g. "0" means target level index 0 for the 0th OP;
+   *      "111" means target level index 11 for the 1st OP;
+   *      "1021" means target level index 21 for the 10th OP.
+   * If the target level is not specified for an OP, the maximum level parameter
+   * of 31 is used as default.
+   */
+  AV1E_SET_TARGET_SEQ_LEVEL_IDX,
+
+  /*!\brief Codec control function to get sequence level index.
+   */
+  AV1E_GET_SEQ_LEVEL_IDX,
 
   /*!\brief Codec control function to set intended superblock size.
    *
@@ -561,12 +570,23 @@ enum aome_enc_control_id {
    */
   AV1E_SET_ENABLE_RESTORATION,
 
+  /*!\brief Codec control function to predict with OBMC mode.
+   *
+   *                          0 = do not allow OBMC mode
+   *                          1 = allow OBMC mode
+   *
+   *  By default, the encoder allows OBMC prediction mode.
+   *
+   */
+  AV1E_SET_ENABLE_OBMC,
+
   /*!\brief Codec control function to encode without trellis quantization.
    *
    *                          0 = apply trellis quantization
    *                          1 = do not apply trellis quantization
+   *                          2 = disable trellis quantization partially
    *
-   *  By default, the encoder applies trellis optimization on quantized
+   *  By default, the encoder applies optimization on quantized
    *  coefficients.
    *
    */
@@ -700,13 +720,59 @@ enum aome_enc_control_id {
    */
   AV1E_SET_ANS_WINDOW_SIZE_LOG2,
 
-  /*!\brief Codec control function to turn on / off dual filter
-   * enabling/disabling.
+  /*!\brief Codec control function to enable/disable rectangular partitions.
+   *
+   * This will enable or disable usage of rectangular partitions. The default
+   * value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_RECT_PARTITIONS,
+
+  /*!\brief Codec control function to enable/disable AB partitions.
+   *
+   * This will enable or disable usage of AB partitions. The default
+   * value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_AB_PARTITIONS,
+
+  /*!\brief Codec control function to enable/disable 1:4 and 4:1 partitions.
    *
-   * This will enable or disable dual filter. The default value is 1
+   * This will enable or disable usage of 1:4 and 4:1 partitions. The default
+   * value is 1.
    *
    */
-  AV1E_SET_ENABLE_DF,
+  AV1E_SET_ENABLE_1TO4_PARTITIONS,
+
+  /*!\brief Codec control function to set min partition size.
+   *
+   * This will set min partition size. The default value is 4 for 4x4.
+   * valid values are [4, 8, 16, 32, 64, 128]
+   * min_partition_size is applied to both width and height of the partition.
+   * i.e, both width and height of a partition can not be smaller than
+   * the min_partition_size, except the partition at the picture boundary.
+   *
+   */
+  AV1E_SET_MIN_PARTITION_SIZE,
+
+  /*!\brief Codec control function to set max partition size.
+   *
+   * This will set max partition size. The default value is 128 for 128x128.
+   * valid values are [4, 8, 16, 32, 64, 128]
+   * max_partition_size is applied to both width and height of the partition.
+   * i.e, both width and height of a partition can not be larger than
+   * the max_partition_size.
+   */
+  AV1E_SET_MAX_PARTITION_SIZE,
+
+  /*!\brief Codec control function to turn on / off intra edge filter
+   * at sequence level.
+   *
+   * This will enable or disable usage of intra-edge filtering. The default
+   * value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_INTRA_EDGE_FILTER,
 
   /*!\brief Codec control function to turn on / off frame order hint for a
    * few tools:
@@ -720,14 +786,42 @@ enum aome_enc_control_id {
    */
   AV1E_SET_ENABLE_ORDER_HINT,
 
-  /*!\brief Codec control function to turn on / off joint compound mode
+  /*!\brief Codec control function to turn on / off 64-length transforms.
+   *
+   * This will enable or disable usage of length 64 transforms in any
+   * direction. The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_TX64,
+
+  /*!\brief Codec control function to turn on / off flip and identity
+   * transforms.
+   *
+   * This will enable or disable usage of flip and identity transform
+   * types in any direction. The default value is 1. Including:
+   * FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, ADST_FLIPADST,
+   * FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST,
+   * H_FLIPADST
+   */
+  AV1E_SET_ENABLE_FLIP_IDTX,
+
+  /*!\brief Codec control function to set transform block size search method.
+   *
+   * This will set the transform block size search method.
+   * 0: use Full RD search, 1: use Fast RD search, 2: always use largest
+   * allowed transform block size based on partition size.
+   */
+  AV1E_SET_TX_SIZE_SEARCH_METHOD,
+
+  /*!\brief Codec control function to turn on / off dist-wtd compound mode
    * at sequence level.
    *
-   * This will enable or disable joint compound mode. The default value is 1.
-   * If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced to 0.
+   * This will enable or disable distance-weighted compound mode. The default
+   * value is 1. If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced
+   * to 0.
    *
    */
-  AV1E_SET_ENABLE_JNT_COMP,
+  AV1E_SET_ENABLE_DIST_WTD_COMP,
 
   /*!\brief Codec control function to turn on / off ref frame mvs (mfmv) usage
    * at sequence level.
@@ -747,6 +841,86 @@ enum aome_enc_control_id {
    */
   AV1E_SET_ALLOW_REF_FRAME_MVS,
 
+  /*!\brief Codec control function to turn on / off dual filter usage
+   * for a sequence.
+   *
+   * This will enable or disable use of dual interpolation filter.
+   * The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_DUAL_FILTER,
+
+  /*!\brief Codec control function to turn on / off masked compound usage
+   * for a sequence.
+   *
+   * This will enable or disable usage of wedge and diff-wtd compound
+   * modes. The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_MASKED_COMP,
+
+  /*!\brief Codec control function to turn on / off one sided compound usage
+   * for a sequence.
+   *
+   * This will enable or disable usage of one sided compound
+   * modes. The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_ONESIDED_COMP,
+
+  /*!\brief Codec control function to turn on / off interintra compound
+   * for a sequence.
+   *
+   * This will enable or disable usage of inter-intra compound modes.
+   * The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_INTERINTRA_COMP,
+
+  /*!\brief Codec control function to turn on / off smooth inter-intra
+   * mode for a sequence.
+   *
+   * This will enable or disable usage of smooth inter-intra mode.
+   * The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_SMOOTH_INTERINTRA,
+
+  /*!\brief Codec control function to turn on / off difference weighted
+   * compound.
+   *
+   * This will enable or disable usage of difference weighted compound.
+   * The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_DIFF_WTD_COMP,
+
+  /*!\brief Codec control function to turn on / off interinter wedge
+   * compound.
+   *
+   * This will enable or disable usage of interinter wedge compound.
+   * The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_INTERINTER_WEDGE,
+
+  /*!\brief Codec control function to turn on / off interintra wedge
+   * compound.
+   *
+   * This will enable or disable usage of interintra wedge compound.
+   * The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_INTERINTRA_WEDGE,
+
+  /*!\brief Codec control function to turn on / off global motion usage
+   * for a sequence.
+   *
+   * This will enable or disable usage of global motion. The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_GLOBAL_MOTION,
+
   /*!\brief Codec control function to turn on / off warped motion usage
    * at sequence level.
    *
@@ -764,6 +938,39 @@ enum aome_enc_control_id {
    */
   AV1E_SET_ALLOW_WARPED_MOTION,
 
+  /*!\brief Codec control function to turn on / off filter intra usage at
+   * sequence level.
+   *
+   * This will enable or disable usage of filter intra. The default value is 1.
+   * If AV1E_SET_ENABLE_FILTER_INTRA is 0, then this flag is forced to 0.
+   *
+   */
+  AV1E_SET_ENABLE_FILTER_INTRA,
+
+  /*!\brief Codec control function to turn on / off smooth intra modes usage.
+   *
+   * This will enable or disable usage of smooth, smooth_h and smooth_v intra
+   * modes. The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_SMOOTH_INTRA,
+
+  /*!\brief Codec control function to turn on / off Paeth intra mode usage.
+   *
+   * This will enable or disable usage of Paeth intra mode. The default value
+   * is 1.
+   *
+   */
+  AV1E_SET_ENABLE_PAETH_INTRA,
+
+  /*!\brief Codec control function to turn on / off CFL uv intra mode usage.
+   *
+   * This will enable or disable usage of chroma-from-luma intra mode. The
+   * default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_CFL_INTRA,
+
   /*!\brief Codec control function to turn on / off frame superresolution.
    *
    * This will enable or disable frame superresolution. The default value is 1
@@ -771,6 +978,15 @@ enum aome_enc_control_id {
    */
   AV1E_SET_ENABLE_SUPERRES,
 
+  /*!\brief Codec control function to turn on/off palette mode */
+  AV1E_SET_ENABLE_PALETTE,
+
+  /*!\brief Codec control function to turn on/off intra block copy mode */
+  AV1E_SET_ENABLE_INTRABC,
+
+  /*!\brief Codec control function to turn on/off intra angle delta */
+  AV1E_SET_ENABLE_ANGLE_DELTA,
+
   /*!\brief Codec control function to set the delta q mode
    *
    * AV1 has a segment based feature that allows encoder to adaptively change
@@ -828,6 +1044,54 @@ enum aome_enc_control_id {
 
   /*!\brief Sets the chroma subsampling y value */
   AV1E_SET_CHROMA_SUBSAMPLING_Y,
+
+  /*!\brief Control to use a reduced tx type set */
+  AV1E_SET_REDUCED_TX_TYPE_SET,
+
+  /*!\brief Control to use dct only for intra modes */
+  AV1E_SET_INTRA_DCT_ONLY,
+
+  /*!\brief Control to use dct only for inter modes */
+  AV1E_SET_INTER_DCT_ONLY,
+
+  /*!\brief Control to use default tx type only for intra modes */
+  AV1E_SET_INTRA_DEFAULT_TX_ONLY,
+
+  /*!\brief Control to use adaptive quantize_b */
+  AV1E_SET_QUANT_B_ADAPT,
+
+  /*!\brief Control to select maximum height for the GF group pyramid structure
+   * (valid values: 0 - 4) */
+  AV1E_SET_GF_MAX_PYRAMID_HEIGHT,
+
+  /*!\brief Control to select maximum reference frames allowed per frame
+   * (valid values: 3 - 7) */
+  AV1E_SET_MAX_REFERENCE_FRAMES,
+
+  /*!\brief Control to use reduced set of single and compound references. */
+  AV1E_SET_REDUCED_REFERENCE_SET,
+
+  /*!\brief Control to set frequency of the cost updates for coefficients
+   * Possible values are:
+   * 0: Update at SB level (default)
+   * 1: Update at SB row level in tile
+   * 2: Update at tile level
+   */
+  AV1E_SET_COEFF_COST_UPD_FREQ,
+
+  /*!\brief Control to set frequency of the cost updates for mode
+   * Possible values are:
+   * 0: Update at SB level (default)
+   * 1: Update at SB row level in tile
+   * 2: Update at tile level
+   */
+  AV1E_SET_MODE_COST_UPD_FREQ,
+
+  /*!\brief Control to set bit mask that specifies which tier each of the 32
+   * possible operating points conforms to.
+   * Bit value 0: Main Tier; 1: High Tier.
+   */
+  AV1E_SET_TIER_MASK,
 };
 
 /*!\brief aom 1-D scaling mode
@@ -934,13 +1198,11 @@ AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
 AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
 #define AOM_CTRL_AOME_SET_SCALEMODE
 
-AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, int)
+AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, unsigned int)
 #define AOM_CTRL_AOME_SET_SPATIAL_LAYER_ID
 
 AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
 #define AOM_CTRL_AOME_SET_CPUUSED
-AOM_CTRL_USE_TYPE(AOME_SET_DEVSF, int)
-#define AOM_CTRL_AOME_SET_DEVSF
 AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
 #define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
 
@@ -961,12 +1223,12 @@ AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
 AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
 #define AOM_CTRL_AOME_SET_CQ_LEVEL
 
-AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, unsigned int)
 #define AOM_CTRL_AV1E_SET_ROW_MT
 
-AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, int)
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, unsigned int)
 #define AOM_CTRL_AV1E_SET_TILE_COLUMNS
-AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, int)
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, unsigned int)
 #define AOM_CTRL_AV1E_SET_TILE_ROWS
 
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TPL_MODEL, unsigned int)
@@ -997,6 +1259,9 @@ AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CDEF, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RESTORATION, unsigned int)
 #define AOM_CTRL_AV1E_SET_ENABLE_RESTORATION
 
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_OBMC, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_OBMC
+
 AOM_CTRL_USE_TYPE(AV1E_SET_DISABLE_TRELLIS_QUANT, unsigned int)
 #define AOM_CTRL_AV1E_SET_DISABLE_TRELLIS_QUANT
 
@@ -1029,37 +1294,109 @@ AOM_CTRL_USE_TYPE(AV1E_SET_MTU, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_SET_TIMING_INFO_TYPE, int) /* aom_timing_info_type_t */
 #define AOM_CTRL_AV1E_SET_TIMING_INFO_TYPE
 
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DF, unsigned int)
-#define AOM_CTRL_AV1E_SET_ENABLE_DF
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RECT_PARTITIONS, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_RECT_PARTITIONS
 
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ORDER_HINT, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_AB_PARTITIONS, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_AB_PARTITIONS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_1TO4_PARTITIONS, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_1TO4_PARTITIONS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MIN_PARTITION_SIZE, int)
+#define AOM_CTRL_AV1E_SET_MIN_PARTITION_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_PARTITION_SIZE, int)
+#define AOM_CTRL_AV1E_SET_MAX_PARTITION_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTRA_EDGE_FILTER
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ORDER_HINT, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_ORDER_HINT
 
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_JNT_COMP, unsigned int)
-#define AOM_CTRL_AV1E_SET_ENABLE_JNT_COMP
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TX64, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_TX64
 
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_REF_FRAME_MVS, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_TX_SIZE_SEARCH_METHOD, int)
+#define AOM_CTRL_AV1E_SET_TXSIZE_SEARCH_METHOD
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FLIP_IDTX, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_FLIP_IDTX
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_WTD_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIST_WTD_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_REF_FRAME_MVS, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_REF_FRAME_MVS
 
-AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_REF_FRAME_MVS, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_REF_FRAME_MVS, int)
 #define AOM_CTRL_AV1E_SET_ALLOW_REF_FRAME_MVS
 
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_WARPED_MOTION, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DUAL_FILTER, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DUAL_FILTER
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_MASKED_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_MASKED_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ONESIDED_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_ONESIDED_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTERINTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIFF_WTD_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIFF_WTD_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTER_WEDGE, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTER_WEDGE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_WEDGE, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_WEDGE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_GLOBAL_MOTION, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_GLOBAL_MOTION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_WARPED_MOTION, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_WARPED_MOTION
 
-AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_WARPED_MOTION, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_WARPED_MOTION, int)
 #define AOM_CTRL_AV1E_SET_ALLOW_WARPED_MOTION
 
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FILTER_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_FILTER_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PAETH_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_PAETH_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CFL_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_CFL_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_SUPERRES
 
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PALETTE, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_PALETTE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRABC, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTRABC
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ANGLE_DELTA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_ANGLE_DELTA
+
 AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
 #define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING
 
-AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, int)
 #define AOM_CTRL_AV1E_SET_ERROR_RESILIENT_MODE
 
-AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, int)
 #define AOM_CTRL_AV1E_SET_S_FRAME_MODE
 
 AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
@@ -1107,14 +1444,8 @@ AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
 AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
 #define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE
 
-AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_LEVEL, unsigned int)
-#define AOM_CTRL_AV1E_SET_TARGET_LEVEL
-
-AOM_CTRL_USE_TYPE(AV1E_GET_LEVEL, int *)
-#define AOM_CTRL_AV1E_GET_LEVEL
-
-AOM_CTRL_USE_TYPE(AV1E_SET_ANS_WINDOW_SIZE_LOG2, unsigned int)
-#define AOM_CTRL_AV1E_SET_ANS_WINDOW_SIZE_LOG2
+AOM_CTRL_USE_TYPE(AV1E_GET_SEQ_LEVEL_IDX, int *)
+#define AOM_CTRL_AV1E_GET_SEQ_LEVEL_IDX
 
 AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int)
 #define AOM_CTRL_AV1E_SET_SINGLE_TILE_DECODING
@@ -1122,13 +1453,13 @@ AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int)
 #define AOM_CTRL_AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST
 
-AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, int)
 #define AOM_CTRL_AV1E_SET_FILM_GRAIN_TEST_VECTOR
 
 AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *)
 #define AOM_CTRL_AV1E_SET_FILM_GRAIN_TABLE
 
-AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, int)
+AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, unsigned int)
 #define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE
 
 #ifdef CONFIG_DENOISE
@@ -1145,6 +1476,42 @@ AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_X, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_Y, unsigned int)
 #define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_Y
 
+AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_TX_TYPE_SET, int)
+#define AOM_CTRL_AV1E_SET_REDUCED_TX_TYPE_SET
+
+AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DCT_ONLY, int)
+#define AOM_CTRL_AV1E_SET_INTRA_DCT_ONLY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_INTER_DCT_ONLY, int)
+#define AOM_CTRL_AV1E_SET_INTER_DCT_ONLY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DEFAULT_TX_ONLY, int)
+#define AOM_CTRL_AV1E_SET_INTRA_DEFAULT_TX_ONLY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QUANT_B_ADAPT, int)
+#define AOM_CTRL_AV1E_SET_QUANT_B_ADAPT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, unsigned int)
+#define AOM_CTRL_AV1E_SET_GF_MAX_PYRAMID_HEIGHT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_REFERENCE_FRAMES, int)
+#define AOM_CTRL_AV1E_SET_MAX_REFERENCE_FRAMES
+
+AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_REFERENCE_SET, int)
+#define AOM_CTRL_AV1E_SET_REDUCED_REFERENCE_SET
+
+AOM_CTRL_USE_TYPE(AV1E_SET_COEFF_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_COEFF_COST_UPD_FREQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MODE_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_MODE_COST_UPD_FREQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_SEQ_LEVEL_IDX, int)
+#define AOM_CTRL_AV1E_SET_TARGET_SEQ_LEVEL_IDX
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TIER_MASK, unsigned int)
+#define AOM_CTRL_AV1E_SET_TIER_MASK
+
 /*!\endcond */
 /*! @} - end defgroup aom_encoder */
 #ifdef __cplusplus
diff --git a/libaom/aom_dsp/add_noise.c b/libaom/aom_dsp/add_noise.c
index bfb3e7e..43587ca 100644
--- a/libaom/aom_dsp/add_noise.c
+++ b/libaom/aom_dsp/add_noise.c
@@ -40,7 +40,7 @@ void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
 }
 
 static double gaussian(double sigma, double mu, double x) {
-  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+  return 1 / (sigma * sqrt(2.0 * PI)) *
          (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
 }
 
diff --git a/libaom/aom_dsp/aom_dsp.cmake b/libaom/aom_dsp/aom_dsp.cmake
index a8490c4..abf6a60 100644
--- a/libaom/aom_dsp/aom_dsp.cmake
+++ b/libaom/aom_dsp/aom_dsp.cmake
@@ -194,6 +194,7 @@ if(CONFIG_AV1_ENCODER)
               "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h"
               "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
@@ -226,6 +227,7 @@ if(CONFIG_AV1_ENCODER)
               "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
               "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
@@ -361,6 +363,8 @@ function(setup_aom_dsp_targets)
     endif()
   endif()
 
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
+
   # Pass the new lib targets up to the parent scope instance of
   # $AOM_LIB_TARGETS.
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
diff --git a/libaom/aom_dsp/aom_dsp_rtcd_defs.pl b/libaom/aom_dsp/aom_dsp_rtcd_defs.pl
index 59d0620..f56a117 100755
--- a/libaom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/libaom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -466,10 +466,6 @@ specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
 add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
 specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
 
-# Helper functions.
-add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
-specialize "av1_round_shift_array", qw/sse4_1 neon/;
-
 #
 # Encoder functions.
 #
@@ -522,10 +518,17 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
 
+  add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/aom_quantize_b_adaptive sse2/;
+
   add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
 
+  add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/aom_quantize_b_32x32_adaptive sse2/;
+
   add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/aom_quantize_b_64x64 ssse3/;
 }  # CONFIG_AV1_ENCODER
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
@@ -536,7 +539,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_highbd_quantize_b_32x32 sse2/;
 
   add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-
+  specialize qw/aom_highbd_quantize_b_64x64 sse2/;
 }  # CONFIG_AV1_ENCODER
 
 #
@@ -596,7 +599,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
     add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-    add_proto qw/unsigned int/, "aom_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
+    add_proto qw/unsigned int/, "aom_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
   }
 
   specialize qw/aom_sad128x128    avx2          sse2/;
@@ -647,29 +650,29 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sad16x64_avg sse2/;
   specialize qw/aom_sad64x16_avg sse2/;
 
-  specialize qw/aom_jnt_sad128x128_avg ssse3/;
-  specialize qw/aom_jnt_sad128x64_avg  ssse3/;
-  specialize qw/aom_jnt_sad64x128_avg  ssse3/;
-  specialize qw/aom_jnt_sad64x64_avg   ssse3/;
-  specialize qw/aom_jnt_sad64x32_avg   ssse3/;
-  specialize qw/aom_jnt_sad32x64_avg   ssse3/;
-  specialize qw/aom_jnt_sad32x32_avg   ssse3/;
-  specialize qw/aom_jnt_sad32x16_avg   ssse3/;
-  specialize qw/aom_jnt_sad16x32_avg   ssse3/;
-  specialize qw/aom_jnt_sad16x16_avg   ssse3/;
-  specialize qw/aom_jnt_sad16x8_avg    ssse3/;
-  specialize qw/aom_jnt_sad8x16_avg    ssse3/;
-  specialize qw/aom_jnt_sad8x8_avg     ssse3/;
-  specialize qw/aom_jnt_sad8x4_avg     ssse3/;
-  specialize qw/aom_jnt_sad4x8_avg     ssse3/;
-  specialize qw/aom_jnt_sad4x4_avg     ssse3/;
-
-  specialize qw/aom_jnt_sad4x16_avg     ssse3/;
-  specialize qw/aom_jnt_sad16x4_avg     ssse3/;
-  specialize qw/aom_jnt_sad8x32_avg     ssse3/;
-  specialize qw/aom_jnt_sad32x8_avg     ssse3/;
-  specialize qw/aom_jnt_sad16x64_avg     ssse3/;
-  specialize qw/aom_jnt_sad64x16_avg     ssse3/;
+  specialize qw/aom_dist_wtd_sad128x128_avg ssse3/;
+  specialize qw/aom_dist_wtd_sad128x64_avg  ssse3/;
+  specialize qw/aom_dist_wtd_sad64x128_avg  ssse3/;
+  specialize qw/aom_dist_wtd_sad64x64_avg   ssse3/;
+  specialize qw/aom_dist_wtd_sad64x32_avg   ssse3/;
+  specialize qw/aom_dist_wtd_sad32x64_avg   ssse3/;
+  specialize qw/aom_dist_wtd_sad32x32_avg   ssse3/;
+  specialize qw/aom_dist_wtd_sad32x16_avg   ssse3/;
+  specialize qw/aom_dist_wtd_sad16x32_avg   ssse3/;
+  specialize qw/aom_dist_wtd_sad16x16_avg   ssse3/;
+  specialize qw/aom_dist_wtd_sad16x8_avg    ssse3/;
+  specialize qw/aom_dist_wtd_sad8x16_avg    ssse3/;
+  specialize qw/aom_dist_wtd_sad8x8_avg     ssse3/;
+  specialize qw/aom_dist_wtd_sad8x4_avg     ssse3/;
+  specialize qw/aom_dist_wtd_sad4x8_avg     ssse3/;
+  specialize qw/aom_dist_wtd_sad4x4_avg     ssse3/;
+
+  specialize qw/aom_dist_wtd_sad4x16_avg     ssse3/;
+  specialize qw/aom_dist_wtd_sad16x4_avg     ssse3/;
+  specialize qw/aom_dist_wtd_sad8x32_avg     ssse3/;
+  specialize qw/aom_dist_wtd_sad32x8_avg     ssse3/;
+  specialize qw/aom_dist_wtd_sad16x64_avg     ssse3/;
+  specialize qw/aom_dist_wtd_sad64x16_avg     ssse3/;
 
   add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
   add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
@@ -694,7 +697,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
         specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
         specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/;
       }
-      add_proto qw/unsigned int/, "aom_highbd_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
+      add_proto qw/unsigned int/, "aom_highbd_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
     }
     specialize qw/aom_highbd_sad128x128 avx2/;
     specialize qw/aom_highbd_sad128x64  avx2/;
@@ -839,6 +842,30 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_highbd_sad64x16x4d sse2/;
 
   #
+  # Avg
+  #
+  add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/aom_avg_8x8 sse2/;
+
+  add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/aom_avg_4x4 sse2/;
+
+  add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/aom_minmax_8x8 sse2/;
+
+  add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
+  # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
+  #specialize qw/aom_int_pro_row sse2/;
+
+  add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
+  # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
+  #specialize qw/aom_int_pro_col sse2/;
+
+  add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
+  # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
+  #specialize qw/aom_vector_var sse2/;
+
+  #
   # hamadard transform and satd for implmenting temporal dependency model
   #
   add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
@@ -919,11 +946,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
                                                    int ref_stride, int subpel_search";
   specialize qw/aom_comp_avg_upsampled_pred sse2/;
 
-  add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+  add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                        const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
                                                        int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                       int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
-  specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
+                                                       int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
+  specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/;
 
   add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                        const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
@@ -942,11 +969,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
                                                           int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
   specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
 
-  add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+  add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                               const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
                                                               int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-                                                              int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
-  specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/;
+                                                              int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
+  specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/;
 
 
   #
@@ -972,7 +999,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
     add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
+    add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
   }
   specialize qw/aom_variance128x128   sse2 avx2         /;
   specialize qw/aom_variance128x64    sse2 avx2         /;
@@ -1044,30 +1071,30 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
   specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
 
-  specialize qw/aom_jnt_sub_pixel_avg_variance64x64 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance64x32 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance32x64 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance32x32 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance32x16 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance16x32 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance16x16 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance16x8  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance8x16  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance8x8   ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance8x4   ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance4x8   ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance4x4   ssse3/;
-
-  specialize qw/aom_jnt_sub_pixel_avg_variance4x16  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance16x4  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance8x32  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance32x8  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance16x64 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance64x16 ssse3/;
-
-  specialize qw/aom_jnt_sub_pixel_avg_variance128x128  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance128x64   ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance64x128   ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8  ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16  ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8   ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4   ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8   ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4   ssse3/;
+
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16  ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4  ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32  ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8  ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/;
+
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128  ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64   ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128   ssse3/;
 
 
   foreach $bd (8, 10, 12) {
@@ -1099,7 +1126,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
         specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
       }
 
-      add_proto qw/uint32_t/, "aom_highbd_${bd}_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
+      add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
     }
   }
 
@@ -1188,8 +1215,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   #
   add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
 
-  add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
-  specialize qw/aom_jnt_comp_avg_pred ssse3/;
+  add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
+  specialize qw/aom_dist_wtd_comp_avg_pred ssse3/;
 
     add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_12_variance128x128 sse2/;
@@ -1355,12 +1382,21 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
     add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
 
-    add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
-    specialize qw/aom_highbd_jnt_comp_avg_pred sse2/;
+    add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
+    specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/;
 
     #
     # Subpixel Variance
     #
+    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+    specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+    specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+    specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/;
+
     add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
     specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
 
@@ -1397,6 +1433,15 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
     add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
+    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+    specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+    specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+    specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2/;
+
     add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
     specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
 
@@ -1433,6 +1478,15 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
     add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+    specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+    specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+    specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/;
+
     add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
     specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
 
diff --git a/libaom/aom_dsp/avg.c b/libaom/aom_dsp/avg.c
index 4d78c9c..43d2760 100644
--- a/libaom/aom_dsp/avg.c
+++ b/libaom/aom_dsp/avg.c
@@ -14,6 +14,40 @@
 #include "config/aom_dsp_rtcd.h"
 #include "aom_ports/mem.h"
 
+void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+                      int *min, int *max) {
+  int i, j;
+  *min = 255;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j] - d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
+
+unsigned int aom_avg_4x4_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 4; ++i, s += p)
+    for (j = 0; j < 4; sum += s[j], ++j) {
+    }
+
+  return (sum + 8) >> 4;
+}
+
+unsigned int aom_avg_8x8_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 8; ++i, s += p)
+    for (j = 0; j < 8; sum += s[j], ++j) {
+    }
+
+  return (sum + 32) >> 6;
+}
+
 // src_diff: first pass, 9 bit, dynamic range [-255, 255]
 //           second pass, 12 bit, dynamic range [-2040, 2040]
 static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
@@ -146,3 +180,48 @@ int aom_satd_c(const tran_low_t *coeff, int length) {
   // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
   return satd;
 }
+
+// Integer projection onto row vectors.
+// height: value range {16, 32, 64, 128}.
+void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
+                       const int ref_stride, const int height) {
+  int idx;
+  const int norm_factor = height >> 1;
+  for (idx = 0; idx < 16; ++idx) {
+    int i;
+    hbuf[idx] = 0;
+    // hbuf[idx]: 14 bit, dynamic range [0, 32640].
+    for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
+    // hbuf[idx]: 9 bit, dynamic range [0, 1020].
+    hbuf[idx] /= norm_factor;
+    ++ref;
+  }
+}
+
+// width: value range {16, 32, 64, 128}.
+int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) {
+  int idx;
+  int16_t sum = 0;
+  // sum: 14 bit, dynamic range [0, 32640]
+  for (idx = 0; idx < width; ++idx) sum += ref[idx];
+  return sum;
+}
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
+  int i;
+  int width = 4 << bwl;
+  int sse = 0, mean = 0, var;
+
+  for (i = 0; i < width; ++i) {
+    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
+    mean += diff;                // mean: dynamic range 16 bits.
+    sse += diff * diff;          // sse:  dynamic range 26 bits.
+  }
+
+  // (mean * mean): dynamic range 31 bits.
+  var = sse - ((mean * mean) >> (bwl + 2));
+  return var;
+}
diff --git a/libaom/aom_dsp/bitreader_buffer.c b/libaom/aom_dsp/bitreader_buffer.c
index 984b217..d79feea 100644
--- a/libaom/aom_dsp/bitreader_buffer.c
+++ b/libaom/aom_dsp/bitreader_buffer.c
@@ -60,9 +60,9 @@ int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
 
 uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
   int leading_zeros = 0;
-  while (!aom_rb_read_bit(rb)) ++leading_zeros;
+  while (leading_zeros < 32 && !aom_rb_read_bit(rb)) ++leading_zeros;
   // Maximum 32 bits.
-  if (leading_zeros >= 32) return UINT32_MAX;
+  if (leading_zeros == 32) return UINT32_MAX;
   const uint32_t base = (1u << leading_zeros) - 1;
   const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
   return base + value;
diff --git a/libaom/aom_dsp/grain_synthesis.c b/libaom/aom_dsp/grain_synthesis.c
index b96e1c3..4b94dbc 100644
--- a/libaom/aom_dsp/grain_synthesis.c
+++ b/libaom/aom_dsp/grain_synthesis.c
@@ -232,7 +232,6 @@ static int scaling_lut_y[256];
 static int scaling_lut_cb[256];
 static int scaling_lut_cr[256];
 
-static int grain_center;
 static int grain_min;
 static int grain_max;
 
@@ -1077,7 +1076,7 @@ int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
   int overlap = params->overlap_flag;
   int bit_depth = params->bit_depth;
 
-  grain_center = 128 << (bit_depth - 8);
+  const int grain_center = 128 << (bit_depth - 8);
   grain_min = 0 - grain_center;
   grain_max = (256 << (bit_depth - 8)) - 1 - grain_center;
 
diff --git a/libaom/aom_dsp/grain_synthesis.h b/libaom/aom_dsp/grain_synthesis.h
index 7aee6f6..9155b39 100644
--- a/libaom/aom_dsp/grain_synthesis.h
+++ b/libaom/aom_dsp/grain_synthesis.h
@@ -20,6 +20,8 @@
 extern "C" {
 #endif
 
+#include <string.h>
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom/aom_image.h"
 
@@ -28,6 +30,9 @@ extern "C" {
  * This structure contains input parameters for film grain synthesis
  */
 typedef struct {
+  // This structure is compared element-by-element in the function
+  // av1_check_grain_params_equiv: this function must be updated if any changes
+  // are made to this structure.
   int apply_grain;
 
   int update_parameters;
@@ -79,8 +84,73 @@ typedef struct {
   int grain_scale_shift;
 
   uint16_t random_seed;
+  // This structure is compared element-by-element in the function
+  // av1_check_grain_params_equiv: this function must be updated if any changes
+  // are made to this structure.
 } aom_film_grain_t;
 
+/*!\brief Check if two film grain parameters structs are equivalent
+ *
+ * Check if two film grain parameters are equal, except for the
+ * update_parameters and random_seed elements which are ignored.
+ *
+ * \param[in]    pa               The first set of parameters to compare
+ * \param[in]    pb               The second set of parameters to compare
+ * \return       Returns 1 if the params are equivalent, 0 otherwise
+ */
+static INLINE int av1_check_grain_params_equiv(
+    const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) {
+  if (pa->apply_grain != pb->apply_grain) return 0;
+  // Don't compare update_parameters
+
+  if (pa->num_y_points != pb->num_y_points) return 0;
+  if (memcmp(pa->scaling_points_y, pb->scaling_points_y,
+             pa->num_y_points * 2 * sizeof(*pa->scaling_points_y)) != 0)
+    return 0;
+
+  if (pa->num_cb_points != pb->num_cb_points) return 0;
+  if (memcmp(pa->scaling_points_cb, pb->scaling_points_cb,
+             pa->num_cb_points * 2 * sizeof(*pa->scaling_points_cb)) != 0)
+    return 0;
+
+  if (pa->num_cr_points != pb->num_cr_points) return 0;
+  if (memcmp(pa->scaling_points_cr, pb->scaling_points_cr,
+             pa->num_cr_points * 2 * sizeof(*pa->scaling_points_cr)) != 0)
+    return 0;
+
+  if (pa->scaling_shift != pb->scaling_shift) return 0;
+  if (pa->ar_coeff_lag != pb->ar_coeff_lag) return 0;
+
+  const int num_pos = 2 * pa->ar_coeff_lag * (pa->ar_coeff_lag + 1);
+  if (memcmp(pa->ar_coeffs_y, pb->ar_coeffs_y,
+             num_pos * sizeof(*pa->ar_coeffs_y)) != 0)
+    return 0;
+  if (memcmp(pa->ar_coeffs_cb, pb->ar_coeffs_cb,
+             num_pos * sizeof(*pa->ar_coeffs_cb)) != 0)
+    return 0;
+  if (memcmp(pa->ar_coeffs_cr, pb->ar_coeffs_cr,
+             num_pos * sizeof(*pa->ar_coeffs_cr)) != 0)
+    return 0;
+
+  if (pa->ar_coeff_shift != pb->ar_coeff_shift) return 0;
+
+  if (pa->cb_mult != pb->cb_mult) return 0;
+  if (pa->cb_luma_mult != pb->cb_luma_mult) return 0;
+  if (pa->cb_offset != pb->cb_offset) return 0;
+
+  if (pa->cr_mult != pb->cr_mult) return 0;
+  if (pa->cr_luma_mult != pb->cr_luma_mult) return 0;
+  if (pa->cr_offset != pb->cr_offset) return 0;
+
+  if (pa->overlap_flag != pb->overlap_flag) return 0;
+  if (pa->clip_to_restricted_range != pb->clip_to_restricted_range) return 0;
+  if (pa->bit_depth != pb->bit_depth) return 0;
+  if (pa->chroma_scaling_from_luma != pb->chroma_scaling_from_luma) return 0;
+  if (pa->grain_scale_shift != pb->grain_scale_shift) return 0;
+
+  return 1;
+}
+
 /*!\brief Add film grain
  *
  * Add film grain to an image
diff --git a/libaom/aom_dsp/noise_model.h b/libaom/aom_dsp/noise_model.h
index 049d5be..5e7de9b 100644
--- a/libaom/aom_dsp/noise_model.h
+++ b/libaom/aom_dsp/noise_model.h
@@ -158,10 +158,10 @@ int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
                               int stride, uint8_t *flat_blocks);
 
 // The noise shape indicates the allowed coefficients in the AR model.
-typedef enum {
+enum {
   AOM_NOISE_SHAPE_DIAMOND = 0,
   AOM_NOISE_SHAPE_SQUARE = 1
-} aom_noise_shape;
+} UENUM1BYTE(aom_noise_shape);
 
 // The parameters of the noise model include the shape type, lag, the
 // bit depth of the input images provided, and whether the input images
@@ -202,13 +202,13 @@ typedef struct {
 } aom_noise_model_t;
 
 /*!\brief Result of a noise model update. */
-typedef enum {
+enum {
   AOM_NOISE_STATUS_OK = 0,
   AOM_NOISE_STATUS_INVALID_ARGUMENT,
   AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
   AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE,
   AOM_NOISE_STATUS_INTERNAL_ERROR,
-} aom_noise_status_t;
+} UENUM1BYTE(aom_noise_status_t);
 
 /*!\brief Initializes a noise model with the given parameters.
  *
diff --git a/libaom/aom_dsp/prob.h b/libaom/aom_dsp/prob.h
index d003a98..20ffdea 100644
--- a/libaom/aom_dsp/prob.h
+++ b/libaom/aom_dsp/prob.h
@@ -641,7 +641,7 @@ static INLINE uint8_t get_prob(unsigned int num, unsigned int den) {
   }
 }
 
-static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
+static INLINE void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
   int rate;
   int i, tmp;
 
diff --git a/libaom/aom_dsp/quantize.c b/libaom/aom_dsp/quantize.c
index 62dbd86..ced34b4 100644
--- a/libaom/aom_dsp/quantize.c
+++ b/libaom/aom_dsp/quantize.c
@@ -11,6 +11,98 @@
 
 #include "aom_dsp/quantize.h"
 #include "aom_mem/aom_mem.h"
+#include "av1/encoder/av1_quantize.h"
+
+void quantize_b_adaptive_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  int prescan_add[2];
+  for (i = 0; i < 2; ++i)
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+  // Pre-scan pass
+  for (i = (int)n_coeffs - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    const int coeff = coeff_ptr[rc] * wt;
+    const int prescan_add_val = prescan_add[rc != 0];
+    if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+        coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val))
+      non_zero_count--;
+    else
+      break;
+  }
+
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif  // SKIP_EOB_FACTOR_ADJUST
+  for (i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    int tmp32;
+
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+      int64_t tmp =
+          clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+                INT16_MIN, INT16_MAX);
+      tmp *= wt;
+      tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                     quant_shift_ptr[rc != 0]) >>
+                    (16 - log_scale + AOM_QM_BITS));  // quantization
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+      const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+      dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+
+      if (tmp32) {
+        eob = i;
+#if SKIP_EOB_FACTOR_ADJUST
+        if (first == -1) first = i;
+#endif  // SKIP_EOB_FACTOR_ADJUST
+      }
+    }
+  }
+#if SKIP_EOB_FACTOR_ADJUST
+  if (eob >= 0 && first == eob) {
+    const int rc = scan[eob];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+      const int coeff = coeff_ptr[rc] * wt;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+          coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        eob = -1;
+      }
+    }
+  }
+#endif  // SKIP_EOB_FACTOR_ADJUST
+  *eob_ptr = eob + 1;
+}
 
 void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -74,6 +166,94 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   *eob_ptr = eob + 1;
 }
 
+void highbd_quantize_b_adaptive_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale) {
+  int i, eob = -1;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  int dequant;
+  int idx_arr[4096];
+  (void)iscan;
+  int idx = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  int prescan_add[2];
+  for (i = 0; i < 2; ++i)
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+  // Pre-scan pass
+  for (i = 0; i < n_coeffs; i++) {
+    const int rc = scan[i];
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    const int coeff = coeff_ptr[rc] * wt;
+
+    // If the coefficient is out of the base ZBIN range, keep it for
+    // quantization.
+    const int prescan_add_val = prescan_add[rc != 0];
+    if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+        coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val))
+      idx_arr[idx++] = i;
+  }
+
+  // Quantization pass: only process the coefficients selected in
+  // pre-scan pass. Note: idx can be zero.
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif  // SKIP_EOB_FACTOR_ADJUST
+  for (i = 0; i < idx; i++) {
+    const int rc = scan[idx_arr[i]];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp1 =
+        abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+    const int64_t tmpw = tmp1 * wt;
+    const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+    const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+                                 (16 - log_scale + AOM_QM_BITS));
+    qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dequant =
+        (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+    dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+    if (abs_qcoeff) {
+      eob = idx_arr[i];
+#if SKIP_EOB_FACTOR_ADJUST
+      if (first == -1) first = eob;
+#endif  // SKIP_EOB_FACTOR_ADJUST
+    }
+  }
+#if SKIP_EOB_FACTOR_ADJUST
+  if (eob >= 0 && first == eob) {
+    const int rc = scan[eob];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+      const int coeff = coeff_ptr[rc] * wt;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+          coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        eob = -1;
+      }
+    }
+  }
+#endif  // SKIP_EOB_FACTOR_ADJUST
+  *eob_ptr = eob + 1;
+}
+
 void highbd_quantize_b_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -133,6 +313,80 @@ void highbd_quantize_b_helper_c(
 
 /* These functions should only be called when quantisation matrices
    are not used. */
+void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                               NULL, NULL, 0);
+}
+
+void aom_quantize_b_32x32_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                               NULL, NULL, 1);
+}
+
+void aom_quantize_b_64x64_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                               NULL, NULL, 2);
+}
+
+void aom_highbd_quantize_b_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                      iscan, NULL, NULL, 0);
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                      iscan, NULL, NULL, 1);
+}
+
+void aom_highbd_quantize_b_64x64_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                      iscan, NULL, NULL, 2);
+}
+
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
diff --git a/libaom/aom_dsp/quantize.h b/libaom/aom_dsp/quantize.h
index c55ab23..43c30ee 100644
--- a/libaom/aom_dsp/quantize.h
+++ b/libaom/aom_dsp/quantize.h
@@ -20,6 +20,66 @@
 extern "C" {
 #endif
 
+void quantize_b_adaptive_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan);
+
+void aom_quantize_b_32x32_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan);
+
+void aom_quantize_b_64x64_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan);
+
+void highbd_quantize_b_adaptive_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_highbd_quantize_b_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan);
+
+void aom_highbd_quantize_b_32x32_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan);
+
+void aom_highbd_quantize_b_64x64_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan);
+
 void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *zbin_ptr, const int16_t *round_ptr,
                          const int16_t *quant_ptr,
diff --git a/libaom/aom_dsp/sad.c b/libaom/aom_dsp/sad.c
index 252e0e1..9169e78 100644
--- a/libaom/aom_dsp/sad.c
+++ b/libaom/aom_dsp/sad.c
@@ -54,12 +54,12 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
     aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride);         \
     return sad(src, src_stride, comp_pred, m, m, n);                          \
   }                                                                           \
-  unsigned int aom_jnt_sad##m##x##n##_avg_c(                                  \
+  unsigned int aom_dist_wtd_sad##m##x##n##_avg_c(                             \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
     uint8_t comp_pred[m * n];                                                 \
-    aom_jnt_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride,    \
-                            jcp_param);                                       \
+    aom_dist_wtd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref,           \
+                                 ref_stride, jcp_param);                      \
     return sad(src, src_stride, comp_pred, m, m, n);                          \
   }
 
@@ -208,12 +208,13 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
                              ref, ref_stride);                                 \
     return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
   }                                                                            \
-  unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c(                            \
+  unsigned int aom_highbd_dist_wtd_sad##m##x##n##_avg_c(                       \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {          \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {     \
     uint16_t comp_pred[m * n];                                                 \
-    aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred,   \
-                                 m, n, ref, ref_stride, jcp_param);            \
+    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred),           \
+                                      second_pred, m, n, ref, ref_stride,      \
+                                      jcp_param);                              \
     return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
   }
 
diff --git a/libaom/aom_dsp/variance.c b/libaom/aom_dsp/variance.c
index 0f4990e..18a33c5 100644
--- a/libaom/aom_dsp/variance.c
+++ b/libaom/aom_dsp/variance.c
@@ -164,40 +164,40 @@ void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
     return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);             \
   }
 
-#define SUBPIX_AVG_VAR(W, H)                                                  \
-  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                           \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
-      const uint8_t *b, int b_stride, uint32_t *sse,                          \
-      const uint8_t *second_pred) {                                           \
-    uint16_t fdata3[(H + 1) * W];                                             \
-    uint8_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
-                                                                              \
-    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
-                                            bilinear_filters_2t[xoffset]);    \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
-                                             bilinear_filters_2t[yoffset]);   \
-                                                                              \
-    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                    \
-                                                                              \
-    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);             \
-  }                                                                           \
-  uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_c(                       \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
-      const uint8_t *b, int b_stride, uint32_t *sse,                          \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
-    uint16_t fdata3[(H + 1) * W];                                             \
-    uint8_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
-                                                                              \
-    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
-                                            bilinear_filters_2t[xoffset]);    \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
-                                             bilinear_filters_2t[yoffset]);   \
-                                                                              \
-    aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param);     \
-                                                                              \
-    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                 \
+#define SUBPIX_AVG_VAR(W, H)                                                   \
+  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                            \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
+      const uint8_t *b, int b_stride, uint32_t *sse,                           \
+      const uint8_t *second_pred) {                                            \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint8_t temp2[H * W];                                                      \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
+                                                                               \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,  \
+                                            bilinear_filters_2t[xoffset]);     \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
+                                             bilinear_filters_2t[yoffset]);    \
+                                                                               \
+    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                     \
+                                                                               \
+    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);              \
+  }                                                                            \
+  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                   \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
+      const uint8_t *b, int b_stride, uint32_t *sse,                           \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {     \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint8_t temp2[H * W];                                                      \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
+                                                                               \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,  \
+                                            bilinear_filters_2t[xoffset]);     \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
+                                             bilinear_filters_2t[yoffset]);    \
+                                                                               \
+    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
+                                                                               \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                  \
   }
 
 /* Identical to the variance call except it takes an additional parameter, sum,
@@ -291,7 +291,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
     const int ref_num = 0;
     const int is_intrabc = is_intrabc_block(mi);
     const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
     const int is_scaled = av1_is_scaled(sf);
 
     if (is_scaled) {
@@ -424,9 +424,10 @@ void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
   }
 }
 
-void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
-                             int height, const uint8_t *ref, int ref_stride,
-                             const JNT_COMP_PARAMS *jcp_param) {
+void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+                                  int width, int height, const uint8_t *ref,
+                                  int ref_stride,
+                                  const DIST_WTD_COMP_PARAMS *jcp_param) {
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
@@ -443,11 +444,11 @@ void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
   }
 }
 
-void aom_jnt_comp_avg_upsampled_pred_c(
+void aom_dist_wtd_comp_avg_upsampled_pred_c(
     MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
+    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
@@ -688,125 +689,128 @@ void aom_highbd_var_filter_block2d_bil_second_pass(
                                                dst, dst_stride, sse);        \
   }
 
-#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                            \
-  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                   \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred) {                                            \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,   \
-                               CONVERT_TO_BYTEPTR(temp2), W);                  \
-                                                                               \
-    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,    \
-                                              dst, dst_stride, sse);           \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                  \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred) {                                            \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,   \
-                               CONVERT_TO_BYTEPTR(temp2), W);                  \
-                                                                               \
-    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
-                                               dst, dst_stride, sse);          \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                  \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred) {                                            \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,   \
-                               CONVERT_TO_BYTEPTR(temp2), W);                  \
-                                                                               \
-    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
-                                               dst, dst_stride, sse);          \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c(               \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {          \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
-                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);     \
-                                                                               \
-    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,   \
-                                          dst_stride, sse);                    \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c(              \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {          \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
-                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);     \
-                                                                               \
-    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
-                                           dst_stride, sse);                   \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c(              \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {          \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
-                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);     \
-                                                                               \
-    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
-                                           dst_stride, sse);                   \
+#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                           \
+  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                  \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
+                                              dst, dst_stride, sse);          \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                 \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
+                                               dst, dst_stride, sse);         \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                 \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
+                                               dst, dst_stride, sse);         \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(         \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
+                                      jcp_param);                             \
+                                                                              \
+    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
+                                          dst_stride, sse);                   \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
+                                      jcp_param);                             \
+                                                                              \
+    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                           dst_stride, sse);                  \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
+                                      jcp_param);                             \
+                                                                              \
+    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                           dst_stride, sse);                  \
   }
 
 /* All three forms of the variance are available in the same sizes. */
@@ -880,7 +884,7 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
     const int ref_num = 0;
     const int is_intrabc = is_intrabc_block(mi);
     const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
     const int is_scaled = av1_is_scaled(sf);
 
     if (is_scaled) {
@@ -1018,10 +1022,10 @@ void aom_highbd_comp_avg_upsampled_pred_c(
   }
 }
 
-void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
-                                    int width, int height, const uint8_t *ref8,
-                                    int ref_stride,
-                                    const JNT_COMP_PARAMS *jcp_param) {
+void aom_highbd_dist_wtd_comp_avg_pred_c(
+    uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+    const uint8_t *ref8, int ref_stride,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
@@ -1041,11 +1045,11 @@ void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
   }
 }
 
-void aom_highbd_jnt_comp_avg_upsampled_pred_c(
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
     int subpel_search) {
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
diff --git a/libaom/aom_dsp/variance.h b/libaom/aom_dsp/variance.h
index 362da29..4550c17 100644
--- a/libaom/aom_dsp/variance.h
+++ b/libaom/aom_dsp/variance.h
@@ -50,15 +50,14 @@ typedef unsigned int (*aom_subp_avg_variance_fn_t)(
     const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
     int b_stride, unsigned int *sse, const uint8_t *second_pred);
 
-typedef unsigned int (*aom_jnt_sad_avg_fn_t)(const uint8_t *a, int a_stride,
-                                             const uint8_t *b, int b_stride,
-                                             const uint8_t *second_pred,
-                                             const JNT_COMP_PARAMS *jcp_param);
+typedef unsigned int (*aom_dist_wtd_sad_avg_fn_t)(
+    const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
+    const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
 
-typedef unsigned int (*aom_jnt_subp_avg_variance_fn_t)(
+typedef unsigned int (*aom_dist_wtd_subp_avg_variance_fn_t)(
     const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
     int b_stride, unsigned int *sse, const uint8_t *second_pred,
-    const JNT_COMP_PARAMS *jcp_param);
+    const DIST_WTD_COMP_PARAMS *jcp_param);
 
 typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride,
@@ -101,8 +100,8 @@ typedef struct aom_variance_vtable {
   aom_obmc_sad_fn_t osdf;
   aom_obmc_variance_fn_t ovf;
   aom_obmc_subpixvariance_fn_t osvf;
-  aom_jnt_sad_avg_fn_t jsdaf;
-  aom_jnt_subp_avg_variance_fn_t jsvaf;
+  aom_dist_wtd_sad_avg_fn_t jsdaf;
+  aom_dist_wtd_subp_avg_variance_fn_t jsvaf;
 } aom_variance_fn_ptr_t;
 
 void aom_highbd_var_filter_block2d_bil_first_pass(
diff --git a/libaom/aom_dsp/x86/adaptive_quantize_sse2.c b/libaom/aom_dsp/x86/adaptive_quantize_sse2.c
new file mode 100644
index 0000000..3822c27
--- /dev/null
+++ b/libaom/aom_dsp/x86/adaptive_quantize_sse2.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_adaptive_sse2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  const __m128i zero = _mm_setzero_si128();
+  int index = 16;
+  int non_zero_count = (int)n_coeffs;
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i eob = zero, eob0, prescan0, prescan1, all_zero;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 0),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 0) };
+
+  int prescan_add[2];
+  for (int i = 0; i < 2; ++i)
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+  // max buffer is of size 256 as this functions calls with
+  // maximum n_coeffs as 256
+  int16_t prescan[256];
+  memset(prescan, -1, n_coeffs * sizeof(int16_t));
+
+  // TODO(Aniket): Experiment the following loop with intrinsic
+  for (int i = (int)n_coeffs - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const qm_val_t wt = 1 << AOM_QM_BITS;
+    const int coeff = coeff_ptr[rc] * wt;
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int prescan_add_val = prescan_add[rc != 0];
+    if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+      prescan[rc] = 0;
+      non_zero_count--;
+    } else {
+      break;
+    }
+  }
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif
+  // Setup global values.
+  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_coefficients(coeff_ptr);
+  coeff1 = load_coefficients(coeff_ptr + 8);
+
+  // Poor man's abs().
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+  prescan0 = _mm_loadu_si128((const __m128i *)prescan);
+  prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8));
+
+  cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_movemask_epi8(all_zero) == 0) {
+    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    // Reinsert signs
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr);
+    store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+    store_coefficients(coeff0, dqcoeff_ptr);
+    store_coefficients(coeff1, dqcoeff_ptr + 8);
+
+    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  }
+
+  // AC only loop.
+  // TODO(Aniket): Reduce the processing of coeff quatization
+  // based on eob logic
+  while (index < n_coeffs) {
+    coeff0 = load_coefficients(coeff_ptr + index);
+    coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+    prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index));
+    prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8));
+
+    cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+    cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_movemask_epi8(all_zero) == 0) {
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+      index += 16;
+      continue;
+    }
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr + index);
+    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+    coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+    store_coefficients(coeff0, dqcoeff_ptr + index);
+    store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = _mm_max_epi16(eob, eob0);
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment the following loop with intrinsic by combining
+  // with the quantization loop above
+  for (int i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    if (qcoeff) {
+      first = i;
+      break;
+    }
+  }
+  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+    const int rc = scan[(*eob_ptr - 1)];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const qm_val_t wt = (1 << AOM_QM_BITS);
+      const int coeff = coeff_ptr[rc] * wt;
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        *eob_ptr = 0;
+      }
+    }
+  }
+#endif
+}
+
+void aom_quantize_b_32x32_adaptive_sse2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  int index = 16;
+  int non_zero_count = (int)n_coeffs;
+  const int log_scale = 1;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i eob = zero, eob0, prescan0, prescan1, all_zero;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+  int prescan_add[2];
+  for (int i = 0; i < 2; ++i)
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+  // max buffer is of size 1024 as this functions calls with
+  // maximum n_coeffs as 1024
+  int16_t prescan[1024];
+  memset(prescan, -1, n_coeffs * sizeof(int16_t));
+
+  // TODO(Aniket): Experiment the following loop with intrinsic
+  for (int i = (int)n_coeffs - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const qm_val_t wt = 1 << AOM_QM_BITS;
+    const int coeff = coeff_ptr[rc] * wt;
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int prescan_add_val = prescan_add[rc != 0];
+    if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+      prescan[rc] = 0;
+      non_zero_count--;
+    } else {
+      break;
+    }
+  }
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif
+  // Setup global values.
+  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  round = _mm_load_si128((const __m128i *)round_ptr);
+  quant = _mm_load_si128((const __m128i *)quant_ptr);
+  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+  // Shift with rounding.
+  zbin = _mm_add_epi16(zbin, log_scale_vec);
+  round = _mm_add_epi16(round, log_scale_vec);
+  zbin = _mm_srli_epi16(zbin, log_scale);
+  round = _mm_srli_epi16(round, log_scale);
+  zbin = _mm_sub_epi16(zbin, one);
+  // Do DC and first 15 AC.
+  coeff0 = load_coefficients(coeff_ptr);
+  coeff1 = load_coefficients(coeff_ptr + 8);
+
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+  prescan0 = _mm_loadu_si128((const __m128i *)prescan);
+  prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8));
+
+  cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_movemask_epi8(all_zero) == 0) {
+    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+
+    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+    // Reinsert signs
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr);
+    store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+                                          &log_scale);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+                                          dqcoeff_ptr + 8, &log_scale);
+
+    eob =
+        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  }
+
+  // AC only loop.
+  // TODO(Aniket): Reduce the processing of coeff quatization
+  // based on eob logic
+  while (index < n_coeffs) {
+    coeff0 = load_coefficients(coeff_ptr + index);
+    coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+    prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index));
+    prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8));
+
+    cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+    cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_movemask_epi8(all_zero) == 0) {
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+      index += 16;
+      continue;
+    }
+    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr + index);
+    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+                                          dqcoeff_ptr + index, &log_scale);
+    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+                                          dqcoeff_ptr + index + 8, &log_scale);
+
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = _mm_max_epi16(eob, eob0);
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment the following loop with intrinsic by combining
+  // with the quantization loop above
+  for (int i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    if (qcoeff) {
+      first = i;
+      break;
+    }
+  }
+  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+    const int rc = scan[(*eob_ptr - 1)];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const qm_val_t wt = (1 << AOM_QM_BITS);
+      const int coeff = coeff_ptr[rc] * wt;
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        *eob_ptr = 0;
+      }
+    }
+  }
+#endif
+}
diff --git a/libaom/aom_dsp/x86/avg_intrin_sse2.c b/libaom/aom_dsp/x86/avg_intrin_sse2.c
index 969e4e1..0c20261 100644
--- a/libaom/aom_dsp/x86/avg_intrin_sse2.c
+++ b/libaom/aom_dsp/x86/avg_intrin_sse2.c
@@ -16,6 +16,129 @@
 #include "aom_dsp/x86/bitdepth_conversion_sse2.h"
 #include "aom_ports/mem.h"
 
+void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+                         int *min, int *max) {
+  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+  u0 = _mm_setzero_si128();
+  // Row 0
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff0 = _mm_max_epi16(diff, negdiff);
+  // Row 1
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+  // Row 2
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 3
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 4
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 5
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 6
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 7
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+  *max = _mm_extract_epi16(maxabsdiff, 0);
+
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+  *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
+unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0 = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 32) >> 6;
+}
+
+unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0 = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 8) >> 4;
+}
+
 static void hadamard_col8_sse2(__m128i *in, int iter) {
   __m128i a0 = in[0];
   __m128i a1 = in[1];
diff --git a/libaom/aom_dsp/x86/convolve_avx2.h b/libaom/aom_dsp/x86/convolve_avx2.h
index 3cc0e23..4a1068e 100644
--- a/libaom/aom_dsp/x86/convolve_avx2.h
+++ b/libaom/aom_dsp/x86/convolve_avx2.h
@@ -34,31 +34,214 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
   2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
 };
 
-DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = {
-  3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255,
-  3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
+#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP                                     \
+  for (i = 0; i < (im_h - 2); i += 2) {                                        \
+    __m256i data = _mm256_castsi128_si256(                                     \
+        _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));           \
+    data = _mm256_inserti128_si256(                                            \
+        data,                                                                  \
+        _mm_loadu_si128(                                                       \
+            (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),           \
+        1);                                                                    \
+                                                                               \
+    __m256i res = convolve_lowbd_x(data, coeffs_h, filt);                      \
+    res =                                                                      \
+        _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);              \
+  }                                                                            \
+                                                                               \
+  __m256i data_1 = _mm256_castsi128_si256(                                     \
+      _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));             \
+                                                                               \
+  __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt);                      \
+                                                                               \
+  res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+                                                                               \
+  _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+#define CONVOLVE_SR_VERTICAL_FILTER_8TAP                                      \
+  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
+  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
+  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));  \
+  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));  \
+  __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));  \
+  __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));  \
+                                                                              \
+  __m256i s[8];                                                               \
+  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
+  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
+  s[2] = _mm256_unpacklo_epi16(src_4, src_5);                                 \
+                                                                              \
+  s[4] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
+  s[5] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
+  s[6] = _mm256_unpackhi_epi16(src_4, src_5);                                 \
+                                                                              \
+  for (i = 0; i < h; i += 2) {                                                \
+    const int16_t *data = &im_block[i * im_stride];                           \
+                                                                              \
+    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
+    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
+                                                                              \
+    s[3] = _mm256_unpacklo_epi16(s6, s7);                                     \
+    s[7] = _mm256_unpackhi_epi16(s6, s7);                                     \
+                                                                              \
+    __m256i res_a = convolve(s, coeffs_v);                                    \
+    __m256i res_b = convolve(s + 4, coeffs_v);                                \
+                                                                              \
+    res_a =                                                                   \
+        _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);  \
+    res_b =                                                                   \
+        _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);  \
+                                                                              \
+    const __m256i res_a_round = _mm256_sra_epi32(                             \
+        _mm256_add_epi32(res_a, round_const_v), round_shift_v);               \
+    const __m256i res_b_round = _mm256_sra_epi32(                             \
+        _mm256_add_epi32(res_b, round_const_v), round_shift_v);               \
+                                                                              \
+    const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);   \
+    const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);         \
+                                                                              \
+    const __m128i res_0 = _mm256_castsi256_si128(res_8b);                     \
+    const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);                \
+                                                                              \
+    __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                 \
+    __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];    \
+    if (w - j > 4) {                                                          \
+      _mm_storel_epi64(p_0, res_0);                                           \
+      _mm_storel_epi64(p_1, res_1);                                           \
+    } else if (w == 4) {                                                      \
+      xx_storel_32(p_0, res_0);                                               \
+      xx_storel_32(p_1, res_1);                                               \
+    } else {                                                                  \
+      *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);                            \
+      *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);                            \
+    }                                                                         \
+                                                                              \
+    s[0] = s[1];                                                              \
+    s[1] = s[2];                                                              \
+    s[2] = s[3];                                                              \
+                                                                              \
+    s[4] = s[5];                                                              \
+    s[5] = s[6];                                                              \
+    s[6] = s[7];                                                              \
+  }
 
-DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
+#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP                               \
+  for (i = 0; i < im_h; i += 2) {                                              \
+    __m256i data = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));  \
+    if (i + 1 < im_h)                                                          \
+      data = _mm256_inserti128_si256(                                          \
+          data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);          \
+    src_h += (src_stride << 1);                                                \
+    __m256i res = convolve_lowbd_x(data, coeffs_x, filt);                      \
+                                                                               \
+    res =                                                                      \
+        _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+                                                                               \
+    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);              \
+  }
 
+#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP                                 \
+  __m256i s[8];                                                                \
+  __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));      \
+  __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));      \
+  __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));      \
+  __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));      \
+  __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));      \
+  __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));      \
+                                                                               \
+  s[0] = _mm256_unpacklo_epi16(s0, s1);                                        \
+  s[1] = _mm256_unpacklo_epi16(s2, s3);                                        \
+  s[2] = _mm256_unpacklo_epi16(s4, s5);                                        \
+                                                                               \
+  s[4] = _mm256_unpackhi_epi16(s0, s1);                                        \
+  s[5] = _mm256_unpackhi_epi16(s2, s3);                                        \
+  s[6] = _mm256_unpackhi_epi16(s4, s5);                                        \
+                                                                               \
+  for (i = 0; i < h; i += 2) {                                                 \
+    const int16_t *data = &im_block[i * im_stride];                            \
+                                                                               \
+    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));  \
+    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));  \
+                                                                               \
+    s[3] = _mm256_unpacklo_epi16(s6, s7);                                      \
+    s[7] = _mm256_unpackhi_epi16(s6, s7);                                      \
+                                                                               \
+    const __m256i res_a = convolve(s, coeffs_y);                               \
+    const __m256i res_a_round = _mm256_sra_epi32(                              \
+        _mm256_add_epi32(res_a, round_const_v), round_shift_v);                \
+                                                                               \
+    if (w - j > 4) {                                                           \
+      const __m256i res_b = convolve(s + 4, coeffs_y);                         \
+      const __m256i res_b_round = _mm256_sra_epi32(                            \
+          _mm256_add_epi32(res_b, round_const_v), round_shift_v);              \
+      const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);    \
+      const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);    \
+                                                                               \
+      if (do_average) {                                                        \
+        const __m256i data_ref_0 = load_line2_avx2(                            \
+            &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);  \
+        const __m256i comp_avg_res =                                           \
+            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);  \
+                                                                               \
+        const __m256i round_result = convolve_rounding(                        \
+            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);    \
+                                                                               \
+        const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \
+        const __m128i res_0 = _mm256_castsi256_si128(res_8);                   \
+        const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);              \
+                                                                               \
+        _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);      \
+        _mm_storel_epi64(                                                      \
+            (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);   \
+      } else {                                                                 \
+        const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);            \
+        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);         \
+                                                                               \
+        const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);       \
+        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),    \
+                        res_1);                                                \
+      }                                                                        \
+    } else {                                                                   \
+      const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);    \
+      const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);    \
+                                                                               \
+      if (do_average) {                                                        \
+        const __m256i data_ref_0 = load_line2_avx2(                            \
+            &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);  \
+                                                                               \
+        const __m256i comp_avg_res =                                           \
+            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);  \
+                                                                               \
+        const __m256i round_result = convolve_rounding(                        \
+            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);    \
+                                                                               \
+        const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \
+        const __m128i res_0 = _mm256_castsi256_si128(res_8);                   \
+        const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);              \
+                                                                               \
+        *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);  \
+        *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =              \
+            _mm_cvtsi128_si32(res_1);                                          \
+                                                                               \
+      } else {                                                                 \
+        const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);            \
+        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);         \
+                                                                               \
+        const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);       \
+        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),    \
+                        res_1);                                                \
+      }                                                                        \
+    }                                                                          \
+                                                                               \
+    s[0] = s[1];                                                               \
+    s[1] = s[2];                                                               \
+    s[2] = s[3];                                                               \
+                                                                               \
+    s[4] = s[5];                                                               \
+    s[5] = s[6];                                                               \
+    s[6] = s[7];                                                               \
+  }
 static INLINE void prepare_coeffs_lowbd(
     const InterpFilterParams *const filter_params, const int subpel_q4,
     __m256i *const coeffs /* [4] */) {
@@ -120,6 +303,17 @@ static INLINE __m256i convolve_lowbd(const __m256i *const s,
   return res;
 }
 
+static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s,
+                                          const __m256i *const coeffs) {
+  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+
+  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+  const __m256i res = _mm256_add_epi16(res_45, res_23);
+
+  return res;
+}
+
 static INLINE __m256i convolve(const __m256i *const s,
                                const __m256i *const coeffs) {
   const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
@@ -155,6 +349,17 @@ static INLINE __m256i convolve_lowbd_x(const __m256i data,
   return convolve_lowbd(s, coeffs);
 }
 
+static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data,
+                                            const __m256i *const coeffs,
+                                            const __m256i *const filt) {
+  __m256i s[2];
+
+  s[0] = _mm256_shuffle_epi8(data, filt[0]);
+  s[1] = _mm256_shuffle_epi8(data, filt[1]);
+
+  return convolve_lowbd_4tap(s, coeffs);
+}
+
 static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
                                          const __m256i *const res,
                                          const int do_average) {
@@ -172,9 +377,9 @@ static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
 static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
                                const __m256i *const res_unsigned,
                                const __m256i *const wt,
-                               const int use_jnt_comp_avg) {
+                               const int use_dist_wtd_comp_avg) {
   __m256i res;
-  if (use_jnt_comp_avg) {
+  if (use_dist_wtd_comp_avg) {
     const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
     const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
 
@@ -206,9 +411,9 @@ static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
                                       const __m256i *const res_unsigned,
                                       const __m256i *const wt0,
                                       const __m256i *const wt1,
-                                      const int use_jnt_comp_avg) {
+                                      const int use_dist_wtd_comp_avg) {
   __m256i res;
-  if (use_jnt_comp_avg) {
+  if (use_dist_wtd_comp_avg) {
     const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
     const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
     const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
diff --git a/libaom/aom_dsp/x86/convolve_sse2.h b/libaom/aom_dsp/x86/convolve_sse2.h
index 445d04b..385c7c7 100644
--- a/libaom/aom_dsp/x86/convolve_sse2.h
+++ b/libaom/aom_dsp/x86/convolve_sse2.h
@@ -78,9 +78,9 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s,
 static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
                                const __m128i *const res_unsigned,
                                const __m128i *const wt,
-                               const int use_jnt_comp_avg) {
+                               const int use_dist_wtd_avg) {
   __m128i res;
-  if (use_jnt_comp_avg) {
+  if (use_dist_wtd_avg) {
     const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
     const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
 
diff --git a/libaom/aom_dsp/x86/convolve_sse4_1.h b/libaom/aom_dsp/x86/convolve_sse4_1.h
index 6b8388d..b1a3bb4 100644
--- a/libaom/aom_dsp/x86/convolve_sse4_1.h
+++ b/libaom/aom_dsp/x86/convolve_sse4_1.h
@@ -35,9 +35,9 @@ static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
                                              const __m128i *const res_unsigned,
                                              const __m128i *const wt0,
                                              const __m128i *const wt1,
-                                             const int use_jnt_comp_avg) {
+                                             const int use_dist_wtd_avg) {
   __m128i res;
-  if (use_jnt_comp_avg) {
+  if (use_dist_wtd_avg) {
     const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0);
     const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1);
 
diff --git a/libaom/aom_dsp/x86/fft_avx2.c b/libaom/aom_dsp/x86/fft_avx2.c
index 54da022..4cccc5f 100644
--- a/libaom/aom_dsp/x86/fft_avx2.c
+++ b/libaom/aom_dsp/x86/fft_avx2.c
@@ -11,6 +11,7 @@
 
 #include <immintrin.h>
 
+#include "config/aom_dsp_rtcd.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/fft_common.h"
 
diff --git a/libaom/aom_dsp/x86/fft_sse2.c b/libaom/aom_dsp/x86/fft_sse2.c
index 12bdc3e..6f20a3c 100644
--- a/libaom/aom_dsp/x86/fft_sse2.c
+++ b/libaom/aom_dsp/x86/fft_sse2.c
@@ -11,6 +11,7 @@ s * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 
 #include <xmmintrin.h>
 
+#include "config/aom_dsp_rtcd.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/fft_common.h"
 
diff --git a/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c b/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 097e077..70b91c6 100644
--- a/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -727,8 +727,8 @@ void aom_highbd_lpf_horizontal_14_dual_sse2(
                                    _limit1, _thresh1, bd);
 
   for (i = 0; i < 6; i++) {
-    _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
-    _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
+    _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
+    _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]);
   }
 }
 
diff --git a/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index 58e5f98..2f4ffd3 100644
--- a/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -146,3 +146,61 @@ void aom_highbd_quantize_b_32x32_sse2(
   }
   *eob_ptr = eob + 1;
 }
+
+void aom_highbd_quantize_b_64x64_sse2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  __m128i zbins[2];
+  __m128i nzbins[2];
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2);
+  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2);
+  (void)scan;
+  zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+  zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // Pre-scan pass
+  for (i = 0; i < n_coeffs / 4; i++) {
+    __m128i coeffs, cmp1, cmp2;
+    int test;
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+    cmp1 = _mm_and_si128(cmp1, cmp2);
+    test = _mm_movemask_epi8(cmp1);
+    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+  }
+
+  // Quantization pass: only process the coefficients selected in
+  // pre-scan pass. Note: idx can be zero.
+  for (i = 0; i < idx; i++) {
+    const int rc = idx_arr[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+    const uint32_t abs_qcoeff =
+        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
+    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
+    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+  }
+  *eob_ptr = eob + 1;
+}
diff --git a/libaom/aom_dsp/x86/highbd_variance_sse2.c b/libaom/aom_dsp/x86/highbd_variance_sse2.c
index 226576b..fc5678d 100644
--- a/libaom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/libaom/aom_dsp/x86/highbd_variance_sse2.c
@@ -287,30 +287,38 @@ DECLS(sse2);
   uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
       const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
-    uint32_t sse;                                                              \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    int se = aom_highbd_sub_pixel_variance##wf##xh_##opt(                      \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
-        NULL);                                                                 \
-    if (w > wf) {                                                              \
-      unsigned int sse2;                                                       \
+    int se = 0;                                                                \
+    unsigned int sse = 0;                                                      \
+    unsigned int sse2;                                                         \
+    int row_rep = (w > 64) ? 2 : 1;                                            \
+    for (int wd_64 = 0; wd_64 < row_rep; wd_64++) {                            \
+      src += wd_64 * 64;                                                       \
+      dst += wd_64 * 64;                                                       \
       int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
-          &sse2, NULL, NULL);                                                  \
+          src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2,      \
+          NULL, NULL);                                                         \
       se += se2;                                                               \
       sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
+      if (w > wf) {                                                            \
         se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+            src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
             &sse2, NULL, NULL);                                                \
         se += se2;                                                             \
         sse += sse2;                                                           \
+        if (w > wf * 2) {                                                      \
+          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
+              src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,  \
+              h, &sse2, NULL, NULL);                                           \
+          se += se2;                                                           \
+          sse += sse2;                                                         \
+          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
+              src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,  \
+              h, &sse2, NULL, NULL);                                           \
+          se += se2;                                                           \
+          sse += sse2;                                                         \
+        }                                                                      \
       }                                                                        \
     }                                                                          \
     *sse_ptr = sse;                                                            \
@@ -322,33 +330,42 @@ DECLS(sse2);
       const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
     int64_t var;                                                               \
     uint32_t sse;                                                              \
+    uint64_t long_sse = 0;                                                     \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    int se = aom_highbd_sub_pixel_variance##wf##xh_##opt(                      \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
-        NULL);                                                                 \
-    if (w > wf) {                                                              \
-      uint32_t sse2;                                                           \
+    int se = 0;                                                                \
+    int row_rep = (w > 64) ? 2 : 1;                                            \
+    for (int wd_64 = 0; wd_64 < row_rep; wd_64++) {                            \
+      src += wd_64 * 64;                                                       \
+      dst += wd_64 * 64;                                                       \
       int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
-          &sse2, NULL, NULL);                                                  \
+          src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+          NULL);                                                               \
       se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
+      long_sse += sse;                                                         \
+      if (w > wf) {                                                            \
+        uint32_t sse2;                                                         \
         se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+            src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
             &sse2, NULL, NULL);                                                \
         se += se2;                                                             \
-        sse += sse2;                                                           \
+        long_sse += sse2;                                                      \
+        if (w > wf * 2) {                                                      \
+          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
+              src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,  \
+              h, &sse2, NULL, NULL);                                           \
+          se += se2;                                                           \
+          long_sse += sse2;                                                    \
+          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
+              src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,  \
+              h, &sse2, NULL, NULL);                                           \
+          se += se2;                                                           \
+          long_sse += sse2;                                                    \
+        }                                                                      \
       }                                                                        \
     }                                                                          \
     se = ROUND_POWER_OF_TWO(se, 2);                                            \
-    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
+    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4);                           \
     *sse_ptr = sse;                                                            \
     var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
     return (var >= 0) ? (uint32_t)var : 0;                                     \
@@ -364,35 +381,38 @@ DECLS(sse2);
     uint64_t long_sse = 0;                                                     \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
+    int row_rep = (w > 64) ? 2 : 1;                                            \
     for (start_row = 0; start_row < h; start_row += 16) {                      \
       uint32_t sse2;                                                           \
       int height = h - start_row < 16 ? h - start_row : 16;                    \
-      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
-          dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL,     \
-          NULL);                                                               \
-      se += se2;                                                               \
-      long_sse += sse2;                                                        \
-      if (w > wf) {                                                            \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
-            y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
-            &sse2, NULL, NULL);                                                \
+      uint16_t *src_tmp = src + (start_row * src_stride);                      \
+      uint16_t *dst_tmp = dst + (start_row * dst_stride);                      \
+      for (int wd_64 = 0; wd_64 < row_rep; wd_64++) {                          \
+        src_tmp += wd_64 * 64;                                                 \
+        dst_tmp += wd_64 * 64;                                                 \
+        int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                 \
+            src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride,      \
+            height, &sse2, NULL, NULL);                                        \
         se += se2;                                                             \
         long_sse += sse2;                                                      \
-        if (w > wf * 2) {                                                      \
+        if (w > wf) {                                                          \
           se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
-              height, &sse2, NULL, NULL);                                      \
-          se += se2;                                                           \
-          long_sse += sse2;                                                    \
-          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
-              height, &sse2, NULL, NULL);                                      \
+              src_tmp + 16, src_stride, x_offset, y_offset, dst_tmp + 16,      \
+              dst_stride, height, &sse2, NULL, NULL);                          \
           se += se2;                                                           \
           long_sse += sse2;                                                    \
+          if (w > wf * 2) {                                                    \
+            se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                 \
+                src_tmp + 32, src_stride, x_offset, y_offset, dst_tmp + 32,    \
+                dst_stride, height, &sse2, NULL, NULL);                        \
+            se += se2;                                                         \
+            long_sse += sse2;                                                  \
+            se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                 \
+                src_tmp + 48, src_stride, x_offset, y_offset, dst_tmp + 48,    \
+                dst_stride, height, &sse2, NULL, NULL);                        \
+            se += se2;                                                         \
+            long_sse += sse2;                                                  \
+          }                                                                    \
         }                                                                      \
       }                                                                        \
     }                                                                          \
@@ -403,22 +423,25 @@ DECLS(sse2);
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t));    \
-  FN(16, 4, 16, 4, 2, opt, (int64_t));  \
-  FN(8, 32, 8, 3, 5, opt, (int64_t));   \
-  FN(32, 8, 16, 5, 3, opt, (int64_t));  \
-  FN(16, 64, 16, 4, 6, opt, (int64_t)); \
+#define FNS(opt)                          \
+  FN(128, 128, 16, 7, 7, opt, (int64_t)); \
+  FN(128, 64, 16, 7, 6, opt, (int64_t));  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t));  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t));   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t));   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t));   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t));   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t));   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t));   \
+  FN(16, 16, 16, 4, 4, opt, (int64_t));   \
+  FN(16, 8, 16, 4, 3, opt, (int64_t));    \
+  FN(8, 16, 8, 3, 4, opt, (int64_t));     \
+  FN(8, 8, 8, 3, 3, opt, (int64_t));      \
+  FN(8, 4, 8, 3, 2, opt, (int64_t));      \
+  FN(16, 4, 16, 4, 2, opt, (int64_t));    \
+  FN(8, 32, 8, 3, 5, opt, (int64_t));     \
+  FN(32, 8, 16, 5, 3, opt, (int64_t));    \
+  FN(16, 64, 16, 4, 6, opt, (int64_t));   \
   FN(64, 16, 16, 6, 4, opt, (int64_t))
 
 FNS(sse2);
@@ -603,7 +626,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
     const int ref_num = 0;
     const int is_intrabc = is_intrabc_block(mi);
     const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
     const int is_scaled = av1_is_scaled(sf);
 
     if (is_scaled) {
@@ -765,11 +788,11 @@ void aom_highbd_comp_avg_upsampled_pred_sse2(
   }
 }
 
-static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
-                                               const __m128i *w0,
-                                               const __m128i *w1,
-                                               const __m128i *r,
-                                               void *const result) {
+static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
+                                                    const __m128i *w0,
+                                                    const __m128i *w1,
+                                                    const __m128i *r,
+                                                    void *const result) {
   assert(DIST_PRECISION_BITS <= 4);
   __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
   __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
@@ -780,11 +803,10 @@ static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
   xx_storeu_128(result, shift);
 }
 
-void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
-                                       const uint8_t *pred8, int width,
-                                       int height, const uint8_t *ref8,
-                                       int ref_stride,
-                                       const JNT_COMP_PARAMS *jcp_param) {
+void aom_highbd_dist_wtd_comp_avg_pred_sse2(
+    uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+    const uint8_t *ref8, int ref_stride,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
   int i;
   const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
   const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
@@ -806,7 +828,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
         __m128i p0 = xx_loadu_128(ref);
         __m128i p1 = xx_loadu_128(pred);
 
-        highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+        highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
 
         comp_pred += 8;
         pred += 8;
@@ -823,7 +845,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
       __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
       __m128i p1 = xx_loadu_128(pred);
 
-      highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+      highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
 
       comp_pred += 8;
       pred += 8;
@@ -832,11 +854,11 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
   }
 }
 
-void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
     int subpel_search) {
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   int n;
@@ -860,7 +882,7 @@ void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
     __m128i p0 = xx_loadu_128(comp_pred16);
     __m128i p1 = xx_loadu_128(pred);
 
-    highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
+    highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
 
     comp_pred16 += 8;
     pred += 8;
diff --git a/libaom/aom_dsp/x86/intrapred_asm_sse2.asm b/libaom/aom_dsp/x86/intrapred_asm_sse2.asm
index 9aece27..0eb6323 100644
--- a/libaom/aom_dsp/x86/intrapred_asm_sse2.asm
+++ b/libaom/aom_dsp/x86/intrapred_asm_sse2.asm
@@ -27,23 +27,6 @@ pw2_32:  times 8 dw 16
 
 SECTION .text
 
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
-  pavgb               %4, %1, %3
-  pxor                %3, %1
-  pand                %3, [GLOBAL(pb_1)]
-  psubb               %4, %3
-  pavgb               %4, %2
-%endmacro
-
 INIT_XMM sse2
 cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
   GET_GOT     goffsetq
diff --git a/libaom/aom_dsp/x86/intrapred_avx2.c b/libaom/aom_dsp/x86/intrapred_avx2.c
index 5f3e7bb..17f35a0 100644
--- a/libaom/aom_dsp/x86/intrapred_avx2.c
+++ b/libaom/aom_dsp/x86/intrapred_avx2.c
@@ -1481,9 +1481,10 @@ static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
 void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint16_t *above,
                                       const uint16_t *left, int upsample_above,
-                                      int dx, int dy) {
+                                      int dx, int dy, int bd) {
   (void)left;
   (void)dy;
+  (void)bd;
 
   switch (bw) {
     case 4:
@@ -1511,8 +1512,8 @@ void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
   return;
 }
 
-static void transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc,
-                             uint16_t *dst, ptrdiff_t pitchDst) {
+static void highbd_transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc,
+                                    uint16_t *dst, ptrdiff_t pitchDst) {
   __m128i r0, r1, r2, r3, r4, r5, r6, r7, r0_Lo, r1_Lo, r2_Lo, r3_Lo, r4_Lo,
       r5_Lo, r6_Lo;
   r0 = _mm_load_si128(
@@ -1579,12 +1580,921 @@ static void transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc,
   _mm_storeu_si128((__m128i *)(dst + 7 * pitchDst), r3);
 }
 
-static void transpose(const uint16_t *src, ptrdiff_t pitchSrc, uint16_t *dst,
-                      ptrdiff_t pitchDst, int width, int height) {
+static uint8_t HighbdLoadMaskx[8][16] = {
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+};
+
+static uint8_t HighbdEvenOddMaskx4[8][16] = {
+  { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14,
+    15 },  // 0=0,1, 1=2,3, 2=4,5, 3=6,7, 4=8,9, 5=10,11, 6=12,13, 7=14,15,
+           // >7=0,1
+  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
+  { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
+  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 0, 1, 0, 1 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 0, 1 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 0, 1 },
+  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15 }
+};
+
+static uint16_t HighbdEvenOddMaskx8_2[8][16] = {
+  { 0, 2, 4, 6, 8, 10, 12, 14 },      { 2, 2, 4, 6, 8, 10, 12, 14 },
+  { 4, 4, 4, 6, 8, 10, 12, 14 },      { 6, 6, 6, 6, 8, 10, 12, 14 },
+  { 8, 8, 8, 8, 8, 10, 12, 14 },      { 10, 10, 10, 10, 10, 10, 12, 14 },
+  { 12, 12, 12, 12, 12, 12, 12, 14 }, { 14, 14, 14, 14, 14, 14, 14, 14 },
+};
+
+static uint16_t HighbdBaseMask[17][16] = {
+  {
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+  },
+  { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
+    0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
+    0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
+    0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
+  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
+};
+
+static void highbd_dr_prediction_z2_Nx4_avx2(
+    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+    const uint16_t *left, int upsample_above, int upsample_left, int dx,
+    int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  // a assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0_x, a1_x, a32, a16;
+  __m256i diff;
+  __m128i c3f, min_base_y128;
+
+  a16 = _mm256_set1_epi32(16);
+  c3f = _mm_set1_epi32(0x3f);
+  min_base_y128 = _mm_set1_epi32(min_base_y);
+
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, shift;
+    __m128i resx, resy, resxy;
+    __m128i a0_x128, a1_x128;
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 4) {
+      base_min_diff = 4;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 3) {
+      a0_x = _mm256_setzero_si256();
+      a1_x = _mm256_setzero_si256();
+      shift = _mm256_setzero_si256();
+    } else {
+      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+      if (upsample_above) {
+        a0_x128 = _mm_shuffle_epi8(a0_x128,
+                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
+        a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+        shift = _mm256_castsi128_si256(_mm_srli_epi32(
+            _mm_and_si128(
+                _mm_slli_epi32(
+                    _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+                                   (2 << 6) - y * dx, (3 << 6) - y * dx),
+                    upsample_above),
+                c3f),
+            1));
+      } else {
+        a0_x128 =
+            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+        a1_x128 = _mm_srli_si128(a0_x128, 2);
+
+        shift = _mm256_castsi128_si256(_mm_srli_epi32(
+            _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+                                         (2 << 6) - y * dx, (3 << 6) - y * dx),
+                          c3f),
+            1));
+      }
+      a0_x = _mm256_cvtepu16_epi32(a0_x128);
+      a1_x = _mm256_cvtepu16_epi32(a1_x128);
+    }
+    // y calc
+    __m128i a0_y, a1_y, shifty;
+    if (base_x < min_base_x) {
+      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+      DECLARE_ALIGNED(32, int, base_y_c[4]);
+      r6 = _mm_set1_epi32(r << 6);
+      dy128 = _mm_set1_epi32(dy);
+      c1234 = _mm_setr_epi32(1, 2, 3, 4);
+      y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
+      base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
+      mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
+      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]]);
+      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
+
+      if (upsample_left) {
+        shifty = _mm_srli_epi32(
+            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
+      } else {
+        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
+      }
+      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+      shift = _mm256_inserti128_si256(shift, shifty, 1);
+    }
+
+    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
+    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
+    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+
+    b = _mm256_mullo_epi32(diff, shift);
+    res = _mm256_add_epi32(a32, b);
+    res = _mm256_srli_epi32(res, 5);
+
+    resx = _mm256_castsi256_si128(res);
+    resx = _mm_packus_epi32(resx, resx);
+
+    resy = _mm256_extracti128_si256(res, 1);
+    resy = _mm_packus_epi32(resy, resy);
+
+    resxy =
+        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+    _mm_storel_epi64((__m128i *)(dst), resxy);
+    dst += stride;
+  }
+}
+
+static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
+    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+    const uint16_t *left, int upsample_above, int upsample_left, int dx,
+    int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
+  __m256i diff;
+  __m128i a0_x128, a1_x128;
+
+  a16 = _mm256_set1_epi32(16);
+  c3f = _mm256_set1_epi32(0x3f);
+  min_base_y256 = _mm256_set1_epi32(min_base_y);
+
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, shift;
+    __m128i resx, resy, resxy;
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 8) {
+      base_min_diff = 8;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 7) {
+      resx = _mm_setzero_si128();
+    } else {
+      if (upsample_above) {
+        a0_x128 = _mm_setr_epi16(
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]);
+        a1_x128 = _mm_setr_epi16(
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][1]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]);
+        shift = _mm256_srli_epi32(
+            _mm256_and_si256(
+                _mm256_slli_epi32(
+                    _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
+                                      (2 << 6) - y * dx, (3 << 6) - y * dx,
+                                      (4 << 6) - y * dx, (5 << 6) - y * dx,
+                                      (6 << 6) - y * dx, (7 << 6) - y * dx),
+                    upsample_above),
+                c3f),
+            1);
+      } else {
+        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+        a0_x128 =
+            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+        a1_x128 =
+            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+        shift = _mm256_srli_epi32(
+            _mm256_and_si256(
+                _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
+                                  (3 << 6) - y * dx, (4 << 6) - y * dx,
+                                  (5 << 6) - y * dx, (6 << 6) - y * dx,
+                                  (7 << 6) - y * dx),
+                c3f),
+            1);
+      }
+
+      a0_x = _mm256_cvtepu16_epi32(a0_x128);
+      a1_x = _mm256_cvtepu16_epi32(a1_x128);
+
+      diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
+      a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
+      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+
+      b = _mm256_mullo_epi32(diff, shift);
+      res = _mm256_add_epi32(a32, b);
+      res = _mm256_srli_epi32(res, 5);
+
+      resx = _mm256_castsi256_si128(_mm256_packus_epi32(
+          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+    }
+    // y calc
+    if (base_x < min_base_x) {
+      DECLARE_ALIGNED(32, int, base_y_c[8]);
+      __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
+      r6 = _mm256_set1_epi32(r << 6);
+      dy256 = _mm256_set1_epi32(dy);
+      c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+      y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+      base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
+      mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+      base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+      _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+      a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+          left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+          left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+          left[base_y_c[6]], left[base_y_c[7]]));
+      a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+          left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+          left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+          left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
+
+      if (upsample_left) {
+        shift = _mm256_srli_epi32(
+            _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
+            1);
+      } else {
+        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
+      }
+      diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
+      a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
+      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+
+      b = _mm256_mullo_epi32(diff, shift);
+      res = _mm256_add_epi32(a32, b);
+      res = _mm256_srli_epi32(res, 5);
+
+      resy = _mm256_castsi256_si128(_mm256_packus_epi32(
+          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+    } else {
+      resy = resx;
+    }
+    resxy =
+        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+    _mm_storeu_si128((__m128i *)(dst), resxy);
+    dst += stride;
+  }
+}
+
+static void highbd_dr_prediction_z2_Nx8_avx2(
+    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+    const uint16_t *left, int upsample_above, int upsample_left, int dx,
+    int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m128i c3f, min_base_y128;
+  __m256i a0_x, a1_x, diff, a32, a16;
+  __m128i a0_x128, a1_x128;
+
+  a16 = _mm256_set1_epi16(16);
+  c3f = _mm_set1_epi16(0x3f);
+  min_base_y128 = _mm_set1_epi16(min_base_y);
+
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, shift;
+    __m128i resx, resy, resxy;
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 8) {
+      base_min_diff = 8;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 7) {
+      a0_x = _mm256_setzero_si256();
+      a1_x = _mm256_setzero_si256();
+      shift = _mm256_setzero_si256();
+    } else {
+      if (upsample_above) {
+        a0_x128 = _mm_setr_epi16(
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]],
+            above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]);
+        a1_x128 = _mm_setr_epi16(
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][1]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]],
+            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]);
+        shift = _mm256_castsi128_si256(_mm_srli_epi16(
+            _mm_and_si128(
+                _mm_slli_epi16(
+                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
+                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
+                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
+                    upsample_above),
+                c3f),
+            1));
+      } else {
+        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+        a0_x128 =
+            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+        a1_x128 =
+            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+        shift = _mm256_castsi128_si256(_mm_srli_epi16(
+            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
+                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
+                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
+                          c3f),
+            1));
+      }
+      a0_x = _mm256_castsi128_si256(a0_x128);
+      a1_x = _mm256_castsi128_si256(a1_x128);
+    }
+
+    // y calc
+    __m128i a0_y, a1_y, shifty;
+    if (base_x < min_base_x) {
+      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+      r6 = _mm_set1_epi16(r << 6);
+      dy128 = _mm_set1_epi16(dy);
+      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]],
+                            left[base_y_c[4]], left[base_y_c[5]],
+                            left[base_y_c[6]], left[base_y_c[7]]);
+      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
+                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
+
+      if (upsample_left) {
+        shifty = _mm_srli_epi16(
+            _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
+      } else {
+        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+      }
+      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+      shift = _mm256_inserti128_si256(shift, shifty, 1);
+    }
+
+    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
+    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
+    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+    b = _mm256_mullo_epi16(diff, shift);
+    res = _mm256_add_epi16(a32, b);
+    res = _mm256_srli_epi16(res, 5);
+
+    resx = _mm256_castsi256_si128(res);
+    resy = _mm256_extracti128_si256(res, 1);
+
+    resxy =
+        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+    _mm_storeu_si128((__m128i *)(dst), resxy);
+    dst += stride;
+  }
+}
+
+static void highbd_dr_prediction_32bit_z2_HxW_avx2(
+    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+    const uint16_t *left, int upsample_above, int upsample_left, int dx,
+    int dy) {
+  // here upsample_above and upsample_left are 0 by design of
+  // av1_use_intra_edge_upsample
+  const int min_base_x = -1;
+  const int min_base_y = -1;
+  (void)upsample_above;
+  (void)upsample_left;
+  const int frac_bits_x = 6;
+  const int frac_bits_y = 6;
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16;
+  __m256i diff, min_base_y256, c3f;
+  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
+
+  a16 = _mm256_set1_epi32(16);
+  min_base_y256 = _mm256_set1_epi16(min_base_y);
+  c3f = _mm256_set1_epi32(0x3f);
+
+  for (int r = 0; r < H; r++) {
+    __m256i b, res, shift;
+    __m256i resx[2], resy[2];
+    __m256i resxy;
+    for (int j = 0; j < W; j += 16) {
+      int y = r + 1;
+      int base_x = (-y * dx) >> frac_bits_x;
+      int base_shift = 0;
+      if ((base_x + j) < (min_base_x - 1)) {
+        base_shift = (min_base_x - (base_x + j) - 1);
+      }
+      int base_min_diff = (min_base_x - base_x - j);
+      if (base_min_diff > 16) {
+        base_min_diff = 16;
+      } else {
+        if (base_min_diff < 0) base_min_diff = 0;
+      }
+
+      if (base_shift > 7) {
+        resx[0] = _mm256_setzero_si256();
+      } else {
+        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+        a1_x128 =
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+        a0_x128 =
+            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+        a1_x128 =
+            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+        a0_x = _mm256_cvtepu16_epi32(a0_x128);
+        a1_x = _mm256_cvtepu16_epi32(a1_x128);
+
+        shift = _mm256_srli_epi32(
+            _mm256_and_si256(
+                _mm256_setr_epi32(
+                    ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
+                    ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
+                    ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
+                    ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
+                c3f),
+            1);
+
+        diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
+        a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
+        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+
+        b = _mm256_mullo_epi32(diff, shift);
+        res = _mm256_add_epi32(a32, b);
+        res = _mm256_srli_epi32(res, 5);
+
+        resx[0] = _mm256_packus_epi32(
+            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+      }
+      int base_shift8 = 0;
+      if ((base_x + j + 8) < (min_base_x - 1)) {
+        base_shift8 = (min_base_x - (base_x + j + 8) - 1);
+      }
+      if (base_shift8 > 7) {
+        resx[1] = _mm256_setzero_si256();
+      } else {
+        a0_1_x128 =
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8 + j));
+        a1_1_x128 =
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9 + j));
+        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
+                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
+        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
+                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
+
+        a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
+        a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
+
+        shift = _mm256_srli_epi32(
+            _mm256_and_si256(
+                _mm256_setr_epi32(
+                    ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
+                    ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
+                    ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
+                    ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
+                c3f),
+            1);
+
+        diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
+        a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
+        a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
+        b = _mm256_mullo_epi32(diff, shift);
+
+        resx[1] = _mm256_add_epi32(a32, b);
+        resx[1] = _mm256_srli_epi32(resx[1], 5);
+        resx[1] = _mm256_packus_epi32(
+            resx[1],
+            _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
+      }
+      resx[0] =
+          _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
+                                  1);  // 16 16bit values
+
+      // y calc
+      if ((base_x < min_base_x)) {
+        DECLARE_ALIGNED(32, int, base_y_c[16]);
+        __m256i r6, c256, dy256, y_c256, y_c_1_256, base_y_c256, mask256;
+        r6 = _mm256_set1_epi32(r << 6);
+        dy256 = _mm256_set1_epi32(dy);
+        c256 = _mm256_setr_epi32(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
+                                 7 + j, 8 + j);
+        y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+        base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
+        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+        c256 = _mm256_setr_epi32(9 + j, 10 + j, 11 + j, 12 + j, 13 + j, 14 + j,
+                                 15 + j, 16 + j);
+        y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+        base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
+        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+        _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
+
+        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+            left[base_y_c[6]], left[base_y_c[7]]));
+        a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+            left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
+
+        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
+
+        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
+        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
+        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+
+        b = _mm256_mullo_epi32(diff, shift);
+        res = _mm256_add_epi32(a32, b);
+        res = _mm256_srli_epi32(res, 5);
+
+        resy[0] = _mm256_packus_epi32(
+            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+            left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
+            left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
+            left[base_y_c[14]], left[base_y_c[15]]));
+        a1_y = _mm256_cvtepu16_epi32(
+            _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
+                           left[base_y_c[10] + 1], left[base_y_c[11] + 1],
+                           left[base_y_c[12] + 1], left[base_y_c[13] + 1],
+                           left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
+        shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
+
+        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
+        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
+        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+
+        b = _mm256_mullo_epi32(diff, shift);
+        res = _mm256_add_epi32(a32, b);
+        res = _mm256_srli_epi32(res, 5);
+
+        resy[1] = _mm256_packus_epi32(
+            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+        resy[0] =
+            _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
+                                    1);  // 16 16bit values
+      } else {
+        resy[0] = resx[0];
+      }
+      resxy = _mm256_blendv_epi8(resx[0], resy[0],
+                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
+      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
+    }  // for j
+    dst += stride;
+  }
+}
+
+static void highbd_dr_prediction_z2_HxW_avx2(
+    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+    const uint16_t *left, int upsample_above, int upsample_left, int dx,
+    int dy) {
+  // here upsample_above and upsample_left are 0 by design of
+  // av1_use_intra_edge_upsample
+  const int min_base_x = -1;
+  const int min_base_y = -1;
+  (void)upsample_above;
+  (void)upsample_left;
+  const int frac_bits_x = 6;
+  const int frac_bits_y = 6;
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0_x, a1_x, a32, a16, c3f;
+  __m256i diff, min_base_y256;
+
+  a16 = _mm256_set1_epi16(16);
+  min_base_y256 = _mm256_set1_epi16(min_base_y);
+  c3f = _mm256_set1_epi16(0x3f);
+
+  for (int r = 0; r < H; r++) {
+    __m256i b, res, shift;
+    __m256i resx, resy;
+    __m256i resxy;
+    __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, shiftx;
+
+    for (int j = 0; j < W; j += 16) {
+      int y = r + 1;
+      int base_x = (-y * dx) >> frac_bits_x;
+      int base_shift = 0;
+      if ((base_x + j) < (min_base_x - 1)) {
+        base_shift = (min_base_x - (base_x + j) - 1);
+      }
+      int base_min_diff = (min_base_x - base_x - j);
+      if (base_min_diff > 16) {
+        base_min_diff = 16;
+      } else {
+        if (base_min_diff < 0) base_min_diff = 0;
+      }
+
+      if (base_shift > 7) {
+        a0_x = _mm256_setzero_si256();
+        a1_x = _mm256_setzero_si256();
+        shift = _mm256_setzero_si256();
+      } else {
+        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+        a1_x128 =
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+        a0_x128 =
+            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+        a1_x128 =
+            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+        a0_x = _mm256_castsi128_si256(a0_x128);
+        a1_x = _mm256_castsi128_si256(a1_x128);
+
+        shift = _mm256_castsi128_si256(_mm_srli_epi16(
+            _mm_and_si128(_mm_setr_epi16(
+                              ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
+                              ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
+                              ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
+                              ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
+                          _mm256_castsi256_si128(c3f)),
+            1));
+      }
+
+      base_shift = 0;
+      if ((base_x + j + 8) < (min_base_x - 1)) {
+        base_shift = (min_base_x - (base_x + j + 8) - 1);
+      }
+      if (base_shift <= 7) {
+        a0_1_x128 =
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j));
+        a1_1_x128 =
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j));
+        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
+                                     *(__m128i *)HighbdLoadMaskx[base_shift]);
+        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
+                                     *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+        shiftx = _mm_srli_epi16(
+            _mm_and_si128(
+                _mm_setr_epi16(
+                    ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
+                    ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
+                    ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
+                    ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
+                _mm256_castsi256_si128(c3f)),
+            1);
+
+        a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
+        a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
+        shift = _mm256_inserti128_si256(shift, shiftx, 1);
+      }
+
+      diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
+      a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
+      a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+      b = _mm256_mullo_epi16(diff, shift);
+      res = _mm256_add_epi16(a32, b);
+      resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
+
+      // y calc
+      __m256i a0_y, a1_y, shifty;
+      if ((base_x < min_base_x)) {
+        DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+        __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16;
+        r6 = _mm256_set1_epi16(r << 6);
+        dy256 = _mm256_set1_epi16(dy);
+        c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
+                                 7 + j, 8 + j, 9 + j, 10 + j, 11 + j, 12 + j,
+                                 13 + j, 14 + j, 15 + j, 16 + j);
+        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
+                                 _mm256_srli_epi16(min_base_y256, 1));
+        y_c256 = _mm256_sub_epi16(r6, mul16);
+        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
+        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
+        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+        a0_y = _mm256_setr_epi16(
+            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+            left[base_y_c[15]]);
+        a1_y = _mm256_setr_epi16(
+            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+            left[base_y_c[6] + 1], left[base_y_c[7] + 1], left[base_y_c[8] + 1],
+            left[base_y_c[9] + 1], left[base_y_c[10] + 1],
+            left[base_y_c[11] + 1], left[base_y_c[12] + 1],
+            left[base_y_c[13] + 1], left[base_y_c[14] + 1],
+            left[base_y_c[15] + 1]);
+
+        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
+
+        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
+        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
+        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+        b = _mm256_mullo_epi16(diff, shifty);
+        res = _mm256_add_epi16(a32, b);
+        resy = _mm256_srli_epi16(res, 5);
+      } else {
+        resy = _mm256_setzero_si256();
+      }
+
+      resxy = _mm256_blendv_epi8(resx, resy,
+                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
+      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
+    }  // for j
+    dst += stride;
+  }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+                                      int bh, const uint16_t *above,
+                                      const uint16_t *left, int upsample_above,
+                                      int upsample_left, int dx, int dy,
+                                      int bd) {
+  (void)bd;
+  assert(dx > 0);
+  assert(dy > 0);
+  switch (bw) {
+    case 4:
+      highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
+                                       upsample_above, upsample_left, dx, dy);
+      break;
+    case 8:
+      if (bd < 12) {
+        highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
+                                         upsample_above, upsample_left, dx, dy);
+      } else {
+        highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
+                                               upsample_above, upsample_left,
+                                               dx, dy);
+      }
+      break;
+    default:
+      if (bd < 12) {
+        highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+                                         upsample_above, upsample_left, dx, dy);
+      } else {
+        highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+                                               upsample_above, upsample_left,
+                                               dx, dy);
+      }
+      break;
+  }
+}
+
+static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
+                             uint16_t *dst, ptrdiff_t pitchDst, int width,
+                             int height) {
   for (int j = 0; j < height; j += 8)
     for (int i = 0; i < width; i += 8)
-      transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc, dst + j * pitchDst + i,
-                       pitchDst);
+      highbd_transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc,
+                              dst + j * pitchDst + i, pitchDst);
 }
 
 static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
@@ -1649,7 +2559,7 @@ static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
 static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
                                               int upsample_left, int dy) {
-  __m256i dstvec[8], d[16];
+  __m256i dstvec[8], d[8];
 
   highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
                                              dy);
@@ -1818,9 +2728,9 @@ static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
 static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
                                                int upsample_left, int dy) {
-  uint16_t dstT[64 * 64];
+  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
   highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
-  transpose(dstT, 64, dst, stride, 64, 64);
+  highbd_transpose(dstT, 64, dst, stride, 64, 64);
 }
 
 static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
@@ -1872,24 +2782,24 @@ static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                int upsample_left, int dy) {
   uint16_t dstT[64 * 32];
   highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
-  transpose(dstT, 64, dst, stride, 32, 64);
+  highbd_transpose(dstT, 64, dst, stride, 32, 64);
 }
 
 static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
                                                int upsample_left, int dy) {
-  uint16_t dstT[32 * 64];
+  DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
   highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
-  transpose(dstT, 32, dst, stride, 64, 32);
+  highbd_transpose(dstT, 32, dst, stride, 64, 32);
   return;
 }
 
 static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
                                                int upsample_left, int dy) {
-  uint16_t dstT[64 * 16];
+  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
   highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
-  transpose(dstT, 64, dst, stride, 16, 64);
+  highbd_transpose(dstT, 64, dst, stride, 16, 64);
 }
 
 static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
@@ -1910,9 +2820,10 @@ static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
 void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint16_t *above,
                                       const uint16_t *left, int upsample_left,
-                                      int dx, int dy) {
+                                      int dx, int dy, int bd) {
   (void)above;
   (void)dx;
+  (void)bd;
   assert(dx == 1);
   assert(dy > 0);
   if (bw == bh) {
@@ -2013,3 +2924,1716 @@ void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
   }
   return;
 }
+
+// Low bit depth functions
+static uint8_t BaseMask[33][32] = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+};
+
+static AOM_FORCE_INLINE void dr_prediction_z1_4xN_internal_avx2(
+    int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) {
+  const int frac_bits = 6 - upsample_above;
+  const int max_base_x = ((N + 4) - 1) << upsample_above;
+  int x;
+  // a assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0, a1, a32, a16;
+  __m256i diff, c3f;
+  __m128i a_mbase_x;
+
+  a16 = _mm256_set1_epi16(16);
+  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+  c3f = _mm256_set1_epi16(0x3f);
+
+  x = dx;
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, shift;
+    __m128i res1, a0_128, a1_128;
+
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base) >> upsample_above;
+    if (base_max_diff <= 0) {
+      for (int i = r; i < N; ++i) {
+        dst[i] = a_mbase_x;  // save 4 values
+      }
+      return;
+    }
+    if (base_max_diff > 4) base_max_diff = 4;
+    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
+    a1_128 = _mm_srli_si128(a0_128, 1);
+
+    if (upsample_above) {
+      a0_128 = _mm_shuffle_epi8(
+          a0_128,
+          _mm_setr_epi8(0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15));
+      a1_128 = _mm_srli_si128(a0_128, 4);
+
+      shift = _mm256_srli_epi16(
+          _mm256_and_si256(
+              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
+          1);
+    } else {
+      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+    }
+    a0 = _mm256_cvtepu8_epi16(a0_128);
+    a1 = _mm256_cvtepu8_epi16(a1_128);
+
+    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
+    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
+    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
+
+    b = _mm256_mullo_epi16(diff, shift);
+    res = _mm256_add_epi16(a32, b);
+    res = _mm256_srli_epi16(res, 5);
+
+    res1 = _mm256_castsi256_si128(res);
+    res1 = _mm_packus_epi16(res1, res1);
+
+    dst[r] =
+        _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
+    x += dx;
+  }
+}
+
+static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, int upsample_above,
+                                      int dx) {
+  __m128i dstvec[16];
+
+  dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+  for (int i = 0; i < N; i++) {
+    *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
+  }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_8xN_internal_avx2(
+    int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) {
+  const int frac_bits = 6 - upsample_above;
+  const int max_base_x = ((8 + N) - 1) << upsample_above;
+
+  int x;
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0, a1, a0_1, a1_1, a32, a16, diff, c3f;
+  __m128i a_mbase_x;
+
+  a16 = _mm256_set1_epi32(16);
+  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+  c3f = _mm256_set1_epi32(0x3f);
+
+  x = dx;
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, res1, shift;
+    __m128i res128;
+
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base) >> upsample_above;
+    if (base_max_diff <= 0) {
+      for (int i = r; i < N; ++i) {
+        dst[i] = a_mbase_x;  // save 16 values, 8 to be used furter
+      }
+      return;
+    }
+    if (base_max_diff > 8) base_max_diff = 8;
+
+    a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+    a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+    if (upsample_above) {
+      a0 = _mm256_permutevar8x32_epi32(
+          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
+
+      a0_1 =
+          _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
+      a0_1 = _mm256_permutevar8x32_epi32(
+          a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+      a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
+
+      a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
+      a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
+
+      shift = _mm256_srli_epi32(
+          _mm256_and_si256(
+              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), c3f),
+          1);
+    } else {
+      shift = _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+    }
+
+    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
+    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
+    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
+
+    b = _mm256_mullo_epi32(diff, shift);
+    res = _mm256_add_epi32(a32, b);
+    res = _mm256_srli_epi32(res, 5);
+
+    res1 = _mm256_packus_epi32(
+        res, _mm256_castsi128_si256(
+                 _mm256_extracti128_si256(res, 1)));  // goto 16 bit
+
+    res128 = _mm_packus_epi16(_mm256_castsi256_si128(res1),
+                              _mm256_castsi256_si128(res1));  // goto 8 bit
+
+    res128 =
+        _mm_blendv_epi8(a_mbase_x, res128, *(__m128i *)BaseMask[base_max_diff]);
+    dst[r] = res128;
+    x += dx;
+  }
+}
+
+static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, int upsample_above,
+                                      int dx) {
+  __m128i dstvec[32];
+
+  dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+  for (int i = 0; i < N; i++) {
+    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
+  }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_16xN_internal_avx2(
+    int N, __m128i *dstvec, const uint8_t *above, int upsample_above, int dx) {
+  int x;
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((16 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0, a0_1, a1, a1_1, diff, a32, a16, c3f;
+  __m128i a_mbase_x;
+
+  a16 = _mm256_set1_epi32(16);
+  a_mbase_x = _mm_set1_epi8((uint8_t)above[max_base_x]);
+  c3f = _mm256_set1_epi32(0x3f);
+
+  x = dx;
+  for (int r = 0; r < N; r++) {
+    __m256i b, res[2];
+    __m128i res128[2];
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base);
+    if (base_max_diff <= 0) {
+      for (int i = r; i < N; ++i) {
+        dstvec[i] = a_mbase_x;  // save 16 values
+      }
+      return;
+    }
+    __m256i shift =
+        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+
+    a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+    a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
+    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
+    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
+    b = _mm256_mullo_epi32(diff, shift);
+
+    res[0] = _mm256_add_epi32(a32, b);
+    res[0] = _mm256_srli_epi32(res[0], 5);
+    res[0] = _mm256_packus_epi32(
+        res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+    res128[0] = _mm_packus_epi16(_mm256_castsi256_si128(res[0]),
+                                 _mm256_castsi256_si128(res[0]));  // goto 8 bit
+
+    if (base_max_diff > 8) {
+      if (base_max_diff > 16) base_max_diff = 16;
+      a0_1 =
+          _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
+      a1_1 =
+          _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
+
+      diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
+      a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
+      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+      b = _mm256_mullo_epi32(diff, shift);
+
+      res[1] = _mm256_add_epi32(a32, b);
+      res[1] = _mm256_srli_epi32(res[1], 5);
+      res[1] = _mm256_packus_epi32(
+          res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+      res128[1] =
+          _mm_packus_epi16(_mm256_castsi256_si128(res[1]),
+                           _mm256_castsi256_si128(res[1]));  // goto 8 bit
+
+    } else {
+      res128[1] = a_mbase_x;
+    }
+    res128[0] = _mm_unpacklo_epi64(res128[0], res128[1]);  // 16 8bit values
+
+    dstvec[r] = _mm_blendv_epi8(a_mbase_x, res128[0],
+                                *(__m128i *)BaseMask[base_max_diff]);
+    x += dx;
+  }
+}
+static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  __m128i dstvec[64];
+
+  dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+  for (int i = 0; i < N; i++) {
+    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+  }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
+    int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
+  int x;
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((32 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0, a0_1, a1, a1_1, a32, a16;
+  __m256i a_mbase_x, diff, c3f;
+
+  a16 = _mm256_set1_epi32(16);
+  a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
+  c3f = _mm256_set1_epi32(0x3f);
+
+  x = dx;
+  for (int r = 0; r < N; r++) {
+    __m256i b, res[2], res16[2];
+
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base);
+    if (base_max_diff <= 0) {
+      for (int i = r; i < N; ++i) {
+        dstvec[i] = a_mbase_x;  // save 32 values
+      }
+      return;
+    }
+    if (base_max_diff > 32) base_max_diff = 32;
+    __m256i shift =
+        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+
+    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+      int mdiff = base_max_diff - j;
+      if (mdiff <= 0) {
+        res16[jj] = a_mbase_x;
+      } else {
+        a0 = _mm256_cvtepu8_epi32(
+            _mm_loadu_si128((__m128i *)(above + base + j)));
+        a1 = _mm256_cvtepu8_epi32(
+            _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
+
+        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
+        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
+        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
+        b = _mm256_mullo_epi32(diff, shift);
+
+        res[0] = _mm256_add_epi32(a32, b);
+        res[0] = _mm256_srli_epi32(res[0], 5);
+        res[0] = _mm256_packus_epi32(
+            res[0],
+            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+
+        // goto 8 bit
+        res[0] = _mm256_packus_epi16(res[0], res[0]);
+
+        if (mdiff > 8) {
+          a0_1 = _mm256_cvtepu8_epi32(
+              _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
+          a1_1 = _mm256_cvtepu8_epi32(
+              _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
+
+          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
+          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
+          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+          b = _mm256_mullo_epi32(diff, shift);
+
+          res[1] = _mm256_add_epi32(a32, b);
+          res[1] = _mm256_srli_epi32(res[1], 5);
+          res[1] = _mm256_packus_epi32(
+              res[1],
+              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+          res[1] = _mm256_packus_epi16(res[1], res[1]);
+          // goto 8 bit
+        } else {
+          res[1] = a_mbase_x;
+        }
+        res16[jj] = _mm256_unpacklo_epi64(res[0], res[1]);  // 16 8bit values
+      }
+    }
+    res16[1] =
+        _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
+                                1);  // 32 8bit values
+
+    dstvec[r] = _mm256_blendv_epi8(
+        a_mbase_x, res16[1],
+        *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
+    x += dx;
+  }
+}
+
+static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  __m256i dstvec[64];
+  dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+  for (int i = 0; i < N; i++) {
+    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
+  }
+}
+
+static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  int x;
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((64 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0, a0_1, a1, a1_1, a32, a16;
+  __m256i a_mbase_x, diff, c3f;
+  __m128i max_base_x128, base_inc128, mask128;
+
+  a16 = _mm256_set1_epi32(16);
+  a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
+  max_base_x128 = _mm_set1_epi8(max_base_x);
+  c3f = _mm256_set1_epi32(0x3f);
+
+  x = dx;
+  for (int r = 0; r < N; r++, dst += stride) {
+    __m256i b, res[2];
+    __m128i res1;
+
+    int base = x >> frac_bits;
+    if (base >= max_base_x) {
+      for (int i = r; i < N; ++i) {
+        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
+        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+        dst += stride;
+      }
+      return;
+    }
+
+    __m256i shift =
+        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+
+    __m128i a0_128, a0_1_128, a1_128, a1_1_128;
+    for (int j = 0; j < 64; j += 16) {
+      int mdif = max_base_x - (base + j);
+      if (mdif <= 0) {
+        _mm_storeu_si128((__m128i *)(dst + j),
+                         _mm256_castsi256_si128(a_mbase_x));
+      } else {
+        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+        a0 = _mm256_cvtepu8_epi32(a0_128);
+        a1 = _mm256_cvtepu8_epi32(a1_128);
+
+        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
+        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
+        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
+        b = _mm256_mullo_epi32(diff, shift);
+
+        res[0] = _mm256_add_epi32(a32, b);
+        res[0] = _mm256_srli_epi32(res[0], 5);
+        res[0] = _mm256_packus_epi32(
+            res[0],
+            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+        // goto 8 bit
+        res[0] = _mm256_packus_epi16(res[0], res[0]);
+
+        if (mdif > 8) {
+          a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
+          a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
+          a0_1 = _mm256_cvtepu8_epi32(a0_1_128);
+          a1_1 = _mm256_cvtepu8_epi32(a1_1_128);
+
+          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
+          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
+          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+          b = _mm256_mullo_epi32(diff, shift);
+
+          res[1] = _mm256_add_epi32(a32, b);
+          res[1] = _mm256_srli_epi32(res[1], 5);
+          res[1] = _mm256_packus_epi32(
+              res[1],
+              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+          res[1] = _mm256_packus_epi16(res[1], res[1]);
+
+        } else {
+          res[1] = a_mbase_x;
+        }
+        res1 = _mm_unpacklo_epi64(
+            _mm256_castsi256_si128(res[0]),
+            _mm256_castsi256_si128(res[1]));  // 16 8bit values
+
+        base_inc128 = _mm_setr_epi8(
+            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+            base + j + 13, base + j + 14, base + j + 15);
+
+        mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
+                                 _mm_setzero_si128());
+        res1 =
+            _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), res1, mask128);
+        _mm_storeu_si128((__m128i *)(dst + j), res1);
+      }
+    }
+    x += dx;
+  }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_above, int dx, int dy) {
+  (void)left;
+  (void)dy;
+  switch (bw) {
+    case 4:
+      dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 8:
+      dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 16:
+      dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 32:
+      dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 64:
+      dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
+      break;
+    default: break;
+  }
+  return;
+}
+
+static uint8_t LoadMaskx[8][16] = {
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+  { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+  { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
+  { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+  { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
+  { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
+};
+
+static uint8_t EvenOddMaskx4[8][16] = {
+  { 0, 2, 4, 6, 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0, 1, 3, 5, 7, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0 },
+  { 0, 0, 2, 4, 6, 8, 3, 5, 7, 9, 0, 0, 0, 0, 0, 0 },
+  { 0, 0, 0, 3, 5, 7, 9, 4, 6, 8, 10, 0, 0, 0, 0, 0 },
+  { 0, 0, 0, 0, 4, 6, 8, 10, 5, 7, 9, 11, 0, 0, 0, 0 },
+  { 0, 0, 0, 0, 0, 5, 7, 9, 11, 6, 8, 10, 12, 0, 0, 0 },
+  { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 7, 9, 11, 13, 0, 0 },
+  { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 8, 10, 12, 14, 0 }
+};
+
+static uint8_t EvenOddMaskx[8][16] = {
+  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 0, 0, 0, 0 },
+  { 0, 1, 3, 5, 7, 9, 11, 13, 15, 2, 4, 6, 8, 0, 0, 0 },
+  { 0, 0, 2, 4, 6, 8, 10, 12, 14, 3, 5, 7, 9, 0, 0, 0 },
+  { 0, 0, 0, 3, 5, 7, 9, 11, 13, 15, 4, 6, 8, 10, 0 },
+  { 0, 0, 0, 0, 4, 6, 8, 10, 12, 14, 5, 7, 9, 11, 0, 0 },
+  { 0, 0, 0, 0, 0, 5, 7, 9, 11, 13, 15, 6, 8, 10, 12, 0 },
+  { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 14, 7, 9, 11, 13, 0 },
+  { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 15, 8, 10, 12, 14 }
+};
+
+static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, const uint8_t *left,
+                                      int upsample_above, int upsample_left,
+                                      int dx, int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  // a assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0_x, a1_x, a32, a16, diff;
+  __m128i c3f, min_base_y128;
+
+  a16 = _mm256_set1_epi32(16);
+  c3f = _mm_set1_epi32(0x3f);
+  min_base_y128 = _mm_set1_epi32(min_base_y);
+
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, shift;
+    __m128i resx, resy, resxy;
+    __m128i a0_x128, a1_x128;
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 4) {
+      base_min_diff = 4;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 3) {
+      a0_x = _mm256_setzero_si256();
+      a1_x = _mm256_setzero_si256();
+      shift = _mm256_setzero_si256();
+    } else {
+      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+      if (upsample_above) {
+        a0_x128 =
+            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx4[base_shift]);
+        a1_x128 = _mm_srli_si128(a0_x128, 4);
+
+        shift = _mm256_castsi128_si256(_mm_srli_epi32(
+            _mm_and_si128(
+                _mm_slli_epi32(
+                    _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+                                   (2 << 6) - y * dx, (3 << 6) - y * dx),
+                    upsample_above),
+                c3f),
+            1));
+      } else {
+        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+        a1_x128 = _mm_srli_si128(a0_x128, 1);
+
+        shift = _mm256_castsi128_si256(_mm_srli_epi32(
+            _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+                                         (2 << 6) - y * dx, (3 << 6) - y * dx),
+                          c3f),
+            1));
+      }
+      a0_x = _mm256_cvtepu8_epi32(a0_x128);
+      a1_x = _mm256_cvtepu8_epi32(a1_x128);
+    }
+    // y calc
+    __m128i a0_y, a1_y, shifty;
+    if (base_x < min_base_x) {
+      DECLARE_ALIGNED(32, int, base_y_c[4]);
+      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+      r6 = _mm_set1_epi32(r << 6);
+      dy128 = _mm_set1_epi32(dy);
+      c1234 = _mm_setr_epi32(1, 2, 3, 4);
+      y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
+      base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
+      mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
+      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]]);
+      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
+
+      if (upsample_left) {
+        shifty = _mm_srli_epi32(
+            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
+      } else {
+        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
+      }
+      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+      shift = _mm256_inserti128_si256(shift, shifty, 1);
+    }
+
+    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
+    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
+    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+
+    b = _mm256_mullo_epi32(diff, shift);
+    res = _mm256_add_epi32(a32, b);
+    res = _mm256_srli_epi32(res, 5);
+
+    resx = _mm256_castsi256_si128(res);
+    resx = _mm_packus_epi32(resx, resx);
+    resx = _mm_packus_epi16(resx, resx);
+
+    resy = _mm256_extracti128_si256(res, 1);
+    resy = _mm_packus_epi32(resy, resy);
+    resy = _mm_packus_epi16(resy, resy);
+
+    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+    *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy);
+    dst += stride;
+  }
+}
+
+static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, const uint8_t *left,
+                                      int upsample_above, int upsample_left,
+                                      int dx, int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i diff, a32, a16;
+  __m256i a0_x, a1_x;
+  __m128i a0_x128, a1_x128, min_base_y128, c3f;
+
+  a16 = _mm256_set1_epi16(16);
+  c3f = _mm_set1_epi16(0x3f);
+  min_base_y128 = _mm_set1_epi16(min_base_y);
+
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, shift;
+    __m128i resx, resy, resxy;
+
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 8) {
+      base_min_diff = 8;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 7) {
+      a0_x = _mm256_setzero_si256();
+      a1_x = _mm256_setzero_si256();
+      shift = _mm256_setzero_si256();
+    } else {
+      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+      a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+      if (upsample_above) {
+        a0_x128 =
+            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+        a1_x128 =
+            _mm_shuffle_epi8(a1_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+
+        shift = _mm256_castsi128_si256(_mm_srli_epi16(
+            _mm_and_si128(
+                _mm_slli_epi16(
+                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
+                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
+                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
+                    upsample_above),
+                c3f),
+            1));
+      } else {
+        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+        shift = _mm256_castsi128_si256(_mm_srli_epi16(
+            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
+                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
+                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
+                          c3f),
+            1));
+      }
+      a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
+      a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
+    }
+
+    // y calc
+    __m128i a0_y, a1_y, shifty;
+    if (base_x < min_base_x) {
+      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+      r6 = _mm_set1_epi16(r << 6);
+      dy128 = _mm_set1_epi16(dy);
+      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]],
+                            left[base_y_c[4]], left[base_y_c[5]],
+                            left[base_y_c[6]], left[base_y_c[7]]);
+      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
+                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
+
+      if (upsample_left) {
+        shifty = _mm_srli_epi16(
+            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+      } else {
+        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+      }
+
+      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+      shift = _mm256_inserti128_si256(shift, shifty, 1);
+    }
+
+    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
+    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
+    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+    b = _mm256_mullo_epi16(diff, shift);
+    res = _mm256_add_epi16(a32, b);
+    res = _mm256_srli_epi16(res, 5);
+
+    resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
+                            _mm256_castsi256_si128(res));
+    resy = _mm256_extracti128_si256(res, 1);
+    resy = _mm_packus_epi16(resy, resy);
+
+    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+    _mm_storel_epi64((__m128i *)(dst), resxy);
+    dst += stride;
+  }
+}
+
+static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
+                                      ptrdiff_t stride, const uint8_t *above,
+                                      const uint8_t *left, int upsample_above,
+                                      int upsample_left, int dx, int dy) {
+  // here upsample_above and upsample_left are 0 by design of
+  // av1_use_intra_edge_upsample
+  const int min_base_x = -1;
+  const int min_base_y = -1;
+  (void)upsample_above;
+  (void)upsample_left;
+  const int frac_bits_x = 6;
+  const int frac_bits_y = 6;
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be caluculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16;
+  __m256i diff, min_base_y256, c3f, shifty;
+  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, a0_1_x, a1_1_x, shiftx;
+
+  a16 = _mm256_set1_epi16(16);
+  min_base_y256 = _mm256_set1_epi16(min_base_y);
+  c3f = _mm256_set1_epi16(0x3f);
+
+  for (int r = 0; r < H; r++) {
+    __m256i b, res, shift;
+    __m128i resx, resy;
+    __m128i resxy;
+    for (int j = 0; j < W; j += 16) {
+      int y = r + 1;
+      int base_x = (-y * dx) >> frac_bits_x;
+
+      int base_shift = 0;
+      if ((base_x + j) < (min_base_x - 1)) {
+        base_shift = (min_base_x - (base_x + j) - 1);
+      }
+      int base_min_diff = (min_base_x - base_x - j);
+      if (base_min_diff > 16) {
+        base_min_diff = 16;
+      } else {
+        if (base_min_diff < 0) base_min_diff = 0;
+      }
+      if (base_shift > 7) {
+        a0_x = _mm256_setzero_si256();
+        a1_x = _mm256_setzero_si256();
+        shift = _mm256_setzero_si256();
+      } else {
+        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+        a1_x128 =
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+        a0_x = _mm256_cvtepu8_epi16(a0_x128);
+        a1_x = _mm256_cvtepu8_epi16(a1_x128);
+
+        shift = _mm256_castsi128_si256(_mm_srli_epi16(
+            _mm_and_si128(_mm_setr_epi16(
+                              ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
+                              ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
+                              ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
+                              ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
+                          _mm256_castsi256_si128(c3f)),
+            1));
+      }
+
+      base_shift = 0;
+      if ((base_x + j + 8) < (min_base_x - 1)) {
+        base_shift = (min_base_x - (base_x + j + 8) - 1);
+      }
+      if (base_shift <= 7) {
+        a0_1_x128 =
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j));
+        a1_1_x128 =
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j));
+        a0_1_x128 =
+            _mm_shuffle_epi8(a0_1_x128, *(__m128i *)LoadMaskx[base_shift]);
+        a1_1_x128 =
+            _mm_shuffle_epi8(a1_1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+        a0_1_x = _mm_cvtepu8_epi16(a0_1_x128);
+        a1_1_x = _mm_cvtepu8_epi16(a1_1_x128);
+
+        shiftx = _mm_srli_epi16(
+            _mm_and_si128(
+                _mm_setr_epi16(
+                    ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
+                    ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
+                    ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
+                    ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
+                _mm256_castsi256_si128(c3f)),
+            1);
+
+        a0_x = _mm256_inserti128_si256(a0_x, a0_1_x, 1);
+        a1_x = _mm256_inserti128_si256(a1_x, a1_1_x, 1);
+        shift = _mm256_inserti128_si256(shift, shiftx, 1);
+      }
+
+      diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
+      a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
+      a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+      b = _mm256_mullo_epi16(diff, shift);
+      res = _mm256_add_epi16(a32, b);
+      res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
+      resx = _mm256_castsi256_si128(_mm256_packus_epi16(
+          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+
+      // y calc
+      if ((base_x < min_base_x)) {
+        DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+        __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16;
+        r6 = _mm256_set1_epi16(r << 6);
+        dy256 = _mm256_set1_epi16(dy);
+        c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
+                                 7 + j, 8 + j, 9 + j, 10 + j, 11 + j, 12 + j,
+                                 13 + j, 14 + j, 15 + j, 16 + j);
+        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
+                                 _mm256_srli_epi16(min_base_y256, 1));
+        y_c256 = _mm256_sub_epi16(r6, mul16);
+
+        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
+        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
+        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+        _mm256_store_si256((__m256i *)base_y_c, base_y_c256); /**/
+
+        a0_y = _mm256_setr_epi16(
+            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+            left[base_y_c[15]]);
+        a1_y = _mm256_setr_epi16(
+            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+            left[base_y_c[6] + 1], left[base_y_c[7] + 1], left[base_y_c[8] + 1],
+            left[base_y_c[9] + 1], left[base_y_c[10] + 1],
+            left[base_y_c[11] + 1], left[base_y_c[12] + 1],
+            left[base_y_c[13] + 1], left[base_y_c[14] + 1],
+            left[base_y_c[15] + 1]);
+
+        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
+
+        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
+        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
+        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+        b = _mm256_mullo_epi16(diff, shifty);
+        res = _mm256_add_epi16(a32, b);
+        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
+        resy = _mm256_castsi256_si128(_mm256_packus_epi16(
+            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+
+      } else {
+        resy = _mm_setzero_si128();
+      }
+      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+      _mm_storeu_si128((__m128i *)(dst + j), resxy);
+    }  // for j
+    dst += stride;
+  }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_above, int upsample_left, int dx,
+                               int dy) {
+  assert(dx > 0);
+  assert(dy > 0);
+  switch (bw) {
+    case 4:
+      dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
+                                upsample_left, dx, dy);
+      break;
+    case 8:
+      dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
+                                upsample_left, dx, dy);
+      break;
+    default:
+      dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+                                upsample_above, upsample_left, dx, dy);
+      break;
+  }
+  return;
+}
+
+// z3 functions
+static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) {
+  __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
+  w0 = _mm_unpacklo_epi8(x[0], x[1]);
+  w1 = _mm_unpacklo_epi8(x[2], x[3]);
+  w2 = _mm_unpackhi_epi8(x[0], x[1]);
+  w3 = _mm_unpackhi_epi8(x[2], x[3]);
+
+  ww0 = _mm_unpacklo_epi16(w0, w1);
+  ww1 = _mm_unpacklo_epi16(w2, w3);
+  ww2 = _mm_unpackhi_epi16(w0, w1);
+  ww3 = _mm_unpackhi_epi16(w2, w3);
+
+  w0 = _mm_unpacklo_epi32(ww0, ww1);
+  w2 = _mm_unpacklo_epi32(ww2, ww3);
+  w1 = _mm_unpackhi_epi32(ww0, ww1);
+  w3 = _mm_unpackhi_epi32(ww2, ww3);
+
+  d[0] = _mm_unpacklo_epi64(w0, w2);
+  d[1] = _mm_unpackhi_epi64(w0, w2);
+  d[2] = _mm_unpacklo_epi64(w1, w3);
+  d[3] = _mm_unpackhi_epi64(w1, w3);
+
+  d[4] = _mm_srli_si128(d[0], 8);
+  d[5] = _mm_srli_si128(d[1], 8);
+  d[6] = _mm_srli_si128(d[2], 8);
+  d[7] = _mm_srli_si128(d[3], 8);
+
+  d[8] = _mm_srli_si128(d[0], 4);
+  d[9] = _mm_srli_si128(d[1], 4);
+  d[10] = _mm_srli_si128(d[2], 4);
+  d[11] = _mm_srli_si128(d[3], 4);
+
+  d[12] = _mm_srli_si128(d[0], 12);
+  d[13] = _mm_srli_si128(d[1], 12);
+  d[14] = _mm_srli_si128(d[2], 12);
+  d[15] = _mm_srli_si128(d[3], 12);
+}
+
+static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
+  __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+  __m256i w10, w11, w12, w13, w14, w15;
+
+  w0 = _mm256_unpacklo_epi8(x[0], x[1]);
+  w1 = _mm256_unpacklo_epi8(x[2], x[3]);
+  w2 = _mm256_unpacklo_epi8(x[4], x[5]);
+  w3 = _mm256_unpacklo_epi8(x[6], x[7]);
+
+  w8 = _mm256_unpacklo_epi8(x[8], x[9]);
+  w9 = _mm256_unpacklo_epi8(x[10], x[11]);
+  w10 = _mm256_unpacklo_epi8(x[12], x[13]);
+  w11 = _mm256_unpacklo_epi8(x[14], x[15]);
+
+  w4 = _mm256_unpacklo_epi16(w0, w1);
+  w5 = _mm256_unpacklo_epi16(w2, w3);
+  w12 = _mm256_unpacklo_epi16(w8, w9);
+  w13 = _mm256_unpacklo_epi16(w10, w11);
+
+  w6 = _mm256_unpacklo_epi32(w4, w5);
+  w7 = _mm256_unpackhi_epi32(w4, w5);
+  w14 = _mm256_unpacklo_epi32(w12, w13);
+  w15 = _mm256_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  d[0] = _mm256_unpacklo_epi64(w6, w14);
+  d[1] = _mm256_unpackhi_epi64(w6, w14);
+  d[2] = _mm256_unpacklo_epi64(w7, w15);
+  d[3] = _mm256_unpackhi_epi64(w7, w15);
+
+  w4 = _mm256_unpackhi_epi16(w0, w1);
+  w5 = _mm256_unpackhi_epi16(w2, w3);
+  w12 = _mm256_unpackhi_epi16(w8, w9);
+  w13 = _mm256_unpackhi_epi16(w10, w11);
+
+  w6 = _mm256_unpacklo_epi32(w4, w5);
+  w7 = _mm256_unpackhi_epi32(w4, w5);
+  w14 = _mm256_unpacklo_epi32(w12, w13);
+  w15 = _mm256_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  d[4] = _mm256_unpacklo_epi64(w6, w14);
+  d[5] = _mm256_unpackhi_epi64(w6, w14);
+  d[6] = _mm256_unpacklo_epi64(w7, w15);
+  d[7] = _mm256_unpackhi_epi64(w7, w15);
+
+  // upper half
+  w0 = _mm256_unpackhi_epi8(x[0], x[1]);
+  w1 = _mm256_unpackhi_epi8(x[2], x[3]);
+  w2 = _mm256_unpackhi_epi8(x[4], x[5]);
+  w3 = _mm256_unpackhi_epi8(x[6], x[7]);
+
+  w8 = _mm256_unpackhi_epi8(x[8], x[9]);
+  w9 = _mm256_unpackhi_epi8(x[10], x[11]);
+  w10 = _mm256_unpackhi_epi8(x[12], x[13]);
+  w11 = _mm256_unpackhi_epi8(x[14], x[15]);
+
+  w4 = _mm256_unpacklo_epi16(w0, w1);
+  w5 = _mm256_unpacklo_epi16(w2, w3);
+  w12 = _mm256_unpacklo_epi16(w8, w9);
+  w13 = _mm256_unpacklo_epi16(w10, w11);
+
+  w6 = _mm256_unpacklo_epi32(w4, w5);
+  w7 = _mm256_unpackhi_epi32(w4, w5);
+  w14 = _mm256_unpacklo_epi32(w12, w13);
+  w15 = _mm256_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  d[8] = _mm256_unpacklo_epi64(w6, w14);
+  d[9] = _mm256_unpackhi_epi64(w6, w14);
+  d[10] = _mm256_unpacklo_epi64(w7, w15);
+  d[11] = _mm256_unpackhi_epi64(w7, w15);
+
+  w4 = _mm256_unpackhi_epi16(w0, w1);
+  w5 = _mm256_unpackhi_epi16(w2, w3);
+  w12 = _mm256_unpackhi_epi16(w8, w9);
+  w13 = _mm256_unpackhi_epi16(w10, w11);
+
+  w6 = _mm256_unpacklo_epi32(w4, w5);
+  w7 = _mm256_unpackhi_epi32(w4, w5);
+  w14 = _mm256_unpacklo_epi32(w12, w13);
+  w15 = _mm256_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  d[12] = _mm256_unpacklo_epi64(w6, w14);
+  d[13] = _mm256_unpackhi_epi64(w6, w14);
+  d[14] = _mm256_unpacklo_epi64(w7, w15);
+  d[15] = _mm256_unpackhi_epi64(w7, w15);
+}
+
+static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+  __m128i w10, w11, w12, w13, w14, w15;
+
+  w0 = _mm_unpacklo_epi8(x[0], x[1]);
+  w1 = _mm_unpacklo_epi8(x[2], x[3]);
+  w2 = _mm_unpacklo_epi8(x[4], x[5]);
+  w3 = _mm_unpacklo_epi8(x[6], x[7]);
+
+  w8 = _mm_unpacklo_epi8(x[8], x[9]);
+  w9 = _mm_unpacklo_epi8(x[10], x[11]);
+  w10 = _mm_unpacklo_epi8(x[12], x[13]);
+  w11 = _mm_unpacklo_epi8(x[14], x[15]);
+
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  d[0] = _mm_unpacklo_epi64(w6, w14);
+  d[1] = _mm_unpackhi_epi64(w6, w14);
+  d[2] = _mm_unpacklo_epi64(w7, w15);
+  d[3] = _mm_unpackhi_epi64(w7, w15);
+
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  d[4] = _mm_unpacklo_epi64(w6, w14);
+  d[5] = _mm_unpackhi_epi64(w6, w14);
+  d[6] = _mm_unpacklo_epi64(w7, w15);
+  d[7] = _mm_unpackhi_epi64(w7, w15);
+
+  // upper half
+  w0 = _mm_unpackhi_epi8(x[0], x[1]);
+  w1 = _mm_unpackhi_epi8(x[2], x[3]);
+  w2 = _mm_unpackhi_epi8(x[4], x[5]);
+  w3 = _mm_unpackhi_epi8(x[6], x[7]);
+
+  w8 = _mm_unpackhi_epi8(x[8], x[9]);
+  w9 = _mm_unpackhi_epi8(x[10], x[11]);
+  w10 = _mm_unpackhi_epi8(x[12], x[13]);
+  w11 = _mm_unpackhi_epi8(x[14], x[15]);
+
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  d[8] = _mm_unpacklo_epi64(w6, w14);
+  d[9] = _mm_unpackhi_epi64(w6, w14);
+  d[10] = _mm_unpacklo_epi64(w7, w15);
+  d[11] = _mm_unpackhi_epi64(w7, w15);
+
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  d[12] = _mm_unpacklo_epi64(w6, w14);
+  d[13] = _mm_unpackhi_epi64(w6, w14);
+  d[14] = _mm_unpacklo_epi64(w7, w15);
+  d[15] = _mm_unpackhi_epi64(w7, w15);
+}
+
+static void transpose_TX_8X8(const uint8_t *src, ptrdiff_t pitchSrc,
+                             uint8_t *dst, ptrdiff_t pitchDst) {
+  __m128i r0, r1, r2, r3, r4, r5, r6, r7;
+  __m128i d0d1, d2d3, d4d5, d6d7;
+  r0 = _mm_loadl_epi64((__m128i *)(src + 0 * pitchSrc));
+  r1 = _mm_loadl_epi64((__m128i *)(src + 1 * pitchSrc));
+  r2 = _mm_loadl_epi64((__m128i *)(src + 2 * pitchSrc));
+  r3 = _mm_loadl_epi64((__m128i *)(src + 3 * pitchSrc));
+  r4 = _mm_loadl_epi64((__m128i *)(src + 4 * pitchSrc));
+  r5 = _mm_loadl_epi64((__m128i *)(src + 5 * pitchSrc));
+  r6 = _mm_loadl_epi64((__m128i *)(src + 6 * pitchSrc));
+  r7 = _mm_loadl_epi64((__m128i *)(src + 7 * pitchSrc));
+
+  transpose8x8_sse2(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7, &d0d1, &d2d3, &d4d5,
+                    &d6d7);
+
+  _mm_storel_epi64((__m128i *)(dst + 0 * pitchDst), d0d1);
+  _mm_storel_epi64((__m128i *)(dst + 1 * pitchDst), _mm_srli_si128(d0d1, 8));
+  _mm_storel_epi64((__m128i *)(dst + 2 * pitchDst), d2d3);
+  _mm_storel_epi64((__m128i *)(dst + 3 * pitchDst), _mm_srli_si128(d2d3, 8));
+  _mm_storel_epi64((__m128i *)(dst + 4 * pitchDst), d4d5);
+  _mm_storel_epi64((__m128i *)(dst + 5 * pitchDst), _mm_srli_si128(d4d5, 8));
+  _mm_storel_epi64((__m128i *)(dst + 6 * pitchDst), d6d7);
+  _mm_storel_epi64((__m128i *)(dst + 7 * pitchDst), _mm_srli_si128(d6d7, 8));
+}
+
+static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
+                      ptrdiff_t pitchDst, int width, int height) {
+  for (int j = 0; j < height; j += 8)
+    for (int i = 0; i < width; i += 8)
+      transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc, dst + j * pitchDst + i,
+                       pitchDst);
+}
+
+static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  __m128i dstvec[4], d[4];
+
+  dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy);
+  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+                            &d[0], &d[1], &d[2], &d[3]);
+
+  *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
+  *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
+  *(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
+  *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
+  return;
+}
+
+static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  __m128i dstvec[8], d[8];
+
+  dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
+                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
+                    &d[3]);
+
+  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
+  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
+  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
+  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
+  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
+  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
+  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
+}
+
+static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  __m128i dstvec[4], d[8];
+
+  dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy);
+  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
+                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+  for (int i = 0; i < 8; i++) {
+    *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+  }
+}
+
+static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  __m128i dstvec[8], d[4];
+
+  dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
+                        &d[1], &d[2], &d[3]);
+  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
+  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
+  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
+}
+
+static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m128i dstvec[8], d[8];
+
+  dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
+                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
+                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
+  for (int i = 0; i < 8; i++) {
+    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
+                     _mm_srli_si128(d[i], 8));
+  }
+}
+
+static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m128i dstvec[16], d[16];
+
+  dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+  transpose16x8_8x16_sse2(
+      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+      &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+  for (int i = 0; i < 8; i++) {
+    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+  }
+}
+
+static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m128i dstvec[4], d[16];
+
+  dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, dy);
+  transpose4x16_sse2(dstvec, d);
+  for (int i = 0; i < 16; i++) {
+    *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+  }
+}
+
+static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m128i dstvec[16], d[8];
+
+  dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+  for (int i = 4; i < 8; i++) {
+    d[i] = _mm_setzero_si128();
+  }
+  transpose16x8_8x16_sse2(
+      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+      &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+  for (int i = 0; i < 4; i++) {
+    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+  }
+}
+
+static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m256i dstvec[16], d[16];
+
+  dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+  for (int i = 8; i < 16; i++) {
+    dstvec[i] = _mm256_setzero_si256();
+  }
+  transpose16x32_avx2(dstvec, d);
+
+  for (int i = 0; i < 16; i++) {
+    _mm_storel_epi64((__m128i *)(dst + i * stride),
+                     _mm256_castsi256_si128(d[i]));
+  }
+  for (int i = 0; i < 16; i++) {
+    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
+                     _mm256_extracti128_si256(d[i], 1));
+  }
+}
+
+static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m128i dstvec[32], d[16];
+
+  dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, dy);
+
+  transpose16x8_8x16_sse2(
+      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+      &d[3], &d[4], &d[5], &d[6], &d[7]);
+  transpose16x8_8x16_sse2(
+      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
+      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
+      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
+      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
+      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
+      &d[6 + 8], &d[7 + 8]);
+
+  for (int i = 0; i < 8; i++) {
+    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
+  }
+}
+
+static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i dstvec[16], d[16];
+
+  dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+  transpose16x16_sse2(dstvec, d);
+
+  for (int i = 0; i < 16; i++) {
+    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+  }
+}
+
+static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m256i dstvec[32], d[32];
+
+  dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
+  transpose16x32_avx2(dstvec, d);
+  transpose16x32_avx2(dstvec + 16, d + 16);
+  for (int j = 0; j < 16; j++) {
+    _mm_storeu_si128((__m128i *)(dst + j * stride),
+                     _mm256_castsi256_si128(d[j]));
+    _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
+                     _mm256_castsi256_si128(d[j + 16]));
+  }
+  for (int j = 0; j < 16; j++) {
+    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
+                     _mm256_extracti128_si256(d[j], 1));
+    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
+                     _mm256_extracti128_si256(d[j + 16], 1));
+  }
+}
+
+static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
+  dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 64, 64);
+}
+
+static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m256i dstvec[16], d[16];
+
+  dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+  transpose16x32_avx2(dstvec, d);
+  // store
+  for (int j = 0; j < 16; j++) {
+    _mm_storeu_si128((__m128i *)(dst + j * stride),
+                     _mm256_castsi256_si128(d[j]));
+    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
+                     _mm256_extracti128_si256(d[j], 1));
+  }
+}
+
+static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i dstvec[32], d[16];
+
+  dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, dy);
+  for (int i = 0; i < 32; i += 16) {
+    transpose16x16_sse2((dstvec + i), d);
+    for (int j = 0; j < 16; j++) {
+      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+    }
+  }
+}
+
+static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t dstT[64 * 32];
+  dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t dstT[32 * 64];
+  dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
+  transpose(dstT, 32, dst, stride, 64, 32);
+  return;
+}
+
+static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t dstT[64 * 16];
+  dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 16, 64);
+}
+
+static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i dstvec[64], d[16];
+
+  dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, dy);
+  for (int i = 0; i < 64; i += 16) {
+    transpose16x16_sse2((dstvec + i), d);
+    for (int j = 0; j < 16; j++) {
+      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+    }
+  }
+}
+
+void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_left, int dx, int dy) {
+  (void)above;
+  (void)dx;
+  assert(dx == 1);
+  assert(dy > 0);
+
+  if (bw == bh) {
+    switch (bw) {
+      case 4:
+        dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
+        break;
+      case 8:
+        dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
+        break;
+      case 16:
+        dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
+        break;
+      case 32:
+        dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
+        break;
+      case 64:
+        dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
+        break;
+    }
+  } else {
+    if (bw < bh) {
+      if (bw + bw == bh) {
+        switch (bw) {
+          case 4:
+            dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
+            break;
+          case 32:
+            dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      } else {
+        switch (bw) {
+          case 4:
+            dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      }
+    } else {
+      if (bh + bh == bw) {
+        switch (bh) {
+          case 4:
+            dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
+            break;
+          case 32:
+            dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      } else {
+        switch (bh) {
+          case 4:
+            dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      }
+    }
+  }
+}
diff --git a/libaom/aom_dsp/x86/jnt_sad_ssse3.c b/libaom/aom_dsp/x86/jnt_sad_ssse3.c
index c3c8824..2e3e2be 100644
--- a/libaom/aom_dsp/x86/jnt_sad_ssse3.c
+++ b/libaom/aom_dsp/x86/jnt_sad_ssse3.c
@@ -192,47 +192,47 @@ unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
   return res;
 }
 
-#define jnt_sadMxN_sse2(m, n)                                                 \
-  unsigned int aom_jnt_sad##m##x##n##_avg_ssse3(                              \
+#define dist_wtd_sadMxN_sse2(m, n)                                            \
+  unsigned int aom_dist_wtd_sad##m##x##n##_avg_ssse3(                         \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
     uint8_t comp_pred[m * n];                                                 \
-    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
-                          jcp_param);                                         \
+    aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
+                               jcp_param);                                    \
     return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n);          \
   }
 
-#define jnt_sadMxN_avx2(m, n)                                                 \
-  unsigned int aom_jnt_sad##m##x##n##_avg_avx2(                               \
+#define dist_wtd_sadMxN_avx2(m, n)                                            \
+  unsigned int aom_dist_wtd_sad##m##x##n##_avg_avx2(                          \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
     uint8_t comp_pred[m * n];                                                 \
-    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
-                          jcp_param);                                         \
+    aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
+                               jcp_param);                                    \
     return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n);          \
   }
 
 /* clang-format off */
-jnt_sadMxN_sse2(128, 128)
-jnt_sadMxN_sse2(128, 64)
-jnt_sadMxN_sse2(64, 128)
-jnt_sadMxN_sse2(64, 64)
-jnt_sadMxN_sse2(64, 32)
-jnt_sadMxN_sse2(32, 64)
-jnt_sadMxN_sse2(32, 32)
-jnt_sadMxN_sse2(32, 16)
-jnt_sadMxN_sse2(16, 32)
-jnt_sadMxN_sse2(16, 16)
-jnt_sadMxN_sse2(16, 8)
-jnt_sadMxN_sse2(8, 16)
-jnt_sadMxN_sse2(8, 8)
-jnt_sadMxN_sse2(8, 4)
-jnt_sadMxN_sse2(4, 8)
-jnt_sadMxN_sse2(4, 4)
-jnt_sadMxN_sse2(4, 16)
-jnt_sadMxN_sse2(16, 4)
-jnt_sadMxN_sse2(8, 32)
-jnt_sadMxN_sse2(32, 8)
-jnt_sadMxN_sse2(16, 64)
-jnt_sadMxN_sse2(64, 16)
+dist_wtd_sadMxN_sse2(128, 128)
+dist_wtd_sadMxN_sse2(128, 64)
+dist_wtd_sadMxN_sse2(64, 128)
+dist_wtd_sadMxN_sse2(64, 64)
+dist_wtd_sadMxN_sse2(64, 32)
+dist_wtd_sadMxN_sse2(32, 64)
+dist_wtd_sadMxN_sse2(32, 32)
+dist_wtd_sadMxN_sse2(32, 16)
+dist_wtd_sadMxN_sse2(16, 32)
+dist_wtd_sadMxN_sse2(16, 16)
+dist_wtd_sadMxN_sse2(16, 8)
+dist_wtd_sadMxN_sse2(8, 16)
+dist_wtd_sadMxN_sse2(8, 8)
+dist_wtd_sadMxN_sse2(8, 4)
+dist_wtd_sadMxN_sse2(4, 8)
+dist_wtd_sadMxN_sse2(4, 4)
+dist_wtd_sadMxN_sse2(4, 16)
+dist_wtd_sadMxN_sse2(16, 4)
+dist_wtd_sadMxN_sse2(8, 32)
+dist_wtd_sadMxN_sse2(32, 8)
+dist_wtd_sadMxN_sse2(16, 64)
+dist_wtd_sadMxN_sse2(64, 16)
     /* clang-format on */
diff --git a/libaom/aom_dsp/x86/jnt_variance_ssse3.c b/libaom/aom_dsp/x86/jnt_variance_ssse3.c
index f9a41a2..c8b02f5 100644
--- a/libaom/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/libaom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -29,7 +29,7 @@ void aom_var_filter_block2d_bil_second_pass_ssse3(
     unsigned int pixel_step, unsigned int output_height,
     unsigned int output_width, const uint8_t *filter);
 
-static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
                                         const __m128i *w, const __m128i *r,
                                         void *const result) {
   __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
@@ -45,10 +45,10 @@ static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
   xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
 }
 
-void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
-                                 int width, int height, const uint8_t *ref,
-                                 int ref_stride,
-                                 const JNT_COMP_PARAMS *jcp_param) {
+void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                                      int width, int height, const uint8_t *ref,
+                                      int ref_stride,
+                                      const DIST_WTD_COMP_PARAMS *jcp_param) {
   int i;
   const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
   const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
@@ -67,7 +67,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
         __m128i p0 = xx_loadu_128(ref);
         __m128i p1 = xx_loadu_128(pred);
 
-        compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+        compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
 
         comp_pred += 16;
         pred += 16;
@@ -85,7 +85,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
       __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
       __m128i p1 = xx_loadu_128(pred);
 
-      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+      compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
 
       comp_pred += 16;
       pred += 16;
@@ -107,7 +107,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
                         row3[0], row3[1], row3[2], row3[3]);
       __m128i p1 = xx_loadu_128(pred);
 
-      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+      compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
 
       comp_pred += 16;
       pred += 16;
@@ -116,11 +116,11 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
   }
 }
 
-void aom_jnt_comp_avg_upsampled_pred_ssse3(
+void aom_dist_wtd_comp_avg_upsampled_pred_ssse3(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
+    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
   int n;
   int i;
   aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
@@ -141,52 +141,52 @@ void aom_jnt_comp_avg_upsampled_pred_ssse3(
     __m128i p0 = xx_loadu_128(comp_pred);
     __m128i p1 = xx_loadu_128(pred);
 
-    compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+    compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
 
     comp_pred += 16;
     pred += 16;
   }
 }
 
-#define JNT_SUBPIX_AVG_VAR(W, H)                                         \
-  uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3(              \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,          \
-      const uint8_t *b, int b_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {    \
-    uint16_t fdata3[(H + 1) * W];                                        \
-    uint8_t temp2[H * W];                                                \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                          \
-                                                                         \
-    aom_var_filter_block2d_bil_first_pass_ssse3(                         \
-        a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_var_filter_block2d_bil_second_pass_ssse3(                        \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);        \
-                                                                         \
-    aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W,      \
-                                jcp_param);                              \
-                                                                         \
-    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);            \
+#define DIST_WTD_SUBPIX_AVG_VAR(W, H)                                      \
+  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3(           \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,            \
+      const uint8_t *b, int b_stride, uint32_t *sse,                       \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    uint16_t fdata3[(H + 1) * W];                                          \
+    uint8_t temp2[H * W];                                                  \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                            \
+                                                                           \
+    aom_var_filter_block2d_bil_first_pass_ssse3(                           \
+        a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
+    aom_var_filter_block2d_bil_second_pass_ssse3(                          \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);          \
+                                                                           \
+    aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W,   \
+                                     jcp_param);                           \
+                                                                           \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);              \
   }
 
-JNT_SUBPIX_AVG_VAR(128, 128)
-JNT_SUBPIX_AVG_VAR(128, 64)
-JNT_SUBPIX_AVG_VAR(64, 128)
-JNT_SUBPIX_AVG_VAR(64, 64)
-JNT_SUBPIX_AVG_VAR(64, 32)
-JNT_SUBPIX_AVG_VAR(32, 64)
-JNT_SUBPIX_AVG_VAR(32, 32)
-JNT_SUBPIX_AVG_VAR(32, 16)
-JNT_SUBPIX_AVG_VAR(16, 32)
-JNT_SUBPIX_AVG_VAR(16, 16)
-JNT_SUBPIX_AVG_VAR(16, 8)
-JNT_SUBPIX_AVG_VAR(8, 16)
-JNT_SUBPIX_AVG_VAR(8, 8)
-JNT_SUBPIX_AVG_VAR(8, 4)
-JNT_SUBPIX_AVG_VAR(4, 8)
-JNT_SUBPIX_AVG_VAR(4, 4)
-JNT_SUBPIX_AVG_VAR(4, 16)
-JNT_SUBPIX_AVG_VAR(16, 4)
-JNT_SUBPIX_AVG_VAR(8, 32)
-JNT_SUBPIX_AVG_VAR(32, 8)
-JNT_SUBPIX_AVG_VAR(16, 64)
-JNT_SUBPIX_AVG_VAR(64, 16)
+DIST_WTD_SUBPIX_AVG_VAR(128, 128)
+DIST_WTD_SUBPIX_AVG_VAR(128, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 128)
+DIST_WTD_SUBPIX_AVG_VAR(64, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 64)
+DIST_WTD_SUBPIX_AVG_VAR(32, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 32)
+DIST_WTD_SUBPIX_AVG_VAR(16, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 8)
+DIST_WTD_SUBPIX_AVG_VAR(8, 16)
+DIST_WTD_SUBPIX_AVG_VAR(8, 8)
+DIST_WTD_SUBPIX_AVG_VAR(8, 4)
+DIST_WTD_SUBPIX_AVG_VAR(4, 8)
+DIST_WTD_SUBPIX_AVG_VAR(4, 4)
+DIST_WTD_SUBPIX_AVG_VAR(4, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 4)
+DIST_WTD_SUBPIX_AVG_VAR(8, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 8)
+DIST_WTD_SUBPIX_AVG_VAR(16, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 16)
diff --git a/libaom/aom_dsp/x86/loopfilter_sse2.c b/libaom/aom_dsp/x86/loopfilter_sse2.c
index 26f249e..c021f50 100644
--- a/libaom/aom_dsp/x86/loopfilter_sse2.c
+++ b/libaom/aom_dsp/x86/loopfilter_sse2.c
@@ -16,237 +16,69 @@
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/emmintrin_compat.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
 
 static INLINE __m128i abs_diff(__m128i a, __m128i b) {
   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
 }
 
-static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
-                                             __m128i *x2, __m128i *x3,
-                                             __m128i *d0, __m128i *d1,
-                                             __m128i *d2, __m128i *d3) {
-  // input
-  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
-  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
-  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
-  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
-  // output
-  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-
-  __m128i w0, w1;
-
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
-  *d0 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-
-  *d1 = _mm_srli_si128(*d0,
-                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d2 = _mm_srli_si128(*d0,
-                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d3 = _mm_srli_si128(*d0,
-                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-}
-
-static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                         __m128i *x3, __m128i *d0, __m128i *d1,
-                                         __m128i *d2, __m128i *d3, __m128i *d4,
-                                         __m128i *d5, __m128i *d6,
-                                         __m128i *d7) {
-  // input
-  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
-  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
-  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
-  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
-  // output
-  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
-
-  __m128i w0, w1, ww0, ww1;
-
+// this function treats its input as 2 parallel 8x4 matrices, transposes each of
+// them to 4x8  independently while flipping the second matrix horizontally.
+// Used for 14 taps pq pairs creation
+static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                        __m128i *x3, __m128i *q0p0,
+                                        __m128i *q1p1, __m128i *q2p2,
+                                        __m128i *q3p3, __m128i *q4p4,
+                                        __m128i *q5p5, __m128i *q6p6,
+                                        __m128i *q7p7) {
+  __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
   w0 = _mm_unpacklo_epi8(
       *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
   w1 = _mm_unpacklo_epi8(
       *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  w2 = _mm_unpackhi_epi8(
+      *x0, *x1);  // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
+  w3 = _mm_unpackhi_epi8(
+      *x2, *x3);  // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
 
   ww0 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+      w0, w1);  // 00 10 20 30 01 11 21 31        02 12 22 32 03 13 23 33
   ww1 = _mm_unpackhi_epi16(
-      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-
-  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d1 = _mm_srli_si128(ww0,
-                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d2 = _mm_srli_si128(ww0,
-                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d3 = _mm_srli_si128(ww0,
-                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-
-  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d5 = _mm_srli_si128(ww1,
-                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d6 = _mm_srli_si128(ww1,
-                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d7 = _mm_srli_si128(ww1,
-                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
-}
-
-static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                         __m128i *x3, __m128i *x4, __m128i *x5,
-                                         __m128i *x6, __m128i *x7, __m128i *d0,
-                                         __m128i *d1, __m128i *d2,
-                                         __m128i *d3) {
-  // input
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  // x4 40 41 42 43 44 45 46 47
-  // x5  50 51 52 53 54 55 56 57
-  // x6  60 61 62 63 64 65 66 67
-  // x7 70 71 72 73 74 75 76 77
-  // output
-  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
-  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
-  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
-  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
-
-  __m128i w0, w1, w2, w3, w4, w5;
-
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
-  w2 = _mm_unpacklo_epi8(
-      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-
-  w3 = _mm_unpacklo_epi8(
-      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
-  w4 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpacklo_epi16(
-      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  *d0 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  *d1 = _mm_srli_si128(*d0, 8);
-  *d2 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-  *d3 = _mm_srli_si128(*d2, 8);
-}
-
-static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                     __m128i *x3, __m128i *x4, __m128i *x5,
-                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
-                                     __m128i *d2d3, __m128i *d4d5,
-                                     __m128i *d6d7) {
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
-  // x4 40 41 42 43 44 45 46 47
-  // x5  50 51 52 53 54 55 56 57
-  w2 = _mm_unpacklo_epi8(
-      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-
-  // x6  60 61 62 63 64 65 66 67
-  // x7 70 71 72 73 74 75 76 77
-  w3 = _mm_unpacklo_epi8(
-      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
-  w4 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpacklo_epi16(
-      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  *d0d1 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  *d2d3 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
-  w6 = _mm_unpackhi_epi16(
-      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-  w7 = _mm_unpackhi_epi16(
-      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
-
-  *d4d5 = _mm_unpacklo_epi32(
-      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
-  *d6d7 = _mm_unpackhi_epi32(
-      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-}
+      w0, w1);  // 04 14 24 34 05 15 25 35        06 16 26 36 07 17 27 37
+  ww2 = _mm_unpacklo_epi16(
+      w2, w3);  // 08 18 28 38 09 19 29 39       010 110 210 310 011 111 211 311
+  ww3 = _mm_unpackhi_epi16(
+      w2,
+      w3);  // 012 112 212 312 013 113 213 313  014 114 214 314 015 115 215 315
 
-static INLINE void transpose16x8_8x16_sse2(
-    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
-    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
-    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
-    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
-    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
-  __m128i w10, w11, w12, w13, w14, w15;
-
-  w0 = _mm_unpacklo_epi8(*x0, *x1);
-  w1 = _mm_unpacklo_epi8(*x2, *x3);
-  w2 = _mm_unpacklo_epi8(*x4, *x5);
-  w3 = _mm_unpacklo_epi8(*x6, *x7);
-
-  w8 = _mm_unpacklo_epi8(*x8, *x9);
-  w9 = _mm_unpacklo_epi8(*x10, *x11);
-  w10 = _mm_unpacklo_epi8(*x12, *x13);
-  w11 = _mm_unpacklo_epi8(*x14, *x15);
-
-  w4 = _mm_unpacklo_epi16(w0, w1);
-  w5 = _mm_unpacklo_epi16(w2, w3);
-  w12 = _mm_unpacklo_epi16(w8, w9);
-  w13 = _mm_unpacklo_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store first 4-line result
-  *d0 = _mm_unpacklo_epi64(w6, w14);
-  *d1 = _mm_unpackhi_epi64(w6, w14);
-  *d2 = _mm_unpacklo_epi64(w7, w15);
-  *d3 = _mm_unpackhi_epi64(w7, w15);
-
-  w4 = _mm_unpackhi_epi16(w0, w1);
-  w5 = _mm_unpackhi_epi16(w2, w3);
-  w12 = _mm_unpackhi_epi16(w8, w9);
-  w13 = _mm_unpackhi_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store second 4-line result
-  *d4 = _mm_unpacklo_epi64(w6, w14);
-  *d5 = _mm_unpackhi_epi64(w6, w14);
-  *d6 = _mm_unpacklo_epi64(w7, w15);
-  *d7 = _mm_unpackhi_epi64(w7, w15);
+  *q7p7 = _mm_unpacklo_epi32(
+      ww0,
+      _mm_srli_si128(
+          ww3, 12));  // 00 10 20 30  015 115 215 315  xx xx xx xx xx xx xx xx
+  *q6p6 = _mm_unpackhi_epi32(
+      _mm_slli_si128(ww0, 4),
+      ww3);  // 01 11 21 31  014 114 214 314  xx xx xx xxxx xx xx xx
+  *q5p5 = _mm_unpackhi_epi32(
+      ww0,
+      _mm_slli_si128(
+          ww3, 4));  // 02 12 22 32  013 113 213 313  xx xx xx x xx xx xx xxx
+  *q4p4 = _mm_unpacklo_epi32(
+      _mm_srli_si128(ww0, 12),
+      ww3);  // 03 13 23 33  012 112 212 312 xx xx xx xx xx xx xx xx
+  *q3p3 = _mm_unpacklo_epi32(
+      ww1,
+      _mm_srli_si128(
+          ww2, 12));  // 04 14 24 34  011 111 211 311 xx xx xx xx xx xx xx xx
+  *q2p2 = _mm_unpackhi_epi32(
+      _mm_slli_si128(ww1, 4),
+      ww2);  // 05 15 25 35   010 110 210 310 xx xx xx xx xx xx xx xx
+  *q1p1 = _mm_unpackhi_epi32(
+      ww1,
+      _mm_slli_si128(
+          ww2, 4));  // 06 16 26 36   09 19 29 39     xx xx xx xx xx xx xx xx
+  *q0p0 = _mm_unpacklo_epi32(
+      _mm_srli_si128(ww1, 12),
+      ww2);  // 07 17 27 37  08 18 28 38     xx xx xx xx xx xx xx xx
 }
 
 // this function treats its input as 2 parallel 8x4 matrices, transposes each of
@@ -306,116 +138,6 @@ static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
   *pq3 = _mm_unpackhi_epi64(d2, d3);  // pq
 }
 
-static INLINE void transpose8x16_16x8_sse2(
-    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
-    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
-    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
-    __m128i *d12d13, __m128i *d14d15) {
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
-  __m128i w10, w11, w12, w13, w14, w15;
-
-  w0 = _mm_unpacklo_epi8(*x0, *x1);
-  w1 = _mm_unpacklo_epi8(*x2, *x3);
-  w2 = _mm_unpacklo_epi8(*x4, *x5);
-  w3 = _mm_unpacklo_epi8(*x6, *x7);
-
-  w8 = _mm_unpackhi_epi8(*x0, *x1);
-  w9 = _mm_unpackhi_epi8(*x2, *x3);
-  w10 = _mm_unpackhi_epi8(*x4, *x5);
-  w11 = _mm_unpackhi_epi8(*x6, *x7);
-
-  w4 = _mm_unpacklo_epi16(w0, w1);
-  w5 = _mm_unpacklo_epi16(w2, w3);
-  w12 = _mm_unpacklo_epi16(w8, w9);
-  w13 = _mm_unpacklo_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store first 4-line result
-  *d0d1 = _mm_unpacklo_epi64(w6, w14);
-  *d2d3 = _mm_unpackhi_epi64(w6, w14);
-  *d4d5 = _mm_unpacklo_epi64(w7, w15);
-  *d6d7 = _mm_unpackhi_epi64(w7, w15);
-
-  w4 = _mm_unpackhi_epi16(w0, w1);
-  w5 = _mm_unpackhi_epi16(w2, w3);
-  w12 = _mm_unpackhi_epi16(w8, w9);
-  w13 = _mm_unpackhi_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store second 4-line result
-  *d8d9 = _mm_unpacklo_epi64(w6, w14);
-  *d10d11 = _mm_unpackhi_epi64(w6, w14);
-  *d12d13 = _mm_unpacklo_epi64(w7, w15);
-  *d14d15 = _mm_unpackhi_epi64(w7, w15);
-}
-
-// this function treats its input as 2 parallel 8x4 matrices, transposes each of
-// them to 4x8  independently while flipping the second matrix horizontaly. Used
-// for 14 taps pq pairs creation
-static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                        __m128i *x3, __m128i *q0p0,
-                                        __m128i *q1p1, __m128i *q2p2,
-                                        __m128i *q3p3, __m128i *q4p4,
-                                        __m128i *q5p5, __m128i *q6p6,
-                                        __m128i *q7p7) {
-  __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  w2 = _mm_unpackhi_epi8(
-      *x0, *x1);  // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
-  w3 = _mm_unpackhi_epi8(
-      *x2, *x3);  // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
-
-  ww0 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31        02 12 22 32 03 13 23 33
-  ww1 = _mm_unpackhi_epi16(
-      w0, w1);  // 04 14 24 34 05 15 25 35        06 16 26 36 07 17 27 37
-  ww2 = _mm_unpacklo_epi16(
-      w2, w3);  // 08 18 28 38 09 19 29 39       010 110 210 310 011 111 211 311
-  ww3 = _mm_unpackhi_epi16(
-      w2,
-      w3);  // 012 112 212 312 013 113 213 313  014 114 214 314 015 115 215 315
-
-  *q7p7 = _mm_unpacklo_epi32(
-      ww0,
-      _mm_srli_si128(
-          ww3, 12));  // 00 10 20 30  015 115 215 315  xx xx xx xx xx xx xx xx
-  *q6p6 = _mm_unpackhi_epi32(
-      _mm_slli_si128(ww0, 4),
-      ww3);  // 01 11 21 31  014 114 214 314  xx xx xx xxxx xx xx xx
-  *q5p5 = _mm_unpackhi_epi32(
-      ww0,
-      _mm_slli_si128(
-          ww3, 4));  // 02 12 22 32  013 113 213 313  xx xx xx x xx xx xx xxx
-  *q4p4 = _mm_unpacklo_epi32(
-      _mm_srli_si128(ww0, 12),
-      ww3);  // 03 13 23 33  012 112 212 312 xx xx xx xx xx xx xx xx
-  *q3p3 = _mm_unpacklo_epi32(
-      ww1,
-      _mm_srli_si128(
-          ww2, 12));  // 04 14 24 34  011 111 211 311 xx xx xx xx xx xx xx xx
-  *q2p2 = _mm_unpackhi_epi32(
-      _mm_slli_si128(ww1, 4),
-      ww2);  // 05 15 25 35   010 110 210 310 xx xx xx xx xx xx xx xx
-  *q1p1 = _mm_unpackhi_epi32(
-      ww1,
-      _mm_slli_si128(
-          ww2, 4));  // 06 16 26 36   09 19 29 39     xx xx xx xx xx xx xx xx
-  *q0p0 = _mm_unpacklo_epi32(
-      _mm_srli_si128(ww1, 12),
-      ww2);  // 07 17 27 37  08 18 28 38     xx xx xx xx xx xx xx xx
-}
-
 static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
                                           __m128i *hev, __m128i *mask,
                                           __m128i *qs1qs0, __m128i *ps1ps0) {
diff --git a/libaom/aom_dsp/x86/lpf_common_sse2.h b/libaom/aom_dsp/x86/lpf_common_sse2.h
index 8970fe7..6ed2cbf 100644
--- a/libaom/aom_dsp/x86/lpf_common_sse2.h
+++ b/libaom/aom_dsp/x86/lpf_common_sse2.h
@@ -212,4 +212,284 @@ static INLINE void highbd_transpose8x16_sse2(
                            d4 + 1, d5 + 1, d6 + 1, d7 + 1);
 }
 
+// Low bit depth functions
+static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+                                             __m128i *x2, __m128i *x3,
+                                             __m128i *d0, __m128i *d1,
+                                             __m128i *d2, __m128i *d3) {
+  // input
+  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+  // output
+  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  __m128i w0, w1;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  *d0 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+
+  *d1 = _mm_srli_si128(*d0,
+                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(*d0,
+                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(*d0,
+                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                         __m128i *x3, __m128i *d0, __m128i *d1,
+                                         __m128i *d2, __m128i *d3, __m128i *d4,
+                                         __m128i *d5, __m128i *d6,
+                                         __m128i *d7) {
+  // input
+  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+  // output
+  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  __m128i w0, w1, ww0, ww1;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  ww0 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  ww1 = _mm_unpackhi_epi16(
+      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+
+  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d1 = _mm_srli_si128(ww0,
+                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(ww0,
+                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(ww0,
+                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d5 = _mm_srli_si128(ww1,
+                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d6 = _mm_srli_si128(ww1,
+                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d7 = _mm_srli_si128(ww1,
+                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                         __m128i *x3, __m128i *x4, __m128i *x5,
+                                         __m128i *x6, __m128i *x7, __m128i *d0,
+                                         __m128i *d1, __m128i *d2,
+                                         __m128i *d3) {
+  // input
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5  50 51 52 53 54 55 56 57
+  // x6  60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  // output
+  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
+  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+  __m128i w0, w1, w2, w3, w4, w5;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  w2 = _mm_unpacklo_epi8(
+      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+  w3 = _mm_unpacklo_epi8(
+      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+  w4 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  w5 = _mm_unpacklo_epi16(
+      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+  *d0 = _mm_unpacklo_epi32(
+      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  *d1 = _mm_srli_si128(*d0, 8);
+  *d2 = _mm_unpackhi_epi32(
+      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  *d3 = _mm_srli_si128(*d2, 8);
+}
+
+static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                     __m128i *x3, __m128i *x4, __m128i *x5,
+                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
+                                     __m128i *d2d3, __m128i *d4d5,
+                                     __m128i *d6d7) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  // x4 40 41 42 43 44 45 46 47
+  // x5  50 51 52 53 54 55 56 57
+  w2 = _mm_unpacklo_epi8(
+      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+  // x6  60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  w3 = _mm_unpacklo_epi8(
+      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+  w4 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  w5 = _mm_unpacklo_epi16(
+      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+  *d0d1 = _mm_unpacklo_epi32(
+      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  *d2d3 = _mm_unpackhi_epi32(
+      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+  w6 = _mm_unpackhi_epi16(
+      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+  w7 = _mm_unpackhi_epi16(
+      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+
+  *d4d5 = _mm_unpacklo_epi32(
+      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+  *d6d7 = _mm_unpackhi_epi32(
+      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+}
+
+static INLINE void transpose16x8_8x16_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
+    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
+    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
+    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+  __m128i w10, w11, w12, w13, w14, w15;
+
+  w0 = _mm_unpacklo_epi8(*x0, *x1);
+  w1 = _mm_unpacklo_epi8(*x2, *x3);
+  w2 = _mm_unpacklo_epi8(*x4, *x5);
+  w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+  w8 = _mm_unpacklo_epi8(*x8, *x9);
+  w9 = _mm_unpacklo_epi8(*x10, *x11);
+  w10 = _mm_unpacklo_epi8(*x12, *x13);
+  w11 = _mm_unpacklo_epi8(*x14, *x15);
+
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  *d0 = _mm_unpacklo_epi64(w6, w14);
+  *d1 = _mm_unpackhi_epi64(w6, w14);
+  *d2 = _mm_unpacklo_epi64(w7, w15);
+  *d3 = _mm_unpackhi_epi64(w7, w15);
+
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  *d4 = _mm_unpacklo_epi64(w6, w14);
+  *d5 = _mm_unpackhi_epi64(w6, w14);
+  *d6 = _mm_unpacklo_epi64(w7, w15);
+  *d7 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static INLINE void transpose8x16_16x8_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
+    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
+    __m128i *d12d13, __m128i *d14d15) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+  __m128i w10, w11, w12, w13, w14, w15;
+
+  w0 = _mm_unpacklo_epi8(*x0, *x1);
+  w1 = _mm_unpacklo_epi8(*x2, *x3);
+  w2 = _mm_unpacklo_epi8(*x4, *x5);
+  w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+  w8 = _mm_unpackhi_epi8(*x0, *x1);
+  w9 = _mm_unpackhi_epi8(*x2, *x3);
+  w10 = _mm_unpackhi_epi8(*x4, *x5);
+  w11 = _mm_unpackhi_epi8(*x6, *x7);
+
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  *d0d1 = _mm_unpacklo_epi64(w6, w14);
+  *d2d3 = _mm_unpackhi_epi64(w6, w14);
+  *d4d5 = _mm_unpacklo_epi64(w7, w15);
+  *d6d7 = _mm_unpackhi_epi64(w7, w15);
+
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  *d8d9 = _mm_unpacklo_epi64(w6, w14);
+  *d10d11 = _mm_unpackhi_epi64(w6, w14);
+  *d12d13 = _mm_unpacklo_epi64(w7, w15);
+  *d14d15 = _mm_unpackhi_epi64(w7, w15);
+}
+
 #endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/libaom/aom_dsp/x86/quantize_sse2.c b/libaom/aom_dsp/x86/quantize_sse2.c
index d3de6e2..ebef1fb 100644
--- a/libaom/aom_dsp/x86/quantize_sse2.c
+++ b/libaom/aom_dsp/x86/quantize_sse2.c
@@ -18,28 +18,6 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/quantize_x86.h"
 
-static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
-  assert(sizeof(tran_low_t) == 4);
-
-  return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
-                        (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
-                        (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
-                        (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-}
-
-static INLINE void store_coefficients(__m128i coeff_vals,
-                                      tran_low_t *coeff_ptr) {
-  assert(sizeof(tran_low_t) == 4);
-
-  __m128i one = _mm_set1_epi16(1);
-  __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
-  __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
-  __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
-  __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
-  _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
-  _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-}
-
 void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *zbin_ptr, const int16_t *round_ptr,
                          const int16_t *quant_ptr,
diff --git a/libaom/aom_dsp/x86/quantize_ssse3.c b/libaom/aom_dsp/x86/quantize_ssse3.c
new file mode 100644
index 0000000..25980a0
--- /dev/null
+++ b/libaom/aom_dsp/x86/quantize_ssse3.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round,
+                                          const __m128i quant,
+                                          const __m128i *shift) {
+  __m128i tmp, qcoeff, tmp1;
+  qcoeff = _mm_adds_epi16(*coeff, round);
+  tmp = _mm_mulhi_epi16(qcoeff, quant);
+  qcoeff = _mm_add_epi16(tmp, qcoeff);
+  tmp = _mm_mullo_epi16(qcoeff, *shift);
+  tmp = _mm_srli_epi16(tmp, 14);
+  tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+  tmp1 = _mm_slli_epi16(tmp1, 2);
+  *coeff = _mm_or_si128(tmp, tmp1);
+}
+
+static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff,
+                                                     const __m128i dequant,
+                                                     const __m128i zero,
+                                                     tran_low_t *dqcoeff) {
+  // Un-sign to bias rounding like C.
+  const __m128i coeff = _mm_abs_epi16(qcoeff);
+
+  const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
+  const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
+
+  const __m128i low = _mm_mullo_epi16(coeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+  __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  // "Divide" by 4.
+  dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2);
+  dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2);
+
+  dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
+  dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
+
+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
+void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i two = _mm_set1_epi16(2);
+  int index;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1, all_zero;
+  __m128i eob = zero, eob0;
+
+  (void)scan;
+  (void)n_coeffs;
+
+  // Setup global values.
+  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  round = _mm_load_si128((const __m128i *)round_ptr);
+  quant = _mm_load_si128((const __m128i *)quant_ptr);
+  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+  // Shift with rounding.
+  zbin = _mm_add_epi16(zbin, two);
+  round = _mm_add_epi16(round, two);
+  zbin = _mm_srli_epi16(zbin, 2);
+  round = _mm_srli_epi16(round, 2);
+  zbin = _mm_sub_epi16(zbin, one);
+  // Do DC and first 15 AC.
+  coeff0 = load_coefficients(coeff_ptr);
+  coeff1 = load_coefficients(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_movemask_epi8(all_zero) == 0) {
+    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+    // Reinsert signs.
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    // Mask out zbin threshold coeffs.
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr);
+    store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+    eob =
+        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  }
+
+  // AC only loop.
+  for (index = 16; index < 1024; index += 16) {
+    coeff0 = load_coefficients(coeff_ptr + index);
+    coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_movemask_epi8(all_zero) == 0) {
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+      continue;
+    }
+    calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+    calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr + index);
+    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero,
+                                      dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero,
+                                      dqcoeff_ptr + 8 + index);
+
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = _mm_max_epi16(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/libaom/aom_dsp/x86/quantize_x86.h b/libaom/aom_dsp/x86/quantize_x86.h
index 4eed7dd..b2de01b 100644
--- a/libaom/aom_dsp/x86/quantize_x86.h
+++ b/libaom/aom_dsp/x86/quantize_x86.h
@@ -32,6 +32,11 @@ static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
   return _mm_sub_epi16(a, sign);
 }
 
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+  a = _mm_xor_si128(a, sign);
+  return _mm_sub_epi32(a, sign);
+}
+
 static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
                                     const __m128i quant, const __m128i shift) {
   __m128i tmp, qcoeff;
@@ -41,10 +46,53 @@ static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
   *coeff = _mm_mulhi_epi16(qcoeff, shift);
 }
 
+static INLINE void calculate_qcoeff_log_scale(__m128i *coeff,
+                                              const __m128i round,
+                                              const __m128i quant,
+                                              const __m128i *shift,
+                                              const int *log_scale) {
+  __m128i tmp, tmp1, qcoeff;
+  qcoeff = _mm_adds_epi16(*coeff, round);
+  tmp = _mm_mulhi_epi16(qcoeff, quant);
+  qcoeff = _mm_add_epi16(tmp, qcoeff);
+  tmp = _mm_mullo_epi16(qcoeff, *shift);
+  tmp = _mm_srli_epi16(tmp, (16 - *log_scale));
+  tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+  tmp1 = _mm_slli_epi16(tmp1, *log_scale);
+  *coeff = _mm_or_si128(tmp, tmp1);
+}
+
 static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
   return _mm_mullo_epi16(qcoeff, dequant);
 }
 
+static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff,
+                                                         __m128i dequant,
+                                                         const __m128i zero,
+                                                         tran_low_t *dqcoeff,
+                                                         const int *log_scale) {
+  // calculate abs
+  __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15);
+  __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign);
+
+  const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, zero);
+  const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, zero);
+
+  const __m128i low = _mm_mullo_epi16(coeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+  __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale);
+  dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale);
+
+  dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0);
+  dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1);
+
+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
 // Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
 // to zbin to add 1 to the index in 'scan'.
 static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
@@ -75,3 +123,23 @@ static INLINE int16_t accumulate_eob(__m128i eob) {
   eob = _mm_max_epi16(eob, eob_shuffled);
   return _mm_extract_epi16(eob, 1);
 }
+
+static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+  assert(sizeof(tran_low_t) == 4);
+  const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr));
+  const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+  return _mm_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void store_coefficients(__m128i coeff_vals,
+                                      tran_low_t *coeff_ptr) {
+  assert(sizeof(tran_low_t) == 4);
+
+  __m128i one = _mm_set1_epi16(1);
+  __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+  __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+  __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+  __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+  _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+  _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
+}
diff --git a/libaom/aom_dsp/x86/sse_avx2.c b/libaom/aom_dsp/x86/sse_avx2.c
index fa45687..42df981 100644
--- a/libaom/aom_dsp/x86/sse_avx2.c
+++ b/libaom/aom_dsp/x86/sse_avx2.c
@@ -21,12 +21,11 @@ static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
                                 const uint8_t *b) {
   const __m256i v_a0 = yy_loadu_256(a);
   const __m256i v_b0 = yy_loadu_256(b);
-  const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0));
-  const __m256i v_a01_w =
-      _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1));
-  const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0));
-  const __m256i v_b01_w =
-      _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1));
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
+  const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
+  const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
+  const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
   const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
   const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
@@ -35,15 +34,13 @@ static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
 
 static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
   int64_t sum;
-  const __m256i sum0_4x64 =
-      _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all));
-  const __m256i sum1_4x64 =
-      _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1));
+  __m256i zero = _mm256_setzero_si256();
+  const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
+  const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
   const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
   const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
                                          _mm256_extracti128_si256(sum_4x64, 1));
   const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
-
   xx_storel_64(&sum, sum_1x64);
   return sum;
 }
@@ -86,7 +83,6 @@ static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
 }
-
 static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride, __m256i *sum) {
   const __m128i v_a0 = xx_loadl_64(a);
@@ -98,12 +94,12 @@ static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
 }
-
 int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
                      int b_stride, int width, int height) {
   int32_t y = 0;
   int64_t sse = 0;
   __m256i sum = _mm256_setzero_si256();
+  __m256i zero = _mm256_setzero_si256();
   switch (width) {
     case 4:
       do {
@@ -126,14 +122,26 @@ int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
     case 16:
       do {
         const __m128i v_a0 = xx_loadu_128(a);
+        const __m128i v_a1 = xx_loadu_128(a + a_stride);
         const __m128i v_b0 = xx_loadu_128(b);
-        const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0);
-        const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0);
-        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
-        a += a_stride;
-        b += b_stride;
-        y += 1;
+        const __m128i v_b1 = xx_loadu_128(b + b_stride);
+        const __m256i v_a =
+            _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
+        const __m256i v_b =
+            _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
+        const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
+        const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
+        const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
+        const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
+        const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
+        const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
+        const __m256i temp =
+            _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
+                             _mm256_madd_epi16(v_bsub, v_bsub));
+        sum = _mm256_add_epi32(sum, temp);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
       } while (y < height);
       sse = summary_all_avx2(&sum);
       break;
diff --git a/libaom/aom_dsp/x86/txfm_common_avx2.h b/libaom/aom_dsp/x86/txfm_common_avx2.h
index 8a40508..06a77e7 100644
--- a/libaom/aom_dsp/x86/txfm_common_avx2.h
+++ b/libaom/aom_dsp/x86/txfm_common_avx2.h
@@ -168,6 +168,36 @@ static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
   out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
 }
 
+static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in,
+                                             __m256i *const out) {
+  const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]);
+  const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]);
+  const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]);
+  const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]);
+  const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]);
+  const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]);
+  const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]);
+  const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+  const __m256i b0 = _mm256_unpacklo_epi32(a0, a1);
+  const __m256i b1 = _mm256_unpacklo_epi32(a2, a3);
+  const __m256i b2 = _mm256_unpacklo_epi32(a4, a5);
+  const __m256i b3 = _mm256_unpacklo_epi32(a6, a7);
+  const __m256i b4 = _mm256_unpackhi_epi32(a0, a1);
+  const __m256i b5 = _mm256_unpackhi_epi32(a2, a3);
+  const __m256i b6 = _mm256_unpackhi_epi32(a4, a5);
+  const __m256i b7 = _mm256_unpackhi_epi32(a6, a7);
+
+  out[0] = _mm256_unpacklo_epi64(b0, b1);
+  out[1] = _mm256_unpackhi_epi64(b0, b1);
+  out[2] = _mm256_unpacklo_epi64(b4, b5);
+  out[3] = _mm256_unpackhi_epi64(b4, b5);
+  out[4] = _mm256_unpacklo_epi64(b2, b3);
+  out[5] = _mm256_unpackhi_epi64(b2, b3);
+  out[6] = _mm256_unpacklo_epi64(b6, b7);
+  out[7] = _mm256_unpackhi_epi64(b6, b7);
+}
+
 static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
   for (int i = 0; i < size; ++i) {
     out[size - i - 1] = in[i];
@@ -236,6 +266,66 @@ static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input,
   }
 }
 
+static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
+  const __m256i scale_rounding =
+      pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
+  const __m256i b = _mm256_madd_epi16(a, scale_rounding);
+  return _mm256_srai_epi32(b, NewSqrt2Bits);
+}
+
+static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a,
+                                                     int32_t *const b) {
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i a_lo = _mm256_unpacklo_epi16(a, one);
+  const __m256i a_hi = _mm256_unpackhi_epi16(a, one);
+  const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
+  const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
+  const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31);
+  _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo));
+  _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi));
+  _mm256_store_si256((__m256i *)(b + 64), temp);
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2(
+    const __m256i *const in, int32_t *const out, const int stride,
+    const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride);
+  }
+}
+
+static INLINE void pack_reg(const __m128i *in1, const __m128i *in2,
+                            __m256i *out) {
+  out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1);
+  out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1);
+  out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1);
+  out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1);
+  out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1);
+  out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1);
+  out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1);
+  out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1);
+}
+
+static INLINE void extract_reg(const __m256i *in, __m128i *out1) {
+  out1[0] = _mm256_castsi256_si128(in[0]);
+  out1[1] = _mm256_castsi256_si128(in[1]);
+  out1[2] = _mm256_castsi256_si128(in[2]);
+  out1[3] = _mm256_castsi256_si128(in[3]);
+  out1[4] = _mm256_castsi256_si128(in[4]);
+  out1[5] = _mm256_castsi256_si128(in[5]);
+  out1[6] = _mm256_castsi256_si128(in[6]);
+  out1[7] = _mm256_castsi256_si128(in[7]);
+
+  out1[8] = _mm256_extracti128_si256(in[0], 0x01);
+  out1[9] = _mm256_extracti128_si256(in[1], 0x01);
+  out1[10] = _mm256_extracti128_si256(in[2], 0x01);
+  out1[11] = _mm256_extracti128_si256(in[3], 0x01);
+  out1[12] = _mm256_extracti128_si256(in[4], 0x01);
+  out1[13] = _mm256_extracti128_si256(in[5], 0x01);
+  out1[14] = _mm256_extracti128_si256(in[6], 0x01);
+  out1[15] = _mm256_extracti128_si256(in[7], 0x01);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/libaom/aom_dsp/x86/variance_sse2.c b/libaom/aom_dsp/x86/variance_sse2.c
index c831e3e..f3efc15 100644
--- a/libaom/aom_dsp/x86/variance_sse2.c
+++ b/libaom/aom_dsp/x86/variance_sse2.c
@@ -494,7 +494,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
     const int ref_num = 0;
     const int is_intrabc = is_intrabc_block(mi);
     const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
     const int is_scaled = av1_is_scaled(sf);
 
     if (is_scaled) {
diff --git a/libaom/aom_ports/mem.h b/libaom/aom_ports/mem.h
index 3ffea3c..9e3d424 100644
--- a/libaom/aom_ports/mem.h
+++ b/libaom/aom_ports/mem.h
@@ -66,4 +66,34 @@
 #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
 #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
 
+/*!\brief force enum to be unsigned 1 byte*/
+#define UENUM1BYTE(enumvar) \
+  ;                         \
+  typedef uint8_t enumvar
+
+/*!\brief force enum to be signed 1 byte*/
+#define SENUM1BYTE(enumvar) \
+  ;                         \
+  typedef int8_t enumvar
+
+/*!\brief force enum to be unsigned 2 byte*/
+#define UENUM2BYTE(enumvar) \
+  ;                         \
+  typedef uint16_t enumvar
+
+/*!\brief force enum to be signed 2 byte*/
+#define SENUM2BYTE(enumvar) \
+  ;                         \
+  typedef int16_t enumvar
+
+/*!\brief force enum to be unsigned 4 byte*/
+#define UENUM4BYTE(enumvar) \
+  ;                         \
+  typedef uint32_t enumvar
+
+/*!\brief force enum to be unsigned 4 byte*/
+#define SENUM4BYTE(enumvar) \
+  ;                         \
+  typedef int32_t enumvar
+
 #endif  // AOM_AOM_PORTS_MEM_H_
diff --git a/libaom/aom_ports/x86.h b/libaom/aom_ports/x86.h
index 52ee49c..8c18448 100644
--- a/libaom/aom_ports/x86.h
+++ b/libaom/aom_ports/x86.h
@@ -222,11 +222,26 @@ static INLINE int x86_simd_caps(void) {
   return flags & mask;
 }
 
-// Note:
-//  32-bit CPU cycle counter is light-weighted for most function performance
-//  measurement. For large function (CPU time > a couple of seconds), 64-bit
-//  counter should be used.
-// 32-bit CPU cycle counter
+// Fine-Grain Measurement Functions
+//
+// If you are a timing a small region of code, access the timestamp counter
+// (TSC) via:
+//
+// unsigned int start = x86_tsc_start();
+//   ...
+// unsigned int end = x86_tsc_end();
+// unsigned int diff = end - start;
+//
+// The start/end functions introduce a few more instructions than using
+// x86_readtsc directly, but prevent the CPU's out-of-order execution from
+// affecting the measurement (by having earlier/later instructions be evaluated
+// in the time interval). See the white paper, "How to Benchmark Code
+// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by
+// Gabriele Paoloni for more information.
+//
+// If you are timing a large function (CPU time > a couple of seconds), use
+// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The
+// out-of-order leakage that can occur is minimal compared to total runtime.
 static INLINE unsigned int x86_readtsc(void) {
 #if defined(__GNUC__) && __GNUC__
   unsigned int tsc;
@@ -263,6 +278,41 @@ static INLINE uint64_t x86_readtsc64(void) {
 #endif
 }
 
+// 32-bit CPU cycle counter with a partial fence against out-of-order execution.
+static INLINE unsigned int x86_readtscp(void) {
+#if defined(__GNUC__) && __GNUC__
+  unsigned int tscp;
+  __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :);
+  return tscp;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+  unsigned int tscp;
+  asm volatile("rdtscp\n\t" : "=a"(tscp) :);
+  return tscp;
+#elif defined(_MSC_VER)
+  unsigned int ui;
+  return (unsigned int)__rdtscp(&ui);
+#else
+#if ARCH_X86_64
+  return (unsigned int)__rdtscp();
+#else
+  __asm rdtscp;
+#endif
+#endif
+}
+
+static INLINE unsigned int x86_tsc_start(void) {
+  unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  return x86_readtsc();
+}
+
+static INLINE unsigned int x86_tsc_end(void) {
+  uint32_t v = x86_readtscp();
+  unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  return v;
+}
+
 #if defined(__GNUC__) && __GNUC__
 #define x86_pause_hint() __asm__ __volatile__("pause \n\t")
 #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
diff --git a/libaom/aom_scale/aom_scale.cmake b/libaom/aom_scale/aom_scale.cmake
index 197dea6..3199733 100644
--- a/libaom/aom_scale/aom_scale.cmake
+++ b/libaom/aom_scale/aom_scale.cmake
@@ -34,5 +34,9 @@ function(setup_aom_scale_targets)
                                   "AOM_SCALE_INTRIN_DSPR2" "aom")
   endif()
 
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
+
+  # Pass the new lib targets up to the parent scope instance of
+  # $AOM_LIB_TARGETS.
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_scale PARENT_SCOPE)
 endfunction()
diff --git a/libaom/aom_scale/aom_scale_rtcd.pl b/libaom/aom_scale/aom_scale_rtcd.pl
index 27378c7..eef6f16 100644
--- a/libaom/aom_scale/aom_scale_rtcd.pl
+++ b/libaom/aom_scale/aom_scale_rtcd.pl
@@ -26,6 +26,8 @@ if (aom_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") {
   add_proto qw/void aom_vertical_band_2_1_scale_i/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
 }
 
+add_proto qw/int aom_yv12_realloc_with_new_border/, "struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes";
+
 add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
 
 add_proto qw/void aom_yv12_copy_frame/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes";
diff --git a/libaom/aom_scale/generic/yv12config.c b/libaom/aom_scale/generic/yv12config.c
index 7cf3c4f..a5ad1a7 100644
--- a/libaom/aom_scale/generic/yv12config.c
+++ b/libaom/aom_scale/generic/yv12config.c
@@ -46,37 +46,16 @@ int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
   return 0;
 }
 
-int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
-                             int ss_x, int ss_y, int use_highbitdepth,
-                             int border, int byte_alignment,
-                             aom_codec_frame_buffer_t *fb,
-                             aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
-#if CONFIG_SIZE_LIMIT
-  if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
-#endif
-
-  /* Only support allocating buffers that have a border that's a multiple
-   * of 32. The border restriction is required to get 16-byte alignment of
-   * the start of the chroma rows without introducing an arbitrary gap
-   * between planes, which would break the semantics of things like
-   * aom_img_set_rect(). */
-  if (border & 0x1f) return -3;
-
+static int realloc_frame_buffer_aligned(
+    YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y,
+    int use_highbitdepth, int border, int byte_alignment,
+    aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb,
+    void *cb_priv, const int y_stride, const uint64_t yplane_size,
+    const uint64_t uvplane_size, const int aligned_width,
+    const int aligned_height, const int uv_width, const int uv_height,
+    const int uv_stride, const int uv_border_w, const int uv_border_h) {
   if (ybf) {
     const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
-    const int aligned_width = (width + 7) & ~7;
-    const int aligned_height = (height + 7) & ~7;
-    const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
-    const uint64_t yplane_size =
-        (aligned_height + 2 * border) * (uint64_t)y_stride + byte_alignment;
-    const int uv_width = aligned_width >> ss_x;
-    const int uv_height = aligned_height >> ss_y;
-    const int uv_stride = y_stride >> ss_x;
-    const int uv_border_w = border >> ss_x;
-    const int uv_border_h = border >> ss_y;
-    const uint64_t uvplane_size =
-        (uv_height + 2 * uv_border_h) * (uint64_t)uv_stride + byte_alignment;
-
     const uint64_t frame_size =
         (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
 
@@ -120,6 +99,7 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
       // Allocation to hold larger frame, or first allocation.
       aom_free(ybf->buffer_alloc);
       ybf->buffer_alloc = NULL;
+      ybf->buffer_alloc_sz = 0;
 
       if (frame_size != (size_t)frame_size) return -1;
 
@@ -190,6 +170,111 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
   return -2;
 }
 
+static int calc_stride_and_planesize(const int ss_x, const int ss_y,
+                                     const int aligned_width,
+                                     const int aligned_height, const int border,
+                                     const int byte_alignment, int *y_stride,
+                                     int *uv_stride, uint64_t *yplane_size,
+                                     uint64_t *uvplane_size,
+                                     const int uv_height) {
+  /* Only support allocating buffers that have a border that's a multiple
+   * of 32. The border restriction is required to get 16-byte alignment of
+   * the start of the chroma rows without introducing an arbitrary gap
+   * between planes, which would break the semantics of things like
+   * aom_img_set_rect(). */
+  if (border & 0x1f) return -3;
+  *y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+  *yplane_size =
+      (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment;
+
+  *uv_stride = *y_stride >> ss_x;
+  *uvplane_size = (uv_height + 2 * (border >> ss_y)) * (uint64_t)(*uv_stride) +
+                  byte_alignment;
+  return 0;
+}
+
+int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+                             int ss_x, int ss_y, int use_highbitdepth,
+                             int border, int byte_alignment,
+                             aom_codec_frame_buffer_t *fb,
+                             aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
+#if CONFIG_SIZE_LIMIT
+  if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
+#endif
+
+  if (ybf) {
+    int y_stride = 0;
+    int uv_stride = 0;
+    uint64_t yplane_size = 0;
+    uint64_t uvplane_size = 0;
+    const int aligned_width = (width + 7) & ~7;
+    const int aligned_height = (height + 7) & ~7;
+    const int uv_width = aligned_width >> ss_x;
+    const int uv_height = aligned_height >> ss_y;
+    const int uv_border_w = border >> ss_x;
+    const int uv_border_h = border >> ss_y;
+
+    int error = calc_stride_and_planesize(
+        ss_x, ss_y, aligned_width, aligned_height, border, byte_alignment,
+        &y_stride, &uv_stride, &yplane_size, &uvplane_size, uv_height);
+    if (error) return error;
+    return realloc_frame_buffer_aligned(
+        ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
+        byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size,
+        aligned_width, aligned_height, uv_width, uv_height, uv_stride,
+        uv_border_w, uv_border_h);
+  }
+  return -2;
+}
+
+// TODO(anyone): This function allocates memory for
+// lookahead buffer considering height and width is
+// aligned to 128. Currently variance calculation of
+// simple_motion_search_get_best_ref() function is done
+// for full sb size (i.e integral multiple of max sb
+// size = 128 or 64). Hence partial sbs need up to 127
+// pixels beyond frame boundary. 128 aligned limitation of
+// lookahead buffer can be removed if variance calculation
+// is adjusted for partial sbs
+
+// NOTE: Chroma width and height need not be aligned to
+// 128 since variance calculation happens only for luma plane
+int aom_realloc_lookahead_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+                                 int ss_x, int ss_y, int use_highbitdepth,
+                                 int border, int byte_alignment,
+                                 aom_codec_frame_buffer_t *fb,
+                                 aom_get_frame_buffer_cb_fn_t cb,
+                                 void *cb_priv) {
+  if (ybf) {
+    int y_stride = 0;
+    int uv_stride = 0;
+    uint64_t yplane_size = 0;
+    uint64_t uvplane_size = 0;
+    const int aligned_128_width = (width + 127) & ~127;
+    const int aligned_128_height = (height + 127) & ~127;
+    const int aligned_width = (width + 7) & ~7;
+    const int aligned_height = (height + 7) & ~7;
+    const int uv_64_height = aligned_128_height >> ss_y;
+    const int uv_width = aligned_width >> ss_x;
+    const int uv_height = aligned_height >> ss_y;
+    const int uv_border_w = border >> ss_x;
+    const int uv_border_h = border >> ss_y;
+
+    int error = calc_stride_and_planesize(
+        ss_x, ss_y, aligned_128_width, aligned_128_height, border,
+        byte_alignment, &y_stride, &uv_stride, &yplane_size, &uvplane_size,
+        uv_64_height);
+    if (error) return error;
+
+    return realloc_frame_buffer_aligned(
+        ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
+        byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size,
+        aligned_width, aligned_height, uv_width, uv_height, uv_stride,
+        uv_border_w, uv_border_h);
+  }
+  return -2;
+}
+
 int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                            int ss_x, int ss_y, int use_highbitdepth, int border,
                            int byte_alignment) {
diff --git a/libaom/aom_scale/generic/yv12extend.c b/libaom/aom_scale/generic/yv12extend.c
index 127ca23..6e9cfff 100644
--- a/libaom/aom_scale/generic/yv12extend.c
+++ b/libaom/aom_scale/generic/yv12extend.c
@@ -434,3 +434,28 @@ void aom_yv12_partial_coloc_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
   aom_yv12_partial_copy_v_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart,
                             vstart);
 }
+
+int aom_yv12_realloc_with_new_border_c(YV12_BUFFER_CONFIG *ybf, int new_border,
+                                       int byte_alignment, int num_planes) {
+  if (ybf) {
+    if (new_border == ybf->border) return 0;
+    YV12_BUFFER_CONFIG new_buf;
+    memset(&new_buf, 0, sizeof(new_buf));
+    const int error = aom_alloc_frame_buffer(
+        &new_buf, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x,
+        ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border,
+        byte_alignment);
+    if (error) return error;
+    // Copy image buffer
+    aom_yv12_copy_frame(ybf, &new_buf, num_planes);
+
+    // Extend up to new border
+    aom_extend_frame_borders(&new_buf, num_planes);
+
+    // Now free the old buffer and replace with the new
+    aom_free_frame_buffer(ybf);
+    memcpy(ybf, &new_buf, sizeof(new_buf));
+    return 0;
+  }
+  return -2;
+}
diff --git a/libaom/aom_scale/yv12config.h b/libaom/aom_scale/yv12config.h
index 10c6ad5..04a1c04 100644
--- a/libaom/aom_scale/yv12config.h
+++ b/libaom/aom_scale/yv12config.h
@@ -24,15 +24,10 @@ extern "C" {
 
 #define AOMINNERBORDERINPIXELS 160
 #define AOM_INTERP_EXTEND 4
-
-// TODO(jingning): Use unified inter predictor for encoder and
-// decoder during the development process. Revisit the frame border
-// to improve the decoder performance.
-#if CONFIG_REDUCED_ENCODER_BORDER
-#define AOM_BORDER_IN_PIXELS 160
-#else
 #define AOM_BORDER_IN_PIXELS 288
-#endif  // CONFIG_REDUCED_ENCODER_BORDER
+#define AOM_ENC_NO_SCALE_BORDER 160
+#define AOM_ENC_LOOKAHEAD_BORDER 64
+#define AOM_DEC_BORDER_IN_PIXELS 64
 
 typedef struct yv12_buffer_config {
   union {
@@ -102,7 +97,7 @@ typedef struct yv12_buffer_config {
   aom_color_primaries_t color_primaries;
   aom_transfer_characteristics_t transfer_characteristics;
   aom_matrix_coefficients_t matrix_coefficients;
-  int monochrome;
+  uint8_t monochrome;
   aom_chroma_sample_position_t chroma_sample_position;
   aom_color_range_t color_range;
   int render_width;
@@ -130,6 +125,14 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                              int border, int byte_alignment,
                              aom_codec_frame_buffer_t *fb,
                              aom_get_frame_buffer_cb_fn_t cb, void *cb_priv);
+
+int aom_realloc_lookahead_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+                                 int ss_x, int ss_y, int use_highbitdepth,
+                                 int border, int byte_alignment,
+                                 aom_codec_frame_buffer_t *fb,
+                                 aom_get_frame_buffer_cb_fn_t cb,
+                                 void *cb_priv);
+
 int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
 #ifdef __cplusplus
diff --git a/libaom/apps/aomdec.c b/libaom/apps/aomdec.c
index 58ac172..549c4da 100644
--- a/libaom/apps/aomdec.c
+++ b/libaom/apps/aomdec.c
@@ -484,6 +484,7 @@ static int main_loop(int argc, const char **argv_) {
   input.webm_ctx = &webm_ctx;
 #endif
   struct ObuDecInputContext obu_ctx = { NULL, NULL, 0, 0, 0 };
+  int is_ivf = 0;
 
   obu_ctx.avx_ctx = &aom_input_ctx;
   input.obu_ctx = &obu_ctx;
@@ -610,8 +611,10 @@ static int main_loop(int argc, const char **argv_) {
 #endif
   input.aom_input_ctx->filename = fn;
   input.aom_input_ctx->file = infile;
-  if (file_is_ivf(input.aom_input_ctx))
+  if (file_is_ivf(input.aom_input_ctx)) {
     input.aom_input_ctx->file_type = FILE_TYPE_IVF;
+    is_ivf = 1;
+  }
 #if CONFIG_WEBM_IO
   else if (file_is_webm(input.webm_ctx, input.aom_input_ctx))
     input.aom_input_ctx->file_type = FILE_TYPE_WEBM;
@@ -661,6 +664,10 @@ static int main_loop(int argc, const char **argv_) {
   }
 
   fourcc_interface = get_aom_decoder_by_fourcc(aom_input_ctx.fourcc);
+
+  if (is_ivf && !fourcc_interface)
+    fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc);
+
   if (interface && fourcc_interface && interface != fourcc_interface)
     warn("Header indicates codec: %s\n", fourcc_interface->name);
   else
@@ -844,7 +851,7 @@ static int main_loop(int argc, const char **argv_) {
         }
         // Default to codec bit depth if output bit depth not set
         unsigned int output_bit_depth;
-        if (!fixed_output_bit_depth && single_file && !do_md5) {
+        if (!fixed_output_bit_depth && single_file) {
           output_bit_depth = img->bit_depth;
         } else {
           output_bit_depth = fixed_output_bit_depth;
diff --git a/libaom/apps/aomenc.c b/libaom/apps/aomenc.c
index 4680d3a..08bf08d 100644
--- a/libaom/apps/aomenc.c
+++ b/libaom/apps/aomenc.c
@@ -144,16 +144,14 @@ static const arg_def_t pass_arg =
     ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
 static const arg_def_t fpf_name =
     ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
-#if CONFIG_FP_MB_STATS
-static const arg_def_t fpmbf_name =
-    ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name");
-#endif
 static const arg_def_t limit =
     ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames");
 static const arg_def_t skip =
     ARG_DEF(NULL, "skip", 1, "Skip the first n input frames");
 static const arg_def_t good_dl =
     ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline");
+static const arg_def_t rt_dl =
+    ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline");
 static const arg_def_t quietarg =
     ARG_DEF("q", "quiet", 0, "Do not print encode progress");
 static const arg_def_t verbosearg =
@@ -219,6 +217,7 @@ static const arg_def_t *main_args[] = { &help,
                                         &limit,
                                         &skip,
                                         &good_dl,
+                                        &rt_dl,
                                         &quietarg,
                                         &verbosearg,
                                         &psnrarg,
@@ -263,9 +262,9 @@ static const arg_def_t global_error_resilient =
             "Enable global error resiliency features");
 static const arg_def_t lag_in_frames =
     ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag");
-static const arg_def_t large_scale_tile =
-    ARG_DEF(NULL, "large-scale-tile", 1,
-            "Large scale tile coding (0: off (default), 1: on)");
+static const arg_def_t large_scale_tile = ARG_DEF(
+    NULL, "large-scale-tile", 1,
+    "Large scale tile coding (0: off (default), 1: on (ivf output only))");
 static const arg_def_t monochrome =
     ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)");
 static const arg_def_t full_still_picture_hdr = ARG_DEF(
@@ -415,7 +414,7 @@ static const arg_def_t cpu_used_av1 =
     ARG_DEF(NULL, "cpu-used", 1, "CPU Used (0..8)");
 static const arg_def_t rowmtarg =
     ARG_DEF(NULL, "row-mt", 1,
-            "Enable row based multi-threading (0: off (default), 1: on)");
+            "Enable row based multi-threading (0: off, 1: on (default))");
 static const arg_def_t tile_cols =
     ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
 static const arg_def_t tile_rows =
@@ -437,10 +436,121 @@ static const arg_def_t enable_restoration =
     ARG_DEF(NULL, "enable-restoration", 1,
             "Enable the loop restoration filter (0: false, "
             "1: true (default))");
+static const arg_def_t enable_rect_partitions =
+    ARG_DEF(NULL, "enable-rect-partitions", 1,
+            "Enable rectangular partitions "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_ab_partitions =
+    ARG_DEF(NULL, "enable-ab-partitions", 1,
+            "Enable ab partitions (0: false, 1: true (default))");
+static const arg_def_t enable_1to4_partitions =
+    ARG_DEF(NULL, "enable-1to4-partitions", 1,
+            "Enable 1:4 and 4:1 partitions "
+            "(0: false, 1: true (default))");
+static const arg_def_t min_partition_size =
+    ARG_DEF(NULL, "min-partition-size", 4,
+            "Set min partition size "
+            "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)");
+static const arg_def_t max_partition_size =
+    ARG_DEF(NULL, "max-partition-size", 128,
+            "Set max partition size "
+            "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)");
+static const arg_def_t enable_dual_filter =
+    ARG_DEF(NULL, "enable-dual-filter", 1,
+            "Enable dual filter "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_intra_edge_filter =
+    ARG_DEF(NULL, "enable-intra-edge-filter", 1,
+            "Enable intra edge filtering "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_order_hint =
+    ARG_DEF(NULL, "enable-order-hint", 1,
+            "Enable order hint "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_tx64 =
+    ARG_DEF(NULL, "enable-tx64", 1,
+            "Enable 64-pt transform (0: false, 1: true (default))");
+static const arg_def_t tx_size_search_method =
+    ARG_DEF(NULL, "tx-size-search-method", 0,
+            "Set transform block size search method "
+            "(0: Full RD (default), 1: Fast RD, 2: use largest allowed)");
+static const arg_def_t enable_flip_idtx =
+    ARG_DEF(NULL, "enable-flip-idtx", 1,
+            "Enable extended transform type (0: false, 1: true (default)) "
+            "including FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, "
+            "ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, "
+            "H_ADST, V_FLIPADST, H_FLIPADST");
+static const arg_def_t enable_dist_wtd_comp =
+    ARG_DEF(NULL, "enable-dist-wtd-comp", 1,
+            "Enable distance-weighted compound "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_masked_comp =
+    ARG_DEF(NULL, "enable-masked-comp", 1,
+            "Enable masked (wedge/diff-wtd) compound "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_onesided_comp =
+    ARG_DEF(NULL, "enable-onesided-comp", 1,
+            "Enable one sided compound "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_interintra_comp =
+    ARG_DEF(NULL, "enable-interintra-comp", 1,
+            "Enable interintra compound "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_smooth_interintra =
+    ARG_DEF(NULL, "enable-smooth-interintra", 1,
+            "Enable smooth interintra mode "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_diff_wtd_comp =
+    ARG_DEF(NULL, "enable-diff-wtd-comp", 1,
+            "Enable difference-weighted compound "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_interinter_wedge =
+    ARG_DEF(NULL, "enable-interinter-wedge", 1,
+            "Enable interinter wedge compound "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_interintra_wedge =
+    ARG_DEF(NULL, "enable-interintra-wedge", 1,
+            "Enable interintra wedge compound "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_global_motion =
+    ARG_DEF(NULL, "enable-global-motion", 1,
+            "Enable global motion "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_warped_motion =
+    ARG_DEF(NULL, "enable-warped-motion", 1,
+            "Enable local warped motion "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_filter_intra =
+    ARG_DEF(NULL, "enable-filter-intra", 1,
+            "Enable filter intra prediction mode "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_smooth_intra =
+    ARG_DEF(NULL, "enable-smooth-intra", 1,
+            "Enable smooth intra prediction modes "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_paeth_intra =
+    ARG_DEF(NULL, "enable-paeth-intra", 1,
+            "Enable Paeth intra prediction mode (0: false, 1: true (default))");
+static const arg_def_t enable_cfl_intra =
+    ARG_DEF(NULL, "enable-cfl-intra", 1,
+            "Enable chroma from luma intra prediction mode "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_obmc = ARG_DEF(
+    NULL, "enable-obmc", 1, "Enable OBMC (0: false, 1: true (default))");
+static const arg_def_t enable_palette =
+    ARG_DEF(NULL, "enable-palette", 1,
+            "Enable palette prediction mode (0: false, 1: true (default))");
+static const arg_def_t enable_intrabc =
+    ARG_DEF(NULL, "enable-intrabc", 1,
+            "Enable intra block copy prediction mode "
+            "(0: false, 1: true (default))");
+static const arg_def_t enable_angle_delta =
+    ARG_DEF(NULL, "enable-angle-delta", 1,
+            "Enable intra angle delta (0: false, 1: true (default))");
 static const arg_def_t disable_trellis_quant =
     ARG_DEF(NULL, "disable-trellis-quant", 1,
             "Disable trellis optimization of quantized coefficients (0: false ("
-            "default) 1: true)");
+            "default) 1: true  2: partial true)");
 static const arg_def_t enable_qm =
     ARG_DEF(NULL, "enable-qm", 1,
             "Enable quantisation matrices (0: false (default), 1: true)");
@@ -448,6 +558,25 @@ static const arg_def_t qm_min = ARG_DEF(
     NULL, "qm-min", 1, "Min quant matrix flatness (0..15), default is 8");
 static const arg_def_t qm_max = ARG_DEF(
     NULL, "qm-max", 1, "Max quant matrix flatness (0..15), default is 15");
+static const arg_def_t reduced_tx_type_set = ARG_DEF(
+    NULL, "reduced-tx-type-set", 1, "Use reduced set of transform types");
+static const arg_def_t use_intra_dct_only =
+    ARG_DEF(NULL, "use-intra-dct-only", 1, "Use DCT only for INTRA modes");
+static const arg_def_t use_inter_dct_only =
+    ARG_DEF(NULL, "use-inter-dct-only", 1, "Use DCT only for INTER modes");
+static const arg_def_t use_intra_default_tx_only =
+    ARG_DEF(NULL, "use-intra-default-tx-only", 1,
+            "Use Default-transform only for INTRA modes");
+static const arg_def_t quant_b_adapt =
+    ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b");
+static const arg_def_t coeff_cost_upd_freq =
+    ARG_DEF(NULL, "coeff-cost-upd-freq", 1,
+            "Update freq for coeff costs"
+            "0: SB, 1: SB Row per Tile, 2: Tile");
+static const arg_def_t mode_cost_upd_freq =
+    ARG_DEF(NULL, "mode-cost-upd-freq", 1,
+            "Update freq for mode costs"
+            "0: SB, 1: SB Row per Tile, 2: Tile");
 #if CONFIG_DIST_8X8
 static const arg_def_t enable_dist_8x8 =
     ARG_DEF(NULL, "enable-dist-8x8", 1,
@@ -515,6 +644,25 @@ static const arg_def_t min_gf_interval = ARG_DEF(
 static const arg_def_t max_gf_interval = ARG_DEF(
     NULL, "max-gf-interval", 1,
     "max gf/arf frame interval (default 0, indicating in-built behavior)");
+static const arg_def_t gf_max_pyr_height =
+    ARG_DEF(NULL, "gf-max-pyr-height", 1,
+            "maximum height for GF group pyramid structure (0 to 4 (default))");
+static const arg_def_t max_reference_frames = ARG_DEF(
+    NULL, "max-reference-frames", 1,
+    "maximum number of reference frames allowed per frame (3 to 7 (default))");
+static const arg_def_t reduced_reference_set =
+    ARG_DEF(NULL, "reduced-reference-set", 1,
+            "Use reduced set of single and compound references (0: off "
+            "(default), 1: on)");
+static const arg_def_t target_seq_level_idx =
+    ARG_DEF(NULL, "target-seq-level-idx", 1,
+            "Target sequence level index. "
+            "Possible values are in the form of \"ABxy\"(pad leading zeros if "
+            "less than 4 digits). "
+            "AB: Operating point(OP) index; "
+            "xy: Target level index for the OP. "
+            "E.g. \"0\" means target level index 0 for the 0th OP; "
+            "\"1021\" means target level index 21 for the 10th OP.");
 
 static const struct arg_enum_list color_primaries_enum[] = {
   { "bt709", AOM_CICP_CP_BT_709 },
@@ -620,6 +768,12 @@ static const struct arg_enum_list superblock_size_enum[] = {
 static const arg_def_t superblock_size = ARG_DEF_ENUM(
     NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum);
 
+static const arg_def_t set_tier_mask =
+    ARG_DEF(NULL, "set-tier-mask", 1,
+            "Set bit mask to specify which tier each of the 32 possible "
+            "operating points conforms to. "
+            "Bit value 0(defualt): Main Tier; 1: High Tier.");
+
 static const arg_def_t *av1_args[] = { &cpu_used_av1,
                                        &auto_altref,
                                        &sharpness,
@@ -638,10 +792,46 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
                                        &lossless,
                                        &enable_cdef,
                                        &enable_restoration,
+                                       &enable_rect_partitions,
+                                       &enable_ab_partitions,
+                                       &enable_1to4_partitions,
+                                       &min_partition_size,
+                                       &max_partition_size,
+                                       &enable_dual_filter,
+                                       &enable_intra_edge_filter,
+                                       &enable_order_hint,
+                                       &enable_tx64,
+                                       &tx_size_search_method,
+                                       &enable_flip_idtx,
+                                       &enable_dist_wtd_comp,
+                                       &enable_masked_comp,
+                                       &enable_onesided_comp,
+                                       &enable_interintra_comp,
+                                       &enable_smooth_interintra,
+                                       &enable_diff_wtd_comp,
+                                       &enable_interinter_wedge,
+                                       &enable_interintra_wedge,
+                                       &enable_global_motion,
+                                       &enable_warped_motion,
+                                       &enable_filter_intra,
+                                       &enable_smooth_intra,
+                                       &enable_paeth_intra,
+                                       &enable_cfl_intra,
+                                       &enable_obmc,
+                                       &enable_palette,
+                                       &enable_intrabc,
+                                       &enable_angle_delta,
                                        &disable_trellis_quant,
                                        &enable_qm,
                                        &qm_min,
                                        &qm_max,
+                                       &reduced_tx_type_set,
+                                       &use_intra_dct_only,
+                                       &use_inter_dct_only,
+                                       &use_intra_default_tx_only,
+                                       &quant_b_adapt,
+                                       &coeff_cost_upd_freq,
+                                       &mode_cost_upd_freq,
 #if CONFIG_DIST_8X8
                                        &enable_dist_8x8,
 #endif
@@ -659,6 +849,7 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
                                        &input_chroma_sample_position,
                                        &min_gf_interval,
                                        &max_gf_interval,
+                                       &gf_max_pyr_height,
                                        &superblock_size,
                                        &num_tg,
                                        &mtu_size,
@@ -668,8 +859,12 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
 #if CONFIG_DENOISE
                                        &denoise_noise_level,
                                        &denoise_block_size,
-#endif
+#endif  // CONFIG_DENOISE
+                                       &max_reference_frames,
+                                       &reduced_reference_set,
                                        &enable_ref_frame_mvs,
+                                       &target_seq_level_idx,
+                                       &set_tier_mask,
                                        &bitdeptharg,
                                        &inbitdeptharg,
                                        &input_chroma_subsampling_x,
@@ -696,10 +891,46 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                         AV1E_SET_LOSSLESS,
                                         AV1E_SET_ENABLE_CDEF,
                                         AV1E_SET_ENABLE_RESTORATION,
+                                        AV1E_SET_ENABLE_RECT_PARTITIONS,
+                                        AV1E_SET_ENABLE_AB_PARTITIONS,
+                                        AV1E_SET_ENABLE_1TO4_PARTITIONS,
+                                        AV1E_SET_MIN_PARTITION_SIZE,
+                                        AV1E_SET_MAX_PARTITION_SIZE,
+                                        AV1E_SET_ENABLE_DUAL_FILTER,
+                                        AV1E_SET_ENABLE_INTRA_EDGE_FILTER,
+                                        AV1E_SET_ENABLE_ORDER_HINT,
+                                        AV1E_SET_ENABLE_TX64,
+                                        AV1E_SET_TX_SIZE_SEARCH_METHOD,
+                                        AV1E_SET_ENABLE_FLIP_IDTX,
+                                        AV1E_SET_ENABLE_DIST_WTD_COMP,
+                                        AV1E_SET_ENABLE_MASKED_COMP,
+                                        AV1E_SET_ENABLE_ONESIDED_COMP,
+                                        AV1E_SET_ENABLE_INTERINTRA_COMP,
+                                        AV1E_SET_ENABLE_SMOOTH_INTERINTRA,
+                                        AV1E_SET_ENABLE_DIFF_WTD_COMP,
+                                        AV1E_SET_ENABLE_INTERINTER_WEDGE,
+                                        AV1E_SET_ENABLE_INTERINTRA_WEDGE,
+                                        AV1E_SET_ENABLE_GLOBAL_MOTION,
+                                        AV1E_SET_ENABLE_WARPED_MOTION,
+                                        AV1E_SET_ENABLE_FILTER_INTRA,
+                                        AV1E_SET_ENABLE_SMOOTH_INTRA,
+                                        AV1E_SET_ENABLE_PAETH_INTRA,
+                                        AV1E_SET_ENABLE_CFL_INTRA,
+                                        AV1E_SET_ENABLE_OBMC,
+                                        AV1E_SET_ENABLE_PALETTE,
+                                        AV1E_SET_ENABLE_INTRABC,
+                                        AV1E_SET_ENABLE_ANGLE_DELTA,
                                         AV1E_SET_DISABLE_TRELLIS_QUANT,
                                         AV1E_SET_ENABLE_QM,
                                         AV1E_SET_QM_MIN,
                                         AV1E_SET_QM_MAX,
+                                        AV1E_SET_REDUCED_TX_TYPE_SET,
+                                        AV1E_SET_INTRA_DCT_ONLY,
+                                        AV1E_SET_INTER_DCT_ONLY,
+                                        AV1E_SET_INTRA_DEFAULT_TX_ONLY,
+                                        AV1E_SET_QUANT_B_ADAPT,
+                                        AV1E_SET_COEFF_COST_UPD_FREQ,
+                                        AV1E_SET_MODE_COST_UPD_FREQ,
 #if CONFIG_DIST_8X8
                                         AV1E_SET_ENABLE_DIST_8X8,
 #endif
@@ -717,6 +948,7 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                         AV1E_SET_CHROMA_SAMPLE_POSITION,
                                         AV1E_SET_MIN_GF_INTERVAL,
                                         AV1E_SET_MAX_GF_INTERVAL,
+                                        AV1E_SET_GF_MAX_PYRAMID_HEIGHT,
                                         AV1E_SET_SUPERBLOCK_SIZE,
                                         AV1E_SET_NUM_TG,
                                         AV1E_SET_MTU,
@@ -726,12 +958,12 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
 #if CONFIG_DENOISE
                                         AV1E_SET_DENOISE_NOISE_LEVEL,
                                         AV1E_SET_DENOISE_BLOCK_SIZE,
-#endif
+#endif  // CONFIG_DENOISE
+                                        AV1E_SET_MAX_REFERENCE_FRAMES,
+                                        AV1E_SET_REDUCED_REFERENCE_SET,
                                         AV1E_SET_ENABLE_REF_FRAME_MVS,
-                                        AV1E_SET_ENABLE_DF,
-                                        AV1E_SET_ENABLE_ORDER_HINT,
-                                        AV1E_SET_ENABLE_JNT_COMP,
-                                        AV1E_SET_ENABLE_SUPERRES,
+                                        AV1E_SET_TARGET_SEQ_LEVEL_IDX,
+                                        AV1E_SET_TIER_MASK,
                                         0 };
 #endif  // CONFIG_AV1_ENCODER
 
@@ -798,9 +1030,6 @@ struct stream_config {
   struct aom_codec_enc_cfg cfg;
   const char *out_fn;
   const char *stats_fn;
-#if CONFIG_FP_MB_STATS
-  const char *fpmb_stats_fn;
-#endif
   stereo_format_t stereo_fmt;
   int arg_ctrls[ARG_CTRL_CNT_MAX][2];
   int arg_ctrl_cnt;
@@ -828,9 +1057,6 @@ struct stream_state {
   uint64_t cx_time;
   size_t nbytes;
   stats_io_t stats;
-#if CONFIG_FP_MB_STATS
-  stats_io_t fpmb_stats;
-#endif
   struct aom_image *img;
   aom_codec_ctx_t decoder;
   int mismatch_seen;
@@ -916,7 +1142,9 @@ static void parse_global_config(struct AvxEncoderConfig *global, int argc,
     } else if (arg_match(&arg, &usage, argi))
       global->usage = arg_parse_uint(&arg);
     else if (arg_match(&arg, &good_dl, argi))
-      warn("Deprecated --good option! Ignoring\n");
+      global->usage = AOM_USAGE_GOOD_QUALITY;  // Good quality usage
+    else if (arg_match(&arg, &rt_dl, argi))
+      global->usage = AOM_USAGE_REALTIME;  // Real-time usage
     else if (arg_match(&arg, &use_yv12, argi))
       global->color_type = YV12;
     else if (arg_match(&arg, &use_i420, argi))
@@ -969,11 +1197,19 @@ static void parse_global_config(struct AvxEncoderConfig *global, int argc,
     // Make default AV1 passes = 2 until there is a better quality 1-pass
     // encoder
     if (global->codec != NULL && global->codec->name != NULL)
-      global->passes = (strcmp(global->codec->name, "av1") == 0) ? 2 : 1;
+      global->passes = (strcmp(global->codec->name, "av1") == 0 &&
+                        global->usage != AOM_USAGE_REALTIME)
+                           ? 2
+                           : 1;
 #else
     global->passes = 1;
 #endif
   }
+
+  if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) {
+    warn("Enforcing one-pass encoding in realtime mode\n");
+    global->passes = 1;
+  }
 }
 
 static void open_input_file(struct AvxInputContext *input,
@@ -1090,6 +1326,17 @@ static void set_config_arg_ctrls(struct stream_config *config, int key,
     return;
   }
 
+  // For target level, the settings should accumulate rather than overwrite,
+  // so we simply append it.
+  if (key == AV1E_SET_TARGET_SEQ_LEVEL_IDX) {
+    j = config->arg_ctrl_cnt;
+    assert(j < (int)ARG_CTRL_CNT_MAX);
+    config->arg_ctrls[j][0] = key;
+    config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg);
+    ++config->arg_ctrl_cnt;
+    return;
+  }
+
   /* Point either to the next free element or the first instance of this
    * control.
    */
@@ -1159,10 +1406,6 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
       }
     } else if (arg_match(&arg, &fpf_name, argi)) {
       config->stats_fn = arg.val;
-#if CONFIG_FP_MB_STATS
-    } else if (arg_match(&arg, &fpmbf_name, argi)) {
-      config->fpmb_stats_fn = arg.val;
-#endif
     } else if (arg_match(&arg, &use_webm, argi)) {
 #if CONFIG_WEBM_IO
       config->write_webm = 1;
@@ -1207,8 +1450,15 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
       config->cfg.g_error_resilient = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &lag_in_frames, argi)) {
       config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
+      if (global->usage == AOM_USAGE_REALTIME &&
+          config->cfg.rc_end_usage == AOM_CBR &&
+          config->cfg.g_lag_in_frames != 0) {
+        warn("non-zero %s option ignored in realtime CBR mode.\n", arg.name);
+        config->cfg.g_lag_in_frames = 0;
+      }
     } else if (arg_match(&arg, &large_scale_tile, argi)) {
       config->cfg.large_scale_tile = arg_parse_uint(&arg);
+      if (config->cfg.large_scale_tile) global->codec = get_aom_lst_encoder();
     } else if (arg_match(&arg, &monochrome, argi)) {
       config->cfg.monochrome = 1;
     } else if (arg_match(&arg, &full_still_picture_hdr, argi)) {
@@ -1349,17 +1599,6 @@ static void validate_stream_config(const struct stream_state *stream,
         fatal("Stream %d: duplicate stats file (from stream %d)",
               streami->index, stream->index);
     }
-
-#if CONFIG_FP_MB_STATS
-    /* Check for two streams sharing a mb stats file. */
-    if (streami != stream) {
-      const char *a = stream->config.fpmb_stats_fn;
-      const char *b = streami->config.fpmb_stats_fn;
-      if (a && b && !strcmp(a, b))
-        fatal("Stream %d: duplicate mb stats file (from stream %d)",
-              streami->index, stream->index);
-    }
-#endif
   }
 }
 
@@ -1524,26 +1763,11 @@ static void setup_pass(struct stream_state *stream,
       fatal("Failed to open statistics store");
   }
 
-#if CONFIG_FP_MB_STATS
-  if (stream->config.fpmb_stats_fn) {
-    if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn,
-                         pass))
-      fatal("Failed to open mb statistics store");
-  } else {
-    if (!stats_open_mem(&stream->fpmb_stats, pass))
-      fatal("Failed to open mb statistics store");
-  }
-#endif
-
   stream->config.cfg.g_pass = global->passes == 2
                                   ? pass ? AOM_RC_LAST_PASS : AOM_RC_FIRST_PASS
                                   : AOM_RC_ONE_PASS;
   if (pass) {
     stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats);
-#if CONFIG_FP_MB_STATS
-    stream->config.cfg.rc_firstpass_mb_stats_in =
-        stats_get(&stream->fpmb_stats);
-#endif
   }
 
   stream->cx_time = 0;
@@ -1772,13 +1996,6 @@ static void get_cx_data(struct stream_state *stream,
                     pkt->data.twopass_stats.sz);
         stream->nbytes += pkt->data.raw.sz;
         break;
-#if CONFIG_FP_MB_STATS
-      case AOM_CODEC_FPMB_STATS_PKT:
-        stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf,
-                    pkt->data.firstpass_mb_stats.sz);
-        stream->nbytes += pkt->data.raw.sz;
-        break;
-#endif
       case AOM_CODEC_PSNR_PKT:
 
         if (global->show_psnr) {
@@ -1966,6 +2183,10 @@ int main(int argc, const char **argv_) {
   FOREACH_STREAM(stream, streams) {
     check_encoder_config(global.disable_warning_prompt, &global,
                          &stream->config.cfg);
+
+    // If large_scale_tile = 1, only support to output to ivf format.
+    if (stream->config.cfg.large_scale_tile && !stream->config.write_ivf)
+      die("only support ivf output format while large-scale-tile=1\n");
   }
 
   /* Handle non-option arguments */
@@ -2371,12 +2592,6 @@ int main(int argc, const char **argv_) {
       stats_close(&stream->stats, global.passes - 1);
     }
 
-#if CONFIG_FP_MB_STATS
-    FOREACH_STREAM(stream, streams) {
-      stats_close(&stream->fpmb_stats, global.passes - 1);
-    }
-#endif
-
     if (global.pass) break;
   }
 
diff --git a/libaom/av1/av1.cmake b/libaom/av1/av1.cmake
index 8c92615..fb9678a 100644
--- a/libaom/av1/av1.cmake
+++ b/libaom/av1/av1.cmake
@@ -137,6 +137,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/encodemb.h"
             "${AOM_ROOT}/av1/encoder/encodemv.c"
             "${AOM_ROOT}/av1/encoder/encodemv.h"
+            "${AOM_ROOT}/av1/encoder/encode_strategy.c"
+            "${AOM_ROOT}/av1/encoder/encode_strategy.h"
             "${AOM_ROOT}/av1/encoder/encoder.c"
             "${AOM_ROOT}/av1/encoder/encoder.h"
             "${AOM_ROOT}/av1/encoder/encodetxb.c"
@@ -149,6 +151,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/firstpass.h"
             "${AOM_ROOT}/av1/encoder/global_motion.c"
             "${AOM_ROOT}/av1/encoder/global_motion.h"
+            "${AOM_ROOT}/av1/encoder/gop_structure.c"
+            "${AOM_ROOT}/av1/encoder/gop_structure.h"
             "${AOM_ROOT}/av1/encoder/grain_test_vectors.h"
             "${AOM_ROOT}/av1/encoder/hash.c"
             "${AOM_ROOT}/av1/encoder/hash.h"
@@ -156,6 +160,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/hash_motion.h"
             "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
             "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
+            "${AOM_ROOT}/av1/encoder/level.c"
+            "${AOM_ROOT}/av1/encoder/level.h"
             "${AOM_ROOT}/av1/encoder/lookahead.c"
             "${AOM_ROOT}/av1/encoder/lookahead.h"
             "${AOM_ROOT}/av1/encoder/mbgraph.c"
@@ -166,6 +172,10 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/ml.h"
             "${AOM_ROOT}/av1/encoder/palette.c"
             "${AOM_ROOT}/av1/encoder/palette.h"
+            "${AOM_ROOT}/av1/encoder/partition_strategy.h"
+            "${AOM_ROOT}/av1/encoder/partition_strategy.c"
+            "${AOM_ROOT}/av1/encoder/pass2_strategy.h"
+            "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
             "${AOM_ROOT}/av1/encoder/pickcdef.c"
             "${AOM_ROOT}/av1/encoder/picklpf.c"
             "${AOM_ROOT}/av1/encoder/picklpf.h"
@@ -189,7 +199,11 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/temporal_filter.h"
             "${AOM_ROOT}/av1/encoder/tokenize.c"
             "${AOM_ROOT}/av1/encoder/tokenize.h"
+            "${AOM_ROOT}/av1/encoder/tpl_model.c"
+            "${AOM_ROOT}/av1/encoder/tpl_model.h"
             "${AOM_ROOT}/av1/encoder/wedge_utils.c"
+            "${AOM_ROOT}/av1/encoder/var_based_part.c"
+            "${AOM_ROOT}/av1/encoder/var_based_part.h"
             "${AOM_ROOT}/third_party/fastfeat/fast.c"
             "${AOM_ROOT}/third_party/fastfeat/fast.h"
             "${AOM_ROOT}/third_party/fastfeat/fast_9.c"
@@ -253,8 +267,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
             "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c")
 
 list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm"
-            "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm"
-            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_apply_sse2.asm")
+            "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c"
@@ -277,14 +290,20 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c"
+            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_constants.h"
+            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c"
+            "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/corner_match_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c"
@@ -340,15 +359,7 @@ endif()
 function(setup_av1_targets)
   add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES})
   list(APPEND AOM_LIB_TARGETS aom_av1_common)
-
-  create_dummy_source_file("aom_av1" "c" "dummy_source_file")
-  add_library(aom_av1 OBJECT "${dummy_source_file}")
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
-  list(APPEND AOM_LIB_TARGETS aom_av1)
-
-  # Not all generators support libraries consisting only of object files. Add a
-  # dummy source file to the aom_av1 target.
-  add_dummy_source_file_to_target("aom_av1" "c")
 
   if(CONFIG_AV1_DECODER)
     add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES})
@@ -446,13 +457,13 @@ function(setup_av1_targets)
 
   if(HAVE_NEON)
     if(AOM_AV1_COMMON_INTRIN_NEON)
-      add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon"
+      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
                                     "aom_av1_common"
                                     "AOM_AV1_COMMON_INTRIN_NEON" "aom")
     endif()
 
     if(AOM_AV1_ENCODER_INTRIN_NEON)
-      add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon"
+      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
                                     "aom_av1_encoder"
                                     "AOM_AV1_ENCODER_INTRIN_NEON" "aom")
     endif()
@@ -470,13 +481,7 @@ function(setup_av1_targets)
                                   "AOM_AV1_ENCODER_INTRIN_MSA" "aom")
   endif()
 
-  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
-  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
-
   # Pass the new lib targets up to the parent scope instance of
   # $AOM_LIB_TARGETS.
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
 endfunction()
-
-function(setup_av1_test_targets)
-endfunction()
diff --git a/libaom/av1/av1_cx_iface.c b/libaom/av1/av1_cx_iface.c
index 43a6028..e8cd508 100644
--- a/libaom/av1/av1_cx_iface.c
+++ b/libaom/av1/av1_cx_iface.c
@@ -26,10 +26,6 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/firstpass.h"
 
-#if CONFIG_REDUCED_ENCODER_BORDER
-#include "common/tools_common.h"
-#endif  // CONFIG_REDUCED_ENCODER_BORDER
-
 #define MAG_SIZE (4)
 #define MAX_NUM_ENHANCEMENT_LAYERS 3
 
@@ -48,6 +44,7 @@ struct av1_extracfg {
   unsigned int arnr_strength;
   unsigned int min_gf_interval;
   unsigned int max_gf_interval;
+  unsigned int gf_max_pyr_height;
   aom_tune_metric tuning;
   unsigned int cq_level;  // constrained quality level
   unsigned int rc_max_intra_bitrate_pct;
@@ -56,6 +53,7 @@ struct av1_extracfg {
   unsigned int lossless;
   unsigned int enable_cdef;
   unsigned int enable_restoration;
+  unsigned int enable_obmc;
   unsigned int disable_trellis_quant;
   unsigned int enable_qm;
   unsigned int qm_y;
@@ -71,7 +69,7 @@ struct av1_extracfg {
 
   aom_timing_info_type_t timing_info_type;
   unsigned int frame_parallel_decoding_mode;
-  int use_dual_filter;
+  int enable_dual_filter;
   AQ_MODE aq_mode;
   DELTAQ_MODE deltaq_mode;
   unsigned int frame_periodic_boost;
@@ -93,13 +91,39 @@ struct av1_extracfg {
   const char *film_grain_table_filename;
   unsigned int motion_vector_unit_test;
   unsigned int cdf_update_mode;
-  int enable_order_hint;
-  int enable_jnt_comp;
-  int enable_ref_frame_mvs;  // sequence level
-  int allow_ref_frame_mvs;   // frame level
-  int enable_warped_motion;  // sequence level
-  int allow_warped_motion;   // frame level
+  int enable_rect_partitions;    // enable rectangular partitions for sequence
+  int enable_ab_partitions;      // enable AB partitions for sequence
+  int enable_1to4_partitions;    // enable 1:4 and 4:1 partitions for sequence
+  int min_partition_size;        // min partition size [4,8,16,32,64,128]
+  int max_partition_size;        // max partition size [4,8,16,32,64,128]
+  int enable_intra_edge_filter;  // enable intra-edge filter for sequence
+  int enable_order_hint;         // enable order hint for sequence
+  int enable_tx64;               // enable 64-pt transform usage for sequence
+  int tx_size_search_method;     // set transform block size search method
+  int enable_flip_idtx;          // enable flip and identity transform types
+  int enable_dist_wtd_comp;      // enable dist wtd compound for sequence
+  int max_reference_frames;      // maximum number of references per frame
+  int enable_reduced_reference_set;  // enable reduced set of references
+  int enable_ref_frame_mvs;          // sequence level
+  int allow_ref_frame_mvs;           // frame level
+  int enable_masked_comp;            // enable masked compound for sequence
+  int enable_onesided_comp;          // enable one sided compound for sequence
+  int enable_interintra_comp;        // enable interintra compound for sequence
+  int enable_smooth_interintra;      // enable smooth interintra mode usage
+  int enable_diff_wtd_comp;          // enable diff-wtd compound usage
+  int enable_interinter_wedge;       // enable interinter-wedge compound usage
+  int enable_interintra_wedge;       // enable interintra-wedge compound usage
+  int enable_global_motion;          // enable global motion usage for sequence
+  int enable_warped_motion;          // sequence level
+  int allow_warped_motion;           // frame level
+  int enable_filter_intra;           // enable filter intra for sequence
+  int enable_smooth_intra;           // enable smooth intra modes for sequence
+  int enable_paeth_intra;            // enable Paeth intra mode for sequence
+  int enable_cfl_intra;              // enable CFL uv intra mode for sequence
   int enable_superres;
+  int enable_palette;
+  int enable_intrabc;
+  int enable_angle_delta;
 #if CONFIG_DENOISE
   float noise_level;
   int noise_block_size;
@@ -107,6 +131,17 @@ struct av1_extracfg {
 
   unsigned int chroma_subsampling_x;
   unsigned int chroma_subsampling_y;
+  int reduced_tx_type_set;
+  int use_intra_dct_only;
+  int use_inter_dct_only;
+  int use_intra_default_tx_only;
+  int quant_b_adapt;
+  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+  // Bit mask to specify which tier each of the 32 possible operating points
+  // conforms to.
+  unsigned int tier_mask;
+  COST_UPDATE_TYPE coeff_cost_upd_freq;
+  COST_UPDATE_TYPE mode_cost_upd_freq;
 };
 
 static struct av1_extracfg default_extra_cfg = {
@@ -116,7 +151,7 @@ static struct av1_extracfg default_extra_cfg = {
   0,                       // noise_sensitivity
   CONFIG_SHARP_SETTINGS,   // sharpness
   0,                       // static_thresh
-  0,                       // row_mt
+  1,                       // row_mt
   0,                       // tile_columns
   0,                       // tile_rows
   0,                       // enable_tpl_model
@@ -124,6 +159,7 @@ static struct av1_extracfg default_extra_cfg = {
   5,                       // arnr_strength
   0,                       // min_gf_interval; 0 -> default decision
   0,                       // max_gf_interval; 0 -> default decision
+  4,                       // gf_max_pyr_height
   AOM_TUNE_PSNR,           // tuning
   10,                      // cq_level
   0,                       // rc_max_intra_bitrate_pct
@@ -132,6 +168,7 @@ static struct av1_extracfg default_extra_cfg = {
   0,                       // lossless
   !CONFIG_SHARP_SETTINGS,  // enable_cdef
   1,                       // enable_restoration
+  1,                       // enable_obmc
   0,                       // disable_trellis_quant
   0,                       // enable_qm
   DEFAULT_QM_Y,            // qm_y
@@ -145,7 +182,7 @@ static struct av1_extracfg default_extra_cfg = {
   1,                            // max number of tile groups
   0,                            // mtu_size
   AOM_TIMING_UNSPECIFIED,       // No picture timing signaling in bitstream
-  1,                            // frame_parallel_decoding_mode
+  0,                            // frame_parallel_decoding_mode
   1,                            // enable dual filter
   NO_AQ,                        // aq_mode
   NO_DELTA_Q,                   // deltaq_mode
@@ -167,19 +204,57 @@ static struct av1_extracfg default_extra_cfg = {
   0,                            // film_grain_table_filename
   0,                            // motion_vector_unit_test
   1,                            // CDF update mode
+  1,                            // enable rectangular partitions
+  1,                            // enable ab shape partitions
+  1,                            // enable 1:4 and 4:1 partitions
+  4,                            // min_partition_size
+  128,                          // max_partition_size
+  1,                            // enable intra edge filter
   1,                            // frame order hint
-  1,                            // jnt_comp
+  1,                            // enable 64-pt transform usage
+  0,                            // transform block size search method
+  1,                            // enable flip and identity transform
+  1,                            // dist-wtd compound
+  7,                            // max_reference_frames
+  0,                            // enable_reduced_reference_set
   1,                            // enable_ref_frame_mvs sequence level
   1,                            // allow ref_frame_mvs frame level
+  1,                            // enable masked compound at sequence level
+  1,                            // enable one sided compound at sequence level
+  1,                            // enable interintra compound at sequence level
+  1,                            // enable smooth interintra mode
+  1,                            // enable difference-weighted compound
+  1,                            // enable interinter wedge compound
+  1,                            // enable interintra wedge compound
+  1,                            // enable_global_motion usage
   1,                            // enable_warped_motion at sequence level
   1,                            // allow_warped_motion at frame level
+  1,                            // enable filter intra at sequence level
+  1,                            // enable smooth intra modes usage for sequence
+  1,                            // enable Paeth intra mode usage for sequence
+  1,                            // enable CFL uv intra mode usage for sequence
   1,                            // superres
+  1,                            // enable palette
+  !CONFIG_SHARP_SETTINGS,       // enable intrabc
+  1,                            // enable angle delta
 #if CONFIG_DENOISE
   0,   // noise_level
   32,  // noise_block_size
 #endif
   0,  // chroma_subsampling_x
   0,  // chroma_subsampling_y
+  0,  // reduced_tx_type_set
+  0,  // use_intra_dct_only
+  0,  // use_inter_dct_only
+  0,  // use_intra_default_tx_only
+  0,  // quant_b_adapt
+  {
+      31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+      31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+  },            // target_seq_level_idx
+  0,            // tier_mask
+  COST_UPD_SB,  // coeff_cost_upd_freq
+  COST_UPD_SB,  // mode_cost_upd_freq
 };
 
 struct aom_codec_alg_priv {
@@ -251,6 +326,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
   RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1);
   RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTAQ_MODE_COUNT - 1);
   RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
+  RANGE_CHECK_HI(cfg, g_usage, 1);
   RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
   RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
   RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
@@ -266,6 +342,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
     RANGE_CHECK(extra_cfg, max_gf_interval, MAX(2, extra_cfg->min_gf_interval),
                 (MAX_LAG_BUFFERS - 1));
   }
+  RANGE_CHECK_HI(extra_cfg, gf_max_pyr_height, 4);
 
   RANGE_CHECK_HI(cfg, rc_resize_mode, RESIZE_MODES - 1);
   RANGE_CHECK(cfg, rc_resize_denominator, SCALE_NUMERATOR,
@@ -382,9 +459,26 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
 #endif
   }
 
+  RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7);
+  RANGE_CHECK(extra_cfg, enable_reduced_reference_set, 0, 1);
   RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1);
   RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1);
 
+  RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3);
+  RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 2);
+  RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 2);
+
+  RANGE_CHECK(extra_cfg, min_partition_size, 4, 128);
+  RANGE_CHECK(extra_cfg, max_partition_size, 4, 128);
+  RANGE_CHECK_HI(extra_cfg, min_partition_size, extra_cfg->max_partition_size);
+
+  RANGE_CHECK(extra_cfg, tx_size_search_method, 0, 2);
+
+  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+    if (!is_valid_seq_level_idx(extra_cfg->target_seq_level_idx[i]))
+      ERROR("Target sequence level index is invalid");
+  }
+
   return AOM_CODEC_OK;
 }
 
@@ -452,6 +546,7 @@ static aom_codec_err_t set_encoder_config(
   oxcf->profile = cfg->g_profile;
   oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled;
   oxcf->max_threads = (int)cfg->g_threads;
+  oxcf->mode = (cfg->g_usage == 1) ? REALTIME : GOOD;
   oxcf->width = cfg->g_w;
   oxcf->height = cfg->g_h;
   oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width;
@@ -494,7 +589,6 @@ static aom_codec_err_t set_encoder_config(
     oxcf->init_framerate = 30;
     oxcf->timing_info_present = 0;
   }
-  oxcf->mode = GOOD;
   oxcf->cfg = &cfg->cfg;
 
   switch (cfg->g_pass) {
@@ -522,6 +616,10 @@ static aom_codec_err_t set_encoder_config(
 
   oxcf->enable_cdef = extra_cfg->enable_cdef;
   oxcf->enable_restoration = extra_cfg->enable_restoration;
+  oxcf->enable_obmc = extra_cfg->enable_obmc;
+  oxcf->enable_palette = extra_cfg->enable_palette;
+  oxcf->enable_intrabc = extra_cfg->enable_intrabc;
+  oxcf->enable_angle_delta = extra_cfg->enable_angle_delta;
   oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant;
   oxcf->using_qm = extra_cfg->enable_qm;
   oxcf->qm_y = extra_cfg->qm_y;
@@ -529,6 +627,13 @@ static aom_codec_err_t set_encoder_config(
   oxcf->qm_v = extra_cfg->qm_v;
   oxcf->qm_minlevel = extra_cfg->qm_min;
   oxcf->qm_maxlevel = extra_cfg->qm_max;
+  oxcf->reduced_tx_type_set = extra_cfg->reduced_tx_type_set;
+  oxcf->use_intra_dct_only = extra_cfg->use_intra_dct_only;
+  oxcf->use_inter_dct_only = extra_cfg->use_inter_dct_only;
+  oxcf->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only;
+  oxcf->quant_b_adapt = extra_cfg->quant_b_adapt;
+  oxcf->coeff_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq;
+  oxcf->mode_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq;
 #if CONFIG_DIST_8X8
   oxcf->using_dist_8x8 = extra_cfg->enable_dist_8x8;
   if (extra_cfg->tuning == AOM_TUNE_CDEF_DIST ||
@@ -539,7 +644,6 @@ static aom_codec_err_t set_encoder_config(
   // In large-scale tile encoding mode, num_tile_groups is always 1.
   if (cfg->large_scale_tile) oxcf->num_tile_groups = 1;
   oxcf->mtu = extra_cfg->mtu_size;
-  oxcf->enable_tpl_model = extra_cfg->enable_tpl_model;
 
   // FIXME(debargha): Should this be:
   // oxcf->allow_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs &
@@ -579,6 +683,9 @@ static aom_codec_err_t set_encoder_config(
     }
   }
 
+  oxcf->enable_tpl_model =
+      extra_cfg->enable_tpl_model && (oxcf->superres_mode == SUPERRES_NONE);
+
   oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
   oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
   oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
@@ -604,10 +711,6 @@ static aom_codec_err_t set_encoder_config(
 
   oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in;
 
-#if CONFIG_FP_MB_STATS
-  oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
-#endif
-
   oxcf->color_primaries = extra_cfg->color_primaries;
   oxcf->transfer_characteristics = extra_cfg->transfer_characteristics;
   oxcf->matrix_coefficients = extra_cfg->matrix_coefficients;
@@ -623,6 +726,7 @@ static aom_codec_err_t set_encoder_config(
   oxcf->arnr_strength = extra_cfg->arnr_strength;
   oxcf->min_gf_interval = extra_cfg->min_gf_interval;
   oxcf->max_gf_interval = extra_cfg->max_gf_interval;
+  oxcf->gf_max_pyr_height = extra_cfg->gf_max_pyr_height;
 
   oxcf->tuning = extra_cfg->tuning;
   oxcf->content = extra_cfg->content;
@@ -659,16 +763,43 @@ static aom_codec_err_t set_encoder_config(
 
   oxcf->monochrome = cfg->monochrome;
   oxcf->full_still_picture_hdr = cfg->full_still_picture_hdr;
-  oxcf->enable_dual_filter = extra_cfg->use_dual_filter;
+  oxcf->enable_dual_filter = extra_cfg->enable_dual_filter;
+  oxcf->enable_rect_partitions = extra_cfg->enable_rect_partitions;
+  oxcf->enable_ab_partitions = extra_cfg->enable_ab_partitions;
+  oxcf->enable_1to4_partitions = extra_cfg->enable_1to4_partitions;
+  oxcf->min_partition_size = extra_cfg->min_partition_size;
+  oxcf->max_partition_size = extra_cfg->max_partition_size;
+  oxcf->enable_intra_edge_filter = extra_cfg->enable_intra_edge_filter;
+  oxcf->enable_tx64 = extra_cfg->enable_tx64;
+  oxcf->tx_size_search_method = extra_cfg->tx_size_search_method;
+  oxcf->enable_flip_idtx = extra_cfg->enable_flip_idtx;
   oxcf->enable_order_hint = extra_cfg->enable_order_hint;
-  oxcf->enable_jnt_comp =
-      extra_cfg->enable_jnt_comp & extra_cfg->enable_order_hint;
+  oxcf->enable_dist_wtd_comp =
+      extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint;
+  oxcf->max_reference_frames = extra_cfg->max_reference_frames;
+  oxcf->enable_reduced_reference_set = extra_cfg->enable_reduced_reference_set;
+  oxcf->enable_masked_comp = extra_cfg->enable_masked_comp;
+  oxcf->enable_onesided_comp = extra_cfg->enable_onesided_comp;
+  oxcf->enable_diff_wtd_comp =
+      extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp;
+  oxcf->enable_interinter_wedge =
+      extra_cfg->enable_masked_comp & extra_cfg->enable_interinter_wedge;
+  oxcf->enable_interintra_comp = extra_cfg->enable_interintra_comp;
+  oxcf->enable_smooth_interintra =
+      extra_cfg->enable_interintra_comp && extra_cfg->enable_smooth_interintra;
+  oxcf->enable_interintra_wedge =
+      extra_cfg->enable_interintra_comp & extra_cfg->enable_interintra_wedge;
   oxcf->enable_ref_frame_mvs =
       extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint;
 
+  oxcf->enable_global_motion = extra_cfg->enable_global_motion;
   oxcf->enable_warped_motion = extra_cfg->enable_warped_motion;
   oxcf->allow_warped_motion =
       extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion;
+  oxcf->enable_filter_intra = extra_cfg->enable_filter_intra;
+  oxcf->enable_smooth_intra = extra_cfg->enable_smooth_intra;
+  oxcf->enable_paeth_intra = extra_cfg->enable_paeth_intra;
+  oxcf->enable_cfl_intra = extra_cfg->enable_cfl_intra;
 
   oxcf->enable_superres =
       (oxcf->superres_mode != SUPERRES_NONE) && extra_cfg->enable_superres;
@@ -710,23 +841,14 @@ static aom_codec_err_t set_encoder_config(
   oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
   oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
 
-#if CONFIG_REDUCED_ENCODER_BORDER
-  if (oxcf->superres_mode != SUPERRES_NONE ||
-      oxcf->resize_mode != RESIZE_NONE) {
-    warn(
-        "Superres / resize cannot be used with CONFIG_REDUCED_ENCODER_BORDER. "
-        "Disabling superres/resize.\n");
-    // return AOM_CODEC_INVALID_PARAM;
-    disable_superres(oxcf);
-    oxcf->resize_mode = RESIZE_NONE;
-    oxcf->resize_scale_denominator = SCALE_NUMERATOR;
-    oxcf->resize_kf_scale_denominator = SCALE_NUMERATOR;
-  }
-#endif  // CONFIG_REDUCED_ENCODER_BORDER
-
   oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
   oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
-
+  oxcf->border_in_pixels = (oxcf->resize_mode || oxcf->superres_mode)
+                               ? AOM_BORDER_IN_PIXELS
+                               : AOM_ENC_NO_SCALE_BORDER;
+  memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx,
+         sizeof(oxcf->target_seq_level_idx));
+  oxcf->tier_mask = extra_cfg->tier_mask;
   return AOM_CODEC_OK;
 }
 
@@ -939,6 +1061,13 @@ static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_obmc = CAST(AV1E_SET_ENABLE_OBMC, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_disable_trellis_quant(aom_codec_alg_priv_t *ctx,
                                                       va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1007,10 +1136,55 @@ static aom_codec_err_t ctrl_set_timing_info_type(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-static aom_codec_err_t ctrl_set_enable_df(aom_codec_alg_priv_t *ctx,
-                                          va_list args) {
+static aom_codec_err_t ctrl_set_enable_dual_filter(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_dual_filter = CAST(AV1E_SET_ENABLE_DUAL_FILTER, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_rect_partitions(
+    aom_codec_alg_priv_t *ctx, va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.use_dual_filter = CAST(AV1E_SET_ENABLE_DF, args);
+  extra_cfg.enable_rect_partitions =
+      CAST(AV1E_SET_ENABLE_RECT_PARTITIONS, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_ab_partitions(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_ab_partitions = CAST(AV1E_SET_ENABLE_AB_PARTITIONS, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_1to4_partitions(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_1to4_partitions =
+      CAST(AV1E_SET_ENABLE_1TO4_PARTITIONS, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_min_partition_size(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.min_partition_size = CAST(AV1E_SET_MIN_PARTITION_SIZE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_max_partition_size(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.max_partition_size = CAST(AV1E_SET_MAX_PARTITION_SIZE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_intra_edge_filter(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_intra_edge_filter =
+      CAST(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1021,10 +1195,46 @@ static aom_codec_err_t ctrl_set_enable_order_hint(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-static aom_codec_err_t ctrl_set_enable_jnt_comp(aom_codec_alg_priv_t *ctx,
-                                                va_list args) {
+static aom_codec_err_t ctrl_set_enable_tx64(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_tx64 = CAST(AV1E_SET_ENABLE_TX64, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tx_size_search_method(aom_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.tx_size_search_method = CAST(AV1E_SET_TX_SIZE_SEARCH_METHOD, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_flip_idtx(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_flip_idtx = CAST(AV1E_SET_ENABLE_FLIP_IDTX, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_dist_wtd_comp(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_jnt_comp = CAST(AV1E_SET_ENABLE_JNT_COMP, args);
+  extra_cfg.enable_dist_wtd_comp = CAST(AV1E_SET_ENABLE_DIST_WTD_COMP, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_max_reference_frames(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.max_reference_frames = CAST(AV1E_SET_MAX_REFERENCE_FRAMES, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_reduced_reference_set(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_reduced_reference_set =
+      CAST(AV1E_SET_REDUCED_REFERENCE_SET, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1042,6 +1252,66 @@ static aom_codec_err_t ctrl_set_allow_ref_frame_mvs(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_masked_comp(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_masked_comp = CAST(AV1E_SET_ENABLE_MASKED_COMP, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_onesided_comp(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_onesided_comp = CAST(AV1E_SET_ENABLE_ONESIDED_COMP, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_interintra_comp(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_interintra_comp =
+      CAST(AV1E_SET_ENABLE_INTERINTRA_COMP, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_smooth_interintra(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_smooth_interintra =
+      CAST(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_diff_wtd_comp(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_diff_wtd_comp = CAST(AV1E_SET_ENABLE_DIFF_WTD_COMP, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_interinter_wedge(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_interinter_wedge =
+      CAST(AV1E_SET_ENABLE_INTERINTER_WEDGE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_interintra_wedge(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_interintra_wedge =
+      CAST(AV1E_SET_ENABLE_INTERINTRA_WEDGE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_global_motion(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_global_motion = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1056,6 +1326,34 @@ static aom_codec_err_t ctrl_set_allow_warped_motion(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_filter_intra(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_filter_intra = CAST(AV1E_SET_ENABLE_FILTER_INTRA, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_smooth_intra(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_smooth_intra = CAST(AV1E_SET_ENABLE_SMOOTH_INTRA, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_paeth_intra(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_paeth_intra = CAST(AV1E_SET_ENABLE_PAETH_INTRA, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_cfl_intra(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_cfl_intra = CAST(AV1E_SET_ENABLE_CFL_INTRA, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1063,6 +1361,27 @@ static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_palette(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_palette = CAST(AV1E_SET_ENABLE_PALETTE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_intrabc(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_intrabc = CAST(AV1E_SET_ENABLE_INTRABC, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_angle_delta(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_angle_delta = CAST(AV1E_SET_ENABLE_ANGLE_DELTA, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_error_resilient_mode(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1099,6 +1418,56 @@ static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_reduced_tx_type_set(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.reduced_tx_type_set = CAST(AV1E_SET_REDUCED_TX_TYPE_SET, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_intra_dct_only(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.use_intra_dct_only = CAST(AV1E_SET_INTRA_DCT_ONLY, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_inter_dct_only(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.use_inter_dct_only = CAST(AV1E_SET_INTER_DCT_ONLY, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_intra_default_tx_only(aom_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.use_intra_default_tx_only =
+      CAST(AV1E_SET_INTRA_DEFAULT_TX_ONLY, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_quant_b_adapt(aom_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.quant_b_adapt = CAST(AV1E_SET_QUANT_B_ADAPT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_coeff_cost_upd_freq(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.coeff_cost_upd_freq = CAST(AV1E_SET_COEFF_COST_UPD_FREQ, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_mode_cost_upd_freq(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.mode_cost_upd_freq = CAST(AV1E_SET_MODE_COST_UPD_FREQ, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_film_grain_test_vector(
     aom_codec_alg_priv_t *ctx, va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1152,6 +1521,13 @@ static aom_codec_err_t ctrl_set_max_gf_interval(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_gf_max_pyr_height(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.gf_max_pyr_height = CAST(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_frame_periodic_boost(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1167,6 +1543,26 @@ static aom_codec_err_t ctrl_enable_motion_vector_unit_test(
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_target_seq_level_idx(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  const int val = CAST(AV1E_SET_TARGET_SEQ_LEVEL_IDX, args);
+  const int level = val % 100;
+  const int operating_point_idx = val / 100;
+  if (operating_point_idx >= 0 &&
+      operating_point_idx < MAX_NUM_OPERATING_POINTS) {
+    extra_cfg.target_seq_level_idx[operating_point_idx] = (AV1_LEVEL)level;
+  }
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tier_mask(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.tier_mask = CAST(AV1E_SET_TIER_MASK, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx,
                                     aom_codec_priv_enc_mr_cfg_t *data) {
   aom_codec_err_t res = AOM_CODEC_OK;
@@ -1269,8 +1665,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
       }
     }
   }
-
-  if (ctx->oxcf.mode != GOOD) {
+  if (ctx->oxcf.mode != GOOD && ctx->oxcf.mode != REALTIME) {
     ctx->oxcf.mode = GOOD;
     av1_change_config(ctx->cpi, &ctx->oxcf);
   }
@@ -1328,6 +1723,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
     unsigned char *cx_data = ctx->cx_data;
     size_t cx_data_sz = ctx->cx_data_sz;
 
+    assert(!(cx_data == NULL && cx_data_sz != 0));
+
     /* Any pending invisible frames? */
     if (ctx->pending_cx_data) {
       memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);
@@ -1355,12 +1752,6 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
            -1 != av1_get_compressed_data(cpi, &lib_flags, &frame_size, cx_data,
                                          &dst_time_stamp, &dst_end_time_stamp,
                                          !img, timebase)) {
-      if (cpi->common.seq_params.frame_id_numbers_present_flag) {
-        if (cpi->common.invalid_delta_frame_id_minus_1) {
-          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
-                             "Invalid delta_frame_id_minus_1");
-        }
-      }
       cpi->seq_params_locked = 1;
       if (frame_size) {
         if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data;
@@ -1380,8 +1771,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
                     frame_size);
           }
           const uint32_t obu_header_offset = 0;
-          obu_header_size = write_obu_header(
-              OBU_TEMPORAL_DELIMITER, 0,
+          obu_header_size = av1_write_obu_header(
+              cpi, OBU_TEMPORAL_DELIMITER, 0,
               (uint8_t *)(ctx->pending_cx_data + obu_header_offset));
 
           // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
@@ -1742,6 +2133,13 @@ static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  return av1_get_seq_level_idx(ctx->cpi, arg);
+}
+
 static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1_COPY_REFERENCE, ctrl_copy_reference },
   { AOME_USE_REFERENCE, ctrl_use_reference },
@@ -1773,6 +2171,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_LOSSLESS, ctrl_set_lossless },
   { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef },
   { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration },
+  { AV1E_SET_ENABLE_OBMC, ctrl_set_enable_obmc },
   { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant },
   { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm },
   { AV1E_SET_QM_Y, ctrl_set_qm_y },
@@ -1789,15 +2188,48 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode },
   { AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode },
   { AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode },
-  { AV1E_SET_ENABLE_DF, ctrl_set_enable_df },
+  { AV1E_SET_ENABLE_RECT_PARTITIONS, ctrl_set_enable_rect_partitions },
+  { AV1E_SET_ENABLE_AB_PARTITIONS, ctrl_set_enable_ab_partitions },
+  { AV1E_SET_ENABLE_1TO4_PARTITIONS, ctrl_set_enable_1to4_partitions },
+  { AV1E_SET_MIN_PARTITION_SIZE, ctrl_set_min_partition_size },
+  { AV1E_SET_MAX_PARTITION_SIZE, ctrl_set_max_partition_size },
+  { AV1E_SET_ENABLE_DUAL_FILTER, ctrl_set_enable_dual_filter },
+  { AV1E_SET_ENABLE_INTRA_EDGE_FILTER, ctrl_set_enable_intra_edge_filter },
   { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
-  { AV1E_SET_ENABLE_JNT_COMP, ctrl_set_enable_jnt_comp },
+  { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 },
+  { AV1E_SET_TX_SIZE_SEARCH_METHOD, ctrl_set_tx_size_search_method },
+  { AV1E_SET_ENABLE_FLIP_IDTX, ctrl_set_enable_flip_idtx },
+  { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp },
+  { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames },
+  { AV1E_SET_REDUCED_REFERENCE_SET, ctrl_set_enable_reduced_reference_set },
   { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs },
   { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs },
+  { AV1E_SET_ENABLE_MASKED_COMP, ctrl_set_enable_masked_comp },
+  { AV1E_SET_ENABLE_ONESIDED_COMP, ctrl_set_enable_onesided_comp },
+  { AV1E_SET_ENABLE_INTERINTRA_COMP, ctrl_set_enable_interintra_comp },
+  { AV1E_SET_ENABLE_SMOOTH_INTERINTRA, ctrl_set_enable_smooth_interintra },
+  { AV1E_SET_ENABLE_DIFF_WTD_COMP, ctrl_set_enable_diff_wtd_comp },
+  { AV1E_SET_ENABLE_INTERINTER_WEDGE, ctrl_set_enable_interinter_wedge },
+  { AV1E_SET_ENABLE_INTERINTRA_WEDGE, ctrl_set_enable_interintra_wedge },
+  { AV1E_SET_ENABLE_GLOBAL_MOTION, ctrl_set_enable_global_motion },
   { AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion },
   { AV1E_SET_ALLOW_WARPED_MOTION, ctrl_set_allow_warped_motion },
+  { AV1E_SET_ENABLE_FILTER_INTRA, ctrl_set_enable_filter_intra },
+  { AV1E_SET_ENABLE_SMOOTH_INTRA, ctrl_set_enable_smooth_intra },
+  { AV1E_SET_ENABLE_PAETH_INTRA, ctrl_set_enable_paeth_intra },
+  { AV1E_SET_ENABLE_CFL_INTRA, ctrl_set_enable_cfl_intra },
   { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres },
+  { AV1E_SET_ENABLE_PALETTE, ctrl_set_enable_palette },
+  { AV1E_SET_ENABLE_INTRABC, ctrl_set_enable_intrabc },
+  { AV1E_SET_ENABLE_ANGLE_DELTA, ctrl_set_enable_angle_delta },
   { AV1E_SET_AQ_MODE, ctrl_set_aq_mode },
+  { AV1E_SET_REDUCED_TX_TYPE_SET, ctrl_set_reduced_tx_type_set },
+  { AV1E_SET_INTRA_DCT_ONLY, ctrl_set_intra_dct_only },
+  { AV1E_SET_INTER_DCT_ONLY, ctrl_set_inter_dct_only },
+  { AV1E_SET_INTRA_DEFAULT_TX_ONLY, ctrl_set_intra_default_tx_only },
+  { AV1E_SET_QUANT_B_ADAPT, ctrl_set_quant_b_adapt },
+  { AV1E_SET_COEFF_COST_UPD_FREQ, ctrl_set_coeff_cost_upd_freq },
+  { AV1E_SET_MODE_COST_UPD_FREQ, ctrl_set_mode_cost_upd_freq },
   { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
   { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
   { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
@@ -1810,6 +2242,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity },
   { AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval },
   { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval },
+  { AV1E_SET_GF_MAX_PYRAMID_HEIGHT, ctrl_set_gf_max_pyr_height },
   { AV1E_SET_RENDER_SIZE, ctrl_set_render_size },
   { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
   { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
@@ -1820,6 +2253,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
 #endif  // CONFIG_FILM_GRAIN
   { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
+  { AV1E_SET_TARGET_SEQ_LEVEL_IDX, ctrl_set_target_seq_level_idx },
+  { AV1E_SET_TIER_MASK, ctrl_set_tier_mask },
 
   // Getters
   { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
@@ -1830,6 +2265,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
   { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
   { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
+  { AV1E_GET_SEQ_LEVEL_IDX, ctrl_get_seq_level_idx },
   { -1, NULL },
 };
 
@@ -1837,7 +2273,7 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
   { 0,
     {
         // NOLINT
-        0,  // g_usage
+        0,  // g_usage - non-realtime usage
         0,  // g_threads
         0,  // g_profile
 
@@ -1862,11 +2298,11 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
         SCALE_NUMERATOR,  // rc_resize_denominator
         SCALE_NUMERATOR,  // rc_resize_kf_denominator
 
-        0,                // rc_superres_mode
+        SUPERRES_NONE,    // rc_superres_mode
         SCALE_NUMERATOR,  // rc_superres_denominator
         SCALE_NUMERATOR,  // rc_superres_kf_denominator
         63,               // rc_superres_qthresh
-        63,               // rc_superres_kf_qthresh
+        32,               // rc_superres_kf_qthresh
 
         AOM_VBR,      // rc_end_usage
         { NULL, 0 },  // rc_twopass_stats_in
@@ -1902,6 +2338,74 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
         { 0 },        // tile_heights
         { 1 },        // config file
     } },
+  { 1,
+    {
+        // NOLINT
+        1,  // g_usage - real-time usage
+        0,  // g_threads
+        0,  // g_profile
+
+        320,         // g_width
+        240,         // g_height
+        0,           // g_limit
+        0,           // g_forced_max_frame_width
+        0,           // g_forced_max_frame_height
+        AOM_BITS_8,  // g_bit_depth
+        8,           // g_input_bit_depth
+
+        { 1, 30 },  // g_timebase
+
+        0,  // g_error_resilient
+
+        AOM_RC_ONE_PASS,  // g_pass
+
+        1,  // g_lag_in_frames
+
+        0,                // rc_dropframe_thresh
+        RESIZE_NONE,      // rc_resize_mode
+        SCALE_NUMERATOR,  // rc_resize_denominator
+        SCALE_NUMERATOR,  // rc_resize_kf_denominator
+
+        0,                // rc_superres_mode
+        SCALE_NUMERATOR,  // rc_superres_denominator
+        SCALE_NUMERATOR,  // rc_superres_kf_denominator
+        63,               // rc_superres_qthresh
+        32,               // rc_superres_kf_qthresh
+
+        AOM_CBR,      // rc_end_usage
+        { NULL, 0 },  // rc_twopass_stats_in
+        { NULL, 0 },  // rc_firstpass_mb_stats_in
+        256,          // rc_target_bandwidth
+        0,            // rc_min_quantizer
+        63,           // rc_max_quantizer
+        25,           // rc_undershoot_pct
+        25,           // rc_overshoot_pct
+
+        6000,  // rc_max_buffer_size
+        4000,  // rc_buffer_initial_size
+        5000,  // rc_buffer_optimal_size
+
+        50,    // rc_two_pass_vbrbias
+        0,     // rc_two_pass_vbrmin_section
+        2000,  // rc_two_pass_vbrmax_section
+
+        // keyframing settings (kf)
+        0,            // fwd_kf_enabled
+        AOM_KF_AUTO,  // g_kfmode
+        0,            // kf_min_dist
+        9999,         // kf_max_dist
+        0,            // sframe_dist
+        1,            // sframe_mode
+        0,            // large_scale_tile
+        0,            // monochrome
+        0,            // full_still_picture_hdr
+        0,            // save_as_annexb
+        0,            // tile_width_count
+        0,            // tile_height_count
+        { 0 },        // tile_widths
+        { 0 },        // tile_heights
+        { 1 },        // config file
+    } },
 };
 
 #ifndef VERSION_STRING
@@ -1925,7 +2429,7 @@ CODEC_INTERFACE(aom_codec_av1_cx) = {
   },
   {
       // NOLINT
-      1,                           // 1 cfg map
+      2,                           // 2 cfg map
       encoder_usage_cfg_map,       // aom_codec_enc_cfg_map_t
       encoder_encode,              // aom_codec_encode_fn_t
       encoder_get_cxdata,          // aom_codec_get_cx_data_fn_t
diff --git a/libaom/av1/av1_dx_iface.c b/libaom/av1/av1_dx_iface.c
index 08da650..ca872d7 100644
--- a/libaom/av1/av1_dx_iface.c
+++ b/libaom/av1/av1_dx_iface.c
@@ -44,7 +44,7 @@ struct aom_codec_alg_priv {
   int img_avail;
   int flushed;
   int invert_tile_order;
-  int last_show_frame;  // Index of last output frame.
+  RefCntBuffer *last_show_frame;  // Last output frame buffer
   int byte_alignment;
   int skip_loop_filter;
   int skip_film_grain;
@@ -154,6 +154,49 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
   return AOM_CODEC_OK;
 }
 
+static aom_codec_err_t parse_timing_info(struct aom_read_bit_buffer *rb) {
+  const uint32_t num_units_in_display_tick =
+      aom_rb_read_unsigned_literal(rb, 32);
+  const uint32_t time_scale = aom_rb_read_unsigned_literal(rb, 32);
+  if (num_units_in_display_tick == 0 || time_scale == 0)
+    return AOM_CODEC_UNSUP_BITSTREAM;
+  const uint8_t equal_picture_interval = aom_rb_read_bit(rb);
+  if (equal_picture_interval) {
+    const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
+    if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
+      // num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1.
+      return AOM_CODEC_UNSUP_BITSTREAM;
+    }
+  }
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_decoder_model_info(
+    struct aom_read_bit_buffer *rb, int *buffer_delay_length_minus_1) {
+  *buffer_delay_length_minus_1 = aom_rb_read_literal(rb, 5);
+  const uint32_t num_units_in_decoding_tick =
+      aom_rb_read_unsigned_literal(rb, 32);
+  const uint8_t buffer_removal_time_length_minus_1 = aom_rb_read_literal(rb, 5);
+  const uint8_t frame_presentation_time_length_minus_1 =
+      aom_rb_read_literal(rb, 5);
+  (void)num_units_in_decoding_tick;
+  (void)buffer_removal_time_length_minus_1;
+  (void)frame_presentation_time_length_minus_1;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_op_parameters_info(
+    struct aom_read_bit_buffer *rb, int buffer_delay_length_minus_1) {
+  const int n = buffer_delay_length_minus_1 + 1;
+  const uint32_t decoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
+  const uint32_t encoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
+  const uint8_t low_delay_mode_flag = aom_rb_read_bit(rb);
+  (void)decoder_buffer_delay;
+  (void)encoder_buffer_delay;
+  (void)low_delay_mode_flag;
+  return AOM_CODEC_OK;
+}
+
 // Parses the operating points (including operating_point_idc, seq_level_idx,
 // and seq_tier) and then sets si->number_spatial_layers and
 // si->number_temporal_layers based on operating_point_idc[0].
@@ -161,10 +204,23 @@ static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb,
                                               int is_reduced_header,
                                               aom_codec_stream_info_t *si) {
   int operating_point_idc0 = 0;
-
   if (is_reduced_header) {
     aom_rb_read_literal(rb, LEVEL_BITS);  // level
   } else {
+    uint8_t decoder_model_info_present_flag = 0;
+    int buffer_delay_length_minus_1 = 0;
+    aom_codec_err_t status;
+    const uint8_t timing_info_present_flag = aom_rb_read_bit(rb);
+    if (timing_info_present_flag) {
+      if ((status = parse_timing_info(rb)) != AOM_CODEC_OK) return status;
+      decoder_model_info_present_flag = aom_rb_read_bit(rb);
+      if (decoder_model_info_present_flag) {
+        if ((status = parse_decoder_model_info(
+                 rb, &buffer_delay_length_minus_1)) != AOM_CODEC_OK)
+          return status;
+      }
+    }
+    const uint8_t initial_display_delay_present_flag = aom_rb_read_bit(rb);
     const uint8_t operating_points_cnt_minus_1 =
         aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
     for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) {
@@ -173,6 +229,20 @@ static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb,
       if (i == 0) operating_point_idc0 = operating_point_idc;
       int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);  // level
       if (seq_level_idx > 7) aom_rb_read_bit(rb);               // tier
+      if (decoder_model_info_present_flag) {
+        const uint8_t decoder_model_present_for_this_op = aom_rb_read_bit(rb);
+        if (decoder_model_present_for_this_op) {
+          if ((status = parse_op_parameters_info(
+                   rb, buffer_delay_length_minus_1)) != AOM_CODEC_OK)
+            return status;
+        }
+      }
+      if (initial_display_delay_present_flag) {
+        const uint8_t initial_display_delay_present_for_this_op =
+            aom_rb_read_bit(rb);
+        if (initial_display_delay_present_for_this_op)
+          aom_rb_read_literal(rb, 4);  // initial_display_delay_minus_1
+      }
     }
   }
 
@@ -203,7 +273,7 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
   memset(&obu_header, 0, sizeof(obu_header));
   size_t payload_size = 0;
   size_t bytes_read = 0;
-  int reduced_still_picture_hdr = 0;
+  uint8_t reduced_still_picture_hdr = 0;
   aom_codec_err_t status = aom_read_obu_header_and_size(
       data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
   if (status != AOM_CODEC_OK) return status;
@@ -232,7 +302,7 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
       struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
 
       av1_read_profile(&rb);  // profile
-      const int still_picture = aom_rb_read_bit(&rb);
+      const uint8_t still_picture = aom_rb_read_bit(&rb);
       reduced_still_picture_hdr = aom_rb_read_bit(&rb);
 
       if (!still_picture && reduced_still_picture_hdr) {
@@ -317,7 +387,6 @@ static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
     AV1_COMMON *const cm = &frame_worker_data->pbi->common;
     BufferPool *const pool = cm->buffer_pool;
 
-    cm->new_fb_idx = INVALID_IDX;
     cm->cur_frame = NULL;
     cm->byte_alignment = ctx->byte_alignment;
     cm->skip_loop_filter = ctx->skip_loop_filter;
@@ -357,7 +426,6 @@ static int frame_worker_hook(void *arg1, void *arg2) {
 
   if (result != 0) {
     // Check decode result in serial decode.
-    frame_worker_data->pbi->common.cur_frame->buf.corrupted = 1;
     frame_worker_data->pbi->need_resync = 1;
   }
   return !result;
@@ -367,7 +435,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
   int i;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 
-  ctx->last_show_frame = -1;
+  ctx->last_show_frame = NULL;
   ctx->next_output_worker_id = 0;
   ctx->need_resync = 1;
   ctx->num_frame_workers = 1;
@@ -449,8 +517,7 @@ static INLINE void check_resync(aom_codec_alg_priv_t *const ctx,
                                 const AV1Decoder *const pbi) {
   // Clear resync flag if worker got a key frame or intra only frame.
   if (ctx->need_resync == 1 && pbi->need_resync == 0 &&
-      (pbi->common.current_frame.intra_only ||
-       pbi->common.current_frame.frame_type == KEY_FRAME))
+      frame_is_intra_only(&pbi->common))
     ctx->need_resync = 0;
 }
 
@@ -529,7 +596,7 @@ static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx,
 
   data2->idx = -1;
   for (int i = 0; i < REF_FRAMES; ++i)
-    if (cm->ref_frame_map[i] == cm->new_fb_idx) data2->idx = i;
+    if (cm->ref_frame_map[i] == cm->cur_frame) data2->idx = i;
   data2->buf = data;
   data2->show_existing = cm->show_existing_frame;
   return res;
@@ -551,7 +618,6 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
   // arguments are invalid.
   if (ctx->frame_workers) {
     BufferPool *const pool = ctx->buffer_pool;
-    RefCntBuffer *const frame_bufs = pool->frame_bufs;
     lock_buffer_pool(pool);
     for (int i = 0; i < ctx->num_frame_workers; ++i) {
       AVxWorker *const worker = &ctx->frame_workers[i];
@@ -559,7 +625,7 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
           (FrameWorkerData *)worker->data1;
       struct AV1Decoder *pbi = frame_worker_data->pbi;
       for (size_t j = 0; j < pbi->num_output_frames; j++) {
-        decrease_ref_count(pbi->output_frame_index[j], frame_bufs, pool);
+        decrease_ref_count(pbi->output_frames[j], pool);
       }
       pbi->num_output_frames = 0;
     }
@@ -696,7 +762,6 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
           (FrameWorkerData *)worker->data1;
       AV1Decoder *const pbi = frame_worker_data->pbi;
       AV1_COMMON *const cm = &pbi->common;
-      RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
       ctx->next_output_worker_id =
           (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
       // Wait for the frame from worker thread.
@@ -709,8 +774,8 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
         aom_film_grain_t *grain_params;
         if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
                               &grain_params) == 0) {
-          const int buf_idx = pbi->output_frame_index[*index];
-          ctx->last_show_frame = buf_idx;
+          RefCntBuffer *const output_frame_buf = pbi->output_frames[*index];
+          ctx->last_show_frame = output_frame_buf;
           if (ctx->need_resync) return NULL;
           yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
 
@@ -725,8 +790,10 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
           const int num_planes = av1_num_planes(cm);
           if (pbi->ext_tile_debug && cm->single_tile_decoding &&
               pbi->dec_tile_row >= 0) {
+            int tile_width, tile_height;
+            av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
             const int tile_row = AOMMIN(pbi->dec_tile_row, cm->tile_rows - 1);
-            const int mi_row = tile_row * cm->tile_height;
+            const int mi_row = tile_row * tile_height;
             const int ssy = ctx->img.y_chroma_shift;
             int plane;
             ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
@@ -736,14 +803,15 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
                     mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
               }
             }
-            ctx->img.d_h =
-                AOMMIN(cm->tile_height, cm->mi_rows - mi_row) * MI_SIZE;
+            ctx->img.d_h = AOMMIN(tile_height, cm->mi_rows - mi_row) * MI_SIZE;
           }
 
           if (pbi->ext_tile_debug && cm->single_tile_decoding &&
               pbi->dec_tile_col >= 0) {
+            int tile_width, tile_height;
+            av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
             const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1);
-            const int mi_col = tile_col * cm->tile_width;
+            const int mi_col = tile_col * tile_width;
             const int ssx = ctx->img.x_chroma_shift;
             const int is_hbd =
                 (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
@@ -755,11 +823,10 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
                     mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
               }
             }
-            ctx->img.d_w =
-                AOMMIN(cm->tile_width, cm->mi_cols - mi_col) * MI_SIZE;
+            ctx->img.d_w = AOMMIN(tile_width, cm->mi_cols - mi_col) * MI_SIZE;
           }
 
-          ctx->img.fb_priv = frame_bufs[buf_idx].raw_frame_buffer.priv;
+          ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv;
           img = &ctx->img;
           img->temporal_id = cm->temporal_layer_id;
           img->spatial_id = cm->spatial_layer_id;
@@ -911,7 +978,8 @@ static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx,
       AVxWorker *const worker = ctx->frame_workers;
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
-      *update_info = frame_worker_data->pbi->refresh_frame_flags;
+      *update_info =
+          frame_worker_data->pbi->common.current_frame.refresh_frame_flags;
       return AOM_CODEC_OK;
     } else {
       return AOM_CODEC_ERROR;
@@ -940,11 +1008,10 @@ static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx,
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       AV1Decoder *const pbi = frame_worker_data->pbi;
-      RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs;
       if (pbi->seen_frame_header && pbi->num_output_frames == 0)
         return AOM_CODEC_ERROR;
-      if (ctx->last_show_frame >= 0)
-        *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
+      if (ctx->last_show_frame != NULL)
+        *corrupted = ctx->last_show_frame->buf.corrupted;
       return AOM_CODEC_OK;
     } else {
       return AOM_CODEC_ERROR;
@@ -1124,8 +1191,9 @@ static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx,
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
-      *tile_size =
-          ((cm->tile_width * MI_SIZE) << 16) + cm->tile_height * MI_SIZE;
+      int tile_width, tile_height;
+      av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+      *tile_size = ((tile_width * MI_SIZE) << 16) + tile_height * MI_SIZE;
       return AOM_CODEC_OK;
     } else {
       return AOM_CODEC_ERROR;
diff --git a/libaom/av1/av1_iface_common.h b/libaom/av1/av1_iface_common.h
index 713d8c3..5568c89 100644
--- a/libaom/av1/av1_iface_common.h
+++ b/libaom/av1/av1_iface_common.h
@@ -124,7 +124,12 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
   } else {
     yv12->flags = 0;
   }
-  yv12->border = (yv12->y_stride - img->w) / 2;
+
+  // Note(yunqing): if img is allocated the same as the frame buffer, y_stride
+  // is 32-byte aligned. Also, handle the cases while allocating img without a
+  // border or stride_align is less than 32.
+  int border = (yv12->y_stride - (int)((img->w + 31) & ~31)) / 2;
+  yv12->border = (border < 0) ? 0 : border;
   yv12->subsampling_x = img->x_chroma_shift;
   yv12->subsampling_y = img->y_chroma_shift;
   return AOM_CODEC_OK;
diff --git a/libaom/av1/common/alloccommon.c b/libaom/av1/common/alloccommon.c
index 39b6b73..1c8528a 100644
--- a/libaom/av1/common/alloccommon.c
+++ b/libaom/av1/common/alloccommon.c
@@ -139,7 +139,7 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
   // Now we need to allocate enough space to store the line buffers for the
   // stripes
   const int frame_w = cm->superres_upscaled_width;
-  const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+  const int use_highbd = cm->seq_params.use_highbitdepth;
 
   for (int p = 0; p < num_planes; ++p) {
     const int is_uv = p > 0;
diff --git a/libaom/av1/common/arm/av1_txfm_neon.c b/libaom/av1/common/arm/av1_txfm_neon.c
index de3c547..7e3a05a 100644
--- a/libaom/av1/common/arm/av1_txfm_neon.c
+++ b/libaom/av1/common/arm/av1_txfm_neon.c
@@ -12,6 +12,8 @@
 #include <arm_neon.h>
 #include <assert.h>
 
+#include "config/av1_rtcd.h"
+
 #include "aom_ports/mem.h"
 #include "av1/common/arm/mem_neon.h"
 
diff --git a/libaom/av1/common/arm/jnt_convolve_neon.c b/libaom/av1/common/arm/jnt_convolve_neon.c
index e5674ef..379ff98 100644
--- a/libaom/av1/common/arm/jnt_convolve_neon.c
+++ b/libaom/av1/common/arm/jnt_convolve_neon.c
@@ -23,19 +23,17 @@
 #include "av1/common/arm/transpose_neon.h"
 
 #if !defined(__aarch64__)
-static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
-                                   const uint16_t fwd_offset,
-                                   const uint16_t bck_offset,
-                                   const int16x4_t sub_const_vec,
-                                   const int16_t round_bits,
-                                   const int use_jnt_comp_avg, uint8x8_t *t0) {
+static INLINE void compute_avg_4x1(
+    uint16x4_t res0, uint16x4_t d0, const uint16_t fwd_offset,
+    const uint16_t bck_offset, const int16x4_t sub_const_vec,
+    const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
   int16x4_t tmp0;
   uint16x4_t tmp_u0;
   uint32x4_t sum0;
   int32x4_t dst0;
   int16x8_t tmp4;
 
-  if (use_jnt_comp_avg) {
+  if (use_dist_wtd_comp_avg) {
     const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
 
     sum0 = vmull_n_u16(res0, fwd_offset);
@@ -65,12 +63,10 @@ static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
   }
 }
 
-static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
-                                   const uint16_t fwd_offset,
-                                   const uint16_t bck_offset,
-                                   const int16x4_t sub_const,
-                                   const int16_t round_bits,
-                                   const int use_jnt_comp_avg, uint8x8_t *t0) {
+static INLINE void compute_avg_8x1(
+    uint16x8_t res0, uint16x8_t d0, const uint16_t fwd_offset,
+    const uint16_t bck_offset, const int16x4_t sub_const,
+    const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
   int16x4_t tmp0, tmp2;
   int16x8_t f0;
   uint32x4_t sum0, sum2;
@@ -78,7 +74,7 @@ static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
 
   uint16x8_t tmp_u0;
 
-  if (use_jnt_comp_avg) {
+  if (use_dist_wtd_comp_avg) {
     const int32x4_t sub_const_vec = vmovl_s16(sub_const);
     const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
 
@@ -123,7 +119,7 @@ static INLINE void compute_avg_4x4(
     uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
     const uint16_t fwd_offset, const uint16_t bck_offset,
     const int16x4_t sub_const_vec, const int16_t round_bits,
-    const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+    const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
   int16x4_t tmp0, tmp1, tmp2, tmp3;
   uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
   uint32x4_t sum0, sum1, sum2, sum3;
@@ -132,7 +128,7 @@ static INLINE void compute_avg_4x4(
   int16x8_t tmp4, tmp5;
   const int16x8_t zero = vdupq_n_s16(0);
 
-  if (use_jnt_comp_avg) {
+  if (use_dist_wtd_comp_avg) {
     const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
     const int32x4_t const_vec = vmovl_s16(sub_const_vec);
 
@@ -203,8 +199,8 @@ static INLINE void compute_avg_8x4(
     uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
     const uint16_t fwd_offset, const uint16_t bck_offset,
     const int16x4_t sub_const, const int16_t round_bits,
-    const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
-    uint8x8_t *t3) {
+    const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1,
+    uint8x8_t *t2, uint8x8_t *t3) {
   int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int16x8_t f0, f1, f2, f3;
   uint32x4_t sum0, sum1, sum2, sum3;
@@ -214,7 +210,7 @@ static INLINE void compute_avg_8x4(
   uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
   const int16x8_t zero = vdupq_n_s16(0);
 
-  if (use_jnt_comp_avg) {
+  if (use_dist_wtd_comp_avg) {
     const int32x4_t sub_const_vec = vmovl_s16(sub_const);
     const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
 
@@ -319,7 +315,7 @@ static INLINE void compute_avg_8x4(
   }
 }
 
-static INLINE void jnt_convolve_2d_horiz_neon(
+static INLINE void dist_wtd_convolve_2d_horiz_neon(
     const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
     int16_t *x_filter_tmp, const int im_h, int w, const int round_0) {
   const int bd = 8;
@@ -563,7 +559,7 @@ static INLINE void jnt_convolve_2d_horiz_neon(
   }
 }
 
-static INLINE void jnt_convolve_2d_vert_neon(
+static INLINE void dist_wtd_convolve_2d_vert_neon(
     int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
     ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) {
   uint8_t *dst_u8_ptr, *d_u8;
@@ -587,7 +583,7 @@ static INLINE void jnt_convolve_2d_vert_neon(
   const uint16_t fwd_offset = conv_params->fwd_offset;
   const uint16_t bck_offset = conv_params->bck_offset;
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
 
   int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
   uint16x4_t res4, d0;
@@ -652,8 +648,8 @@ static INLINE void jnt_convolve_2d_vert_neon(
         d += (dst_stride << 2);
 
         compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset,
-                        bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg,
-                        &t0, &t1);
+                        bck_offset, sub_const_vec, round_bits,
+                        use_dist_wtd_comp_avg, &t0, &t1);
 
         vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
         d_u8 += dst8_stride;
@@ -691,7 +687,7 @@ static INLINE void jnt_convolve_2d_vert_neon(
         d += (dst_stride);
 
         compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec,
-                        round_bits, use_jnt_comp_avg, &t0);
+                        round_bits, use_dist_wtd_comp_avg, &t0);
 
         vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
         d_u8 += dst8_stride;
@@ -717,12 +713,12 @@ static INLINE void jnt_convolve_2d_vert_neon(
   } while (w > 0);
 }
 
-void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
-                              int dst8_stride, int w, int h,
-                              const InterpFilterParams *filter_params_x,
-                              const InterpFilterParams *filter_params_y,
-                              const int subpel_x_q4, const int subpel_y_q4,
-                              ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
+                                   uint8_t *dst8, int dst8_stride, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params) {
   assert(!(w % 4));
   assert(!(h % 4));
 
@@ -748,19 +744,18 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
   filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
   vst1q_s16(&x_filter_tmp[0], filter_x_coef);
 
-  jnt_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
-                             x_filter_tmp, im_h, w, round_0);
+  dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
+                                  x_filter_tmp, im_h, w, round_0);
 
-  jnt_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, conv_params,
-                            y_filter, h, w);
+  dist_wtd_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride,
+                                 conv_params, y_filter, h, w);
 }
 
-void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
-                                   uint8_t *dst8, int dst8_stride, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
-                                   ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_neon(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params) {
   uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
       tmp_shift3;
   uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3;
@@ -811,7 +806,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
           compute_avg_8x4(tmp_q0, tmp_q1, tmp_q2, tmp_q3, res_q0, res_q1,
                           res_q2, res_q3, conv_params->fwd_offset,
                           conv_params->bck_offset, sub_const_vec, bits,
-                          conv_params->use_jnt_comp_avg, &tmp_shift0,
+                          conv_params->use_dist_wtd_comp_avg, &tmp_shift0,
                           &tmp_shift1, &tmp_shift2, &tmp_shift3);
 
           vst1_u8(dst8_1 + (0 * dst8_stride), tmp_shift0);
@@ -854,7 +849,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
 
         compute_avg_4x4(tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7,
                         conv_params->fwd_offset, conv_params->bck_offset,
-                        sub_const_vec, bits, conv_params->use_jnt_comp_avg,
+                        sub_const_vec, bits, conv_params->use_dist_wtd_comp_avg,
                         &tmp_shift0, &tmp_shift1);
 
         vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 0);
@@ -881,12 +876,12 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
   }
 }
 
-void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
-                             int dst8_stride, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_q4, const int subpel_y_q4,
-                             ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
+                                  uint8_t *dst8, int dst8_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
   assert(!(w % 4));
   assert(!(h % 4));
 
@@ -902,7 +897,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   const uint16_t fwd_offset = conv_params->fwd_offset;
   const uint16_t bck_offset = conv_params->bck_offset;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
 
   (void)filter_params_y;
   (void)subpel_y_q4;
@@ -1031,8 +1026,8 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
           compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
                           vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
                           vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
-                          round_offset_vec, round_bits, use_jnt_comp_avg, &t0,
-                          &t1);
+                          round_offset_vec, round_bits, use_dist_wtd_comp_avg,
+                          &t0, &t1);
 
           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
                         0);  // 00 01 02 03
@@ -1103,7 +1098,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
           compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
                           bck_offset, round_offset_vec, round_bits,
-                          use_jnt_comp_avg, &t0);
+                          use_dist_wtd_comp_avg, &t0);
 
           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
                         0);  // 00 01 02 03
@@ -1231,11 +1226,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
           load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
           d_tmp += (dst_stride << 2);
 
-          compute_avg_8x4(
-              res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
-              vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
-              vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
-              round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+          compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+                          vreinterpretq_u16_s16(res1),
+                          vreinterpretq_u16_s16(res2),
+                          vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+                          round_offset64, round_bits, use_dist_wtd_comp_avg,
+                          &t0, &t1, &t2, &t3);
 
           store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
           d_u8 += (dst8_stride << 2);
@@ -1243,11 +1239,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
           load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
           d_tmp += (dst_stride << 2);
 
-          compute_avg_8x4(
-              res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
-              vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
-              vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
-              round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+          compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+                          vreinterpretq_u16_s16(res5),
+                          vreinterpretq_u16_s16(res6),
+                          vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+                          round_offset64, round_bits, use_dist_wtd_comp_avg,
+                          &t0, &t1, &t2, &t3);
 
           store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
           d_u8 += (dst8_stride << 2);
@@ -1319,7 +1316,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
           compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
                           bck_offset, round_offset64, round_bits,
-                          use_jnt_comp_avg, &t0);
+                          use_dist_wtd_comp_avg, &t0);
 
           vst1_u8(d_u8, t0);
           d_u8 += (dst8_stride);
@@ -1342,12 +1339,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
   }
 }
 
-void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
-                             int dst8_stride, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_q4, const int subpel_y_q4,
-                             ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
+                                  uint8_t *dst8, int dst8_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
   assert(!(w % 4));
   assert(!(h % 4));
 
@@ -1363,7 +1360,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   const uint16_t fwd_offset = conv_params->fwd_offset;
   const uint16_t bck_offset = conv_params->bck_offset;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int shift_value = (conv_params->round_1 - 1 - bits);
 
   (void)filter_params_x;
@@ -1489,8 +1486,8 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
           compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
                           vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
                           vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
-                          round_offset64, round_bits, use_jnt_comp_avg, &t0,
-                          &t1);
+                          round_offset64, round_bits, use_dist_wtd_comp_avg,
+                          &t0, &t1);
 
           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
           d_u8 += dst8_stride;
@@ -1535,7 +1532,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
           compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
                           bck_offset, round_offset64, round_bits,
-                          use_jnt_comp_avg, &t0);
+                          use_dist_wtd_comp_avg, &t0);
 
           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
           d_u8 += dst8_stride;
@@ -1654,11 +1651,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
           load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
           d_tmp += (dst_stride << 2);
 
-          compute_avg_8x4(
-              res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
-              vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
-              vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
-              round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+          compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+                          vreinterpretq_u16_s16(res1),
+                          vreinterpretq_u16_s16(res2),
+                          vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+                          round_offset64, round_bits, use_dist_wtd_comp_avg,
+                          &t0, &t1, &t2, &t3);
 
           store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
           d_u8 += (dst8_stride << 2);
@@ -1666,11 +1664,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
           load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
           d_tmp += (dst_stride << 2);
 
-          compute_avg_8x4(
-              res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
-              vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
-              vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
-              round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+          compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+                          vreinterpretq_u16_s16(res5),
+                          vreinterpretq_u16_s16(res6),
+                          vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+                          round_offset64, round_bits, use_dist_wtd_comp_avg,
+                          &t0, &t1, &t2, &t3);
 
           store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
           d_u8 += (dst8_stride << 2);
@@ -1718,7 +1717,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
           compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
                           bck_offset, round_offset64, round_bits,
-                          use_jnt_comp_avg, &t0);
+                          use_dist_wtd_comp_avg, &t0);
 
           vst1_u8(d_u8, t0);
           d_u8 += (dst8_stride);
diff --git a/libaom/av1/common/arm/warp_plane_neon.c b/libaom/av1/common/arm/warp_plane_neon.c
index 7f02d42..1062cc3 100644
--- a/libaom/av1/common/arm/warp_plane_neon.c
+++ b/libaom/av1/common/arm/warp_plane_neon.c
@@ -640,7 +640,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
             uint16x4_t tmp16_lo = vld1_u16(p);
             int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
             int16x4_t tmp16_low;
-            if (conv_params->use_jnt_comp_avg) {
+            if (conv_params->use_dist_wtd_comp_avg) {
               res_lo = vmulq_s32(res_lo, bwd);
               tmp32_lo = vmulq_s32(tmp32_lo, fwd);
               tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
@@ -671,7 +671,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
               uint16x4_t tmp16_hi = vld1_u16(p4);
               int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
               int16x4_t tmp16_high;
-              if (conv_params->use_jnt_comp_avg) {
+              if (conv_params->use_dist_wtd_comp_avg) {
                 res_hi = vmulq_s32(res_hi, bwd);
                 tmp32_hi = vmulq_s32(tmp32_hi, fwd);
                 tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
diff --git a/libaom/av1/common/av1_inv_txfm2d.c b/libaom/av1/common/av1_inv_txfm2d.c
index 4f2d57b..fc9c8d2 100644
--- a/libaom/av1/common/av1_inv_txfm2d.c
+++ b/libaom/av1/common/av1_inv_txfm2d.c
@@ -228,7 +228,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
     (void)real_range_row;
     if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) {
       // the adst4 may use 1 extra bit on top of opt_range_row at stage 1
-      // so opt_range_col >= real_range_col will not hold
+      // so opt_range_row >= real_range_row will not hold
       stage_range_row[i] = opt_range_row;
     } else {
       assert(opt_range_row >= real_range_row);
@@ -241,7 +241,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
         cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1;
     (void)real_range_col;
     if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) {
-      // the adst4 may use 1 extra bit on top of opt_range_row at stage 1
+      // the adst4 may use 1 extra bit on top of opt_range_col at stage 1
       // so opt_range_col >= real_range_col will not hold
       stage_range_col[i] = opt_range_col;
     } else {
diff --git a/libaom/av1/common/av1_loopfilter.c b/libaom/av1/common/av1_loopfilter.c
index c5a86fb..0aa1f9b 100644
--- a/libaom/av1/common/av1_loopfilter.c
+++ b/libaom/av1/common/av1_loopfilter.c
@@ -32,7 +32,7 @@ static const int delta_lf_id_lut[MAX_MB_PLANE][2] = {
   { 0, 1 }, { 2, 2 }, { 3, 3 }
 };
 
-typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;
+enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR);
 
 static const int mode_lf_lut[] = {
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // INTRA_MODES
@@ -1426,9 +1426,9 @@ static void highbd_filter_selectively_horiz(
                                                lfi->hev_thr, lfin->mblim,
                                                lfin->lim, lfin->hev_thr, bd);
           } else {
-            aom_highbd_lpf_horizontal_14_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                                lfi->hev_thr, lfin->mblim,
-                                                lfin->lim, lfin->hev_thr, bd);
+            aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+                                              lfi->hev_thr, lfin->mblim,
+                                              lfin->lim, lfin->hev_thr, bd);
           }
           count = 2;
         } else {
diff --git a/libaom/av1/common/av1_rtcd_defs.pl b/libaom/av1/common/av1_rtcd_defs.pl
index 7049f16..aca5ec7 100755..100644
--- a/libaom/av1/common/av1_rtcd_defs.pl
+++ b/libaom/av1/common/av1_rtcd_defs.pl
@@ -81,8 +81,11 @@ specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
 
 # directional intra predictor functions
 add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
+specialize qw/av1_dr_prediction_z1 avx2/;
 add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
+specialize qw/av1_dr_prediction_z2 avx2/;
 add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
+specialize qw/av1_dr_prediction_z3 avx2/;
 
 # FILTER_INTRA predictor functions
 add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
@@ -108,31 +111,19 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
 
 #inv txfm
 add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
+# TODO(http://crbug.com/aomedia/2350): avx2 is disabled due to test vector
+# mismatches.
+specialize qw/av1_inv_txfm_add ssse3 neon/;
 
 add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
+# TODO(http://crbug.com/aomedia/2350): avx2 is disabled due to test vector
+# mismatches.
+specialize qw/av1_highbd_inv_txfm_add sse4_1/;
 
 add_proto qw/void av1_highbd_inv_txfm_add_4x4/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
 add_proto qw/void av1_highbd_inv_txfm_add_8x8/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_16x8/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_8x16/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_16x16/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_32x32/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_16x32/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_16x32 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_32x16/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_32x16 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_8x32/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_8x32 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_32x8/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_32x8 sse4_1 avx2/;
+specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
 add_proto qw/void av1_highbd_inv_txfm_add_4x8/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1/;
 add_proto qw/void av1_highbd_inv_txfm_add_8x4/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
@@ -173,7 +164,9 @@ add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *out
 add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
 specialize qw/av1_highbd_dr_prediction_z1 avx2/;
 add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
-#specialize qw/av1_highbd_dr_prediction_z2 avx2/;
+# TODO(niva213@gmail.com): Re-enable avx2 after fixing valgrind issue
+# https://crbug.com/aomedia/2316
+# specialize qw/av1_highbd_dr_prediction_z2 avx2/;
 add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
 specialize qw/av1_highbd_dr_prediction_z3 avx2/;
 
@@ -187,6 +180,10 @@ specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
 add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
 specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
 
+# Helper functions.
+add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
+specialize "av1_round_shift_array", qw/sse4_1 neon/;
+
 #
 # Encoder functions below this point.
 #
@@ -221,9 +218,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   specialize qw/av1_fwd_txfm2d_8x4 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x16 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2/;
   add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x8 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2/;
   add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   specialize qw/av1_fwd_txfm2d_16x32 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -239,14 +236,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2/;
   add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2/;
   add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2/;
 
   add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2/;
   add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   specialize qw/av1_fwd_txfm2d_32x64 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -263,17 +260,18 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
 
-  add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-  specialize qw/av1_temporal_filter_apply sse2 msa/;
+  add_proto qw/void av1_apply_temporal_filter/, "const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred, int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1, int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred, int uv_buf_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
+  specialize qw/av1_apply_temporal_filter sse4_1/;
 
   add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
 
   # ENCODEMB INVOKE
 
   add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-  specialize qw/av1_highbd_block_error sse2/;
+  specialize qw/av1_highbd_block_error sse2 avx2/;
 
-  add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+  add_proto qw/void av1_highbd_apply_temporal_filter/, "const uint8_t *yf, int y_stride, const uint8_t *yp, int y_buf_stride, const uint8_t *uf, const uint8_t *vf, int uv_stride, const uint8_t *up, const uint8_t *vp, int uv_buf_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
+  specialize qw/av1_highbd_apply_temporal_filter sse4_1/;
 
   add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
   specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
@@ -347,7 +345,7 @@ specialize qw/av1_highbd_warp_affine sse4_1/;
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/double compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
-  specialize qw/compute_cross_correlation sse4_1/;
+  specialize qw/compute_cross_correlation sse4_1 avx2/;
 }
 
 # LOOP_RESTORATION functions
@@ -366,18 +364,18 @@ add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint
 add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
 add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
 add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
 add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
 add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
 add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
 add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
 
   add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
   add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
@@ -387,19 +385,19 @@ add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int sr
   specialize qw/av1_convolve_x_sr sse2 avx2 neon/;
   specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
   specialize qw/av1_convolve_2d_scale sse4_1/;
-  specialize qw/av1_jnt_convolve_2d sse2 ssse3 avx2 neon/;
-  specialize qw/av1_jnt_convolve_2d_copy sse2 avx2 neon/;
-  specialize qw/av1_jnt_convolve_x sse2 avx2 neon/;
-  specialize qw/av1_jnt_convolve_y sse2 avx2 neon/;
+  specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon/;
+  specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/;
+  specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon/;
+  specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/;
   specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/;
   specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/;
   specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/;
   specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/;
   specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
-  specialize qw/av1_highbd_jnt_convolve_2d sse4_1 avx2/;
-  specialize qw/av1_highbd_jnt_convolve_x sse4_1 avx2/;
-  specialize qw/av1_highbd_jnt_convolve_y sse4_1 avx2/;
-  specialize qw/av1_highbd_jnt_convolve_2d_copy sse4_1 avx2/;
+  specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2/;
+  specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2/;
+  specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2/;
+  specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2/;
 
 # INTRA_EDGE functions
 add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
diff --git a/libaom/av1/common/av1_txfm.c b/libaom/av1/common/av1_txfm.c
index 4fbb756..ac43402 100644
--- a/libaom/av1/common/av1_txfm.c
+++ b/libaom/av1/common/av1_txfm.c
@@ -10,6 +10,7 @@
  */
 
 #include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "av1/common/av1_txfm.h"
 
diff --git a/libaom/av1/common/av1_txfm.h b/libaom/av1/common/av1_txfm.h
index 59d64ca..20049b6 100644
--- a/libaom/av1/common/av1_txfm.h
+++ b/libaom/av1/common/av1_txfm.h
@@ -59,7 +59,9 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
   const int64_t min_value = -(1LL << (bit - 1));
   if (value < min_value || value > max_value) {
     fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit);
+#if !CONFIG_AV1_ENCODER
     assert(0);
+#endif
   }
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
 #if DO_RANGE_CHECK_CLAMP
@@ -110,7 +112,7 @@ typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit,
 typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride,
                               TX_TYPE tx_type, int bd);
 
-typedef enum TXFM_TYPE {
+enum {
   TXFM_TYPE_DCT4,
   TXFM_TYPE_DCT8,
   TXFM_TYPE_DCT16,
@@ -125,7 +127,7 @@ typedef enum TXFM_TYPE {
   TXFM_TYPE_IDENTITY32,
   TXFM_TYPES,
   TXFM_TYPE_INVALID,
-} TXFM_TYPE;
+} UENUM1BYTE(TXFM_TYPE);
 
 typedef struct TXFM_2D_FLIP_CFG {
   TX_SIZE tx_size;
diff --git a/libaom/av1/common/blockd.h b/libaom/av1/common/blockd.h
index d6727b8..91ef3df 100644
--- a/libaom/av1/common/blockd.h
+++ b/libaom/av1/common/blockd.h
@@ -38,19 +38,19 @@ extern "C" {
 #define MAX_DIFFWTD_MASK_BITS 1
 
 // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
-typedef enum ATTRIBUTE_PACKED {
+enum {
   DIFFWTD_38 = 0,
   DIFFWTD_38_INV,
   DIFFWTD_MASK_TYPES,
-} DIFFWTD_MASK_TYPE;
+} UENUM1BYTE(DIFFWTD_MASK_TYPE);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   KEY_FRAME = 0,
   INTER_FRAME = 1,
   INTRA_ONLY_FRAME = 2,  // replaces intra-only
   S_FRAME = 3,
   FRAME_TYPES,
-} FRAME_TYPE;
+} UENUM1BYTE(FRAME_TYPE);
 
 static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) {
   return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
@@ -157,15 +157,15 @@ static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
    is a single probability table. */
 
 typedef struct {
-  // Number of base colors for Y (0) and UV (1)
-  uint8_t palette_size[2];
   // Value of base colors for Y, U, and V
   uint16_t palette_colors[3 * PALETTE_MAX_SIZE];
+  // Number of base colors for Y (0) and UV (1)
+  uint8_t palette_size[2];
 } PALETTE_MODE_INFO;
 
 typedef struct {
-  uint8_t use_filter_intra;
   FILTER_INTRA_MODE filter_intra_mode;
+  uint8_t use_filter_intra;
 } FILTER_INTRA_MODE_INFO;
 
 static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = {
@@ -190,11 +190,6 @@ typedef struct RD_STATS {
   int64_t ref_rdcost;
   int zero_rate;
   uint8_t invalid_rate;
-#if CONFIG_ONE_PASS_SVM
-  int eob, eob_0, eob_1, eob_2, eob_3;
-  int64_t rd, rd_0, rd_1, rd_2, rd_3;
-  int64_t y_sse, sse_0, sse_1, sse_2, sse_3;
-#endif
 #if CONFIG_RD_DEBUG
   int txb_coeff_cost[MAX_MB_PLANE];
   int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
@@ -205,10 +200,10 @@ typedef struct RD_STATS {
 // This struct is used to group function args that are commonly
 // sent together in functions related to interinter compound modes
 typedef struct {
+  uint8_t *seg_mask;
   int wedge_index;
   int wedge_sign;
   DIFFWTD_MASK_TYPE mask_type;
-  uint8_t *seg_mask;
   COMPOUND_TYPE type;
 } INTERINTER_COMPOUND_DATA;
 
@@ -216,48 +211,18 @@ typedef struct {
 #define TXK_TYPE_BUF_LEN 64
 // This structure now relates to 4x4 block regions.
 typedef struct MB_MODE_INFO {
-  // Common for both INTER and INTRA blocks
-  BLOCK_SIZE sb_type;
-  PREDICTION_MODE mode;
-  TX_SIZE tx_size;
-  uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
-  int8_t skip;
-  int8_t skip_mode;
-  int8_t segment_id;
-  int8_t seg_id_predicted;  // valid only when temporal_update is enabled
-
-  // Only for INTRA blocks
-  UV_PREDICTION_MODE uv_mode;
-
   PALETTE_MODE_INFO palette_mode_info;
-  uint8_t use_intrabc;
-
+  WarpedMotionParams wm_params;
+  // interinter members
+  INTERINTER_COMPOUND_DATA interinter_comp;
+  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+  int_mv mv[2];
   // Only for INTER blocks
   InterpFilters interp_filters;
-  MV_REFERENCE_FRAME ref_frame[2];
-
-  TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
-
-  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
-
-  // The actual prediction angle is the base angle + (angle_delta * step).
-  int8_t angle_delta[PLANE_TYPES];
-
-  // interintra members
-  INTERINTRA_MODE interintra_mode;
   // TODO(debargha): Consolidate these flags
-  int use_wedge_interintra;
   int interintra_wedge_index;
   int interintra_wedge_sign;
-  // interinter members
-  INTERINTER_COMPOUND_DATA interinter_comp;
-  MOTION_MODE motion_mode;
   int overlappable_neighbors[2];
-  int_mv mv[2];
-  uint8_t ref_mv_idx;
-  PARTITION_TYPE partition;
-  /* deringing gain *per-superblock* */
-  int8_t cdef_strength;
   int current_qindex;
   int delta_lf_from_base;
   int delta_lf[FRAME_LF_COUNT];
@@ -267,15 +232,43 @@ typedef struct MB_MODE_INFO {
   int mi_col;
 #endif
   int num_proj_ref;
-  WarpedMotionParams wm_params;
 
   // Index of the alpha Cb and alpha Cr combination
   int cfl_alpha_idx;
   // Joint sign of alpha Cb and alpha Cr
   int cfl_alpha_signs;
 
-  int compound_idx;
+  // Indicate if masked compound is used(1) or not(0).
   int comp_group_idx;
+  // If comp_group_idx=0, indicate if dist_wtd_comp(0) or avg_comp(1) is used.
+  int compound_idx;
+#if CONFIG_INSPECTION
+  int16_t tx_skip[TXK_TYPE_BUF_LEN];
+#endif
+  // Common for both INTER and INTRA blocks
+  BLOCK_SIZE sb_type;
+  PREDICTION_MODE mode;
+  // Only for INTRA blocks
+  UV_PREDICTION_MODE uv_mode;
+  // interintra members
+  INTERINTRA_MODE interintra_mode;
+  MOTION_MODE motion_mode;
+  PARTITION_TYPE partition;
+  TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
+  MV_REFERENCE_FRAME ref_frame[2];
+  int8_t use_wedge_interintra;
+  int8_t skip;
+  int8_t skip_mode;
+  uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+  TX_SIZE tx_size;
+  int8_t segment_id;
+  int8_t seg_id_predicted;  // valid only when temporal_update is enabled
+  uint8_t use_intrabc;
+  // The actual prediction angle is the base angle + (angle_delta * step).
+  int8_t angle_delta[PLANE_TYPES];
+  /* deringing gain *per-superblock* */
+  int8_t cdef_strength;
+  uint8_t ref_mv_idx;
 } MB_MODE_INFO;
 
 static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
@@ -375,7 +368,7 @@ static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
 }
 #endif
 
-enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
+enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision);
 
 struct buf_2d {
   uint8_t *buf;
@@ -431,14 +424,6 @@ typedef struct macroblockd_plane {
 #define BLOCK_OFFSET(x, i) \
   ((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0])))
 
-struct RefCntBuffer;
-
-typedef struct RefBuffer {
-  int map_idx;  // frame map idx
-  struct RefCntBuffer *buf;
-  struct scale_factors sf;
-} RefBuffer;
-
 typedef struct {
   DECLARE_ALIGNED(16, InterpKernel, vfilter);
   DECLARE_ALIGNED(16, InterpKernel, hfilter);
@@ -494,11 +479,13 @@ typedef struct cfl_ctx {
   int is_chroma_reference;
 } CFL_CTX;
 
-typedef struct jnt_comp_params {
-  int use_jnt_comp_avg;
+typedef struct dist_wtd_comp_params {
+  int use_dist_wtd_comp_avg;
   int fwd_offset;
   int bck_offset;
-} JNT_COMP_PARAMS;
+} DIST_WTD_COMP_PARAMS;
+
+struct scale_factors;
 
 // Most/all of the pointers are mere pointers to actual arrays are allocated
 // elsewhere. This is mostly for coding convenience.
@@ -526,8 +513,8 @@ typedef struct macroblockd {
   int mb_to_top_edge;
   int mb_to_bottom_edge;
 
-  /* pointers to reference frames */
-  const RefBuffer *block_refs[2];
+  /* pointers to reference frame scale factors */
+  const struct scale_factors *block_ref_scale_factors[2];
 
   /* pointer to current frame */
   const YV12_BUFFER_CONFIG *cur_buf;
@@ -596,7 +583,7 @@ typedef struct macroblockd {
   uint8_t *mc_buf[2];
   CFL_CTX cfl;
 
-  JNT_COMP_PARAMS jcp_param;
+  DIST_WTD_COMP_PARAMS jcp_param;
 
   uint16_t cb_offset[MAX_MB_PLANE];
   uint16_t txb_offset[MAX_MB_PLANE];
@@ -606,7 +593,7 @@ typedef struct macroblockd {
   uint8_t *tmp_obmc_bufs[2];
 } MACROBLOCKD;
 
-static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
+static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
   return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
 }
 
@@ -781,11 +768,13 @@ static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size,
 
 static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
                                           const MACROBLOCKD *xd,
-                                          TX_SIZE tx_size) {
+                                          TX_SIZE tx_size,
+                                          int is_screen_content_type) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
 
   if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
-      xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+      xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 ||
+      is_screen_content_type)
     return DCT_DCT;
 
   return intra_mode_to_tx_type(mbmi, plane_type);
@@ -1049,7 +1038,8 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
     if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
     assert(!has_second_ref(mbmi));
     if (mbmi->num_proj_ref >= 1 &&
-        (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) {
+        (allow_warped_motion &&
+         !av1_is_scaled(xd->block_ref_scale_factors[0]))) {
       if (xd->cur_frame_force_integer_mv) {
         return OBMC_CAUSAL;
       }
diff --git a/libaom/av1/common/cdef.c b/libaom/av1/common/cdef.c
index 556dede..63f9883 100644
--- a/libaom/av1/common/cdef.c
+++ b/libaom/av1/common/cdef.c
@@ -80,7 +80,6 @@ int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
       if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) {
         dlist[count].by = r >> r_shift;
         dlist[count].bx = c >> c_shift;
-        dlist[count].skip = 0;
         count++;
       }
     }
diff --git a/libaom/av1/common/cdef_block.c b/libaom/av1/common/cdef_block.c
index 845df37..dfd5882 100644
--- a/libaom/av1/common/cdef_block.c
+++ b/libaom/av1/common/cdef_block.c
@@ -232,8 +232,8 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
   }
 
   for (bi = 0; bi < cdef_count; bi++) {
-    int t = dlist[bi].skip ? 0 : pri_strength;
-    int s = dlist[bi].skip ? 0 : sec_strength;
+    int t = pri_strength;
+    int s = sec_strength;
     by = dlist[bi].by;
     bx = dlist[bi].bx;
     if (dst8)
diff --git a/libaom/av1/common/cdef_block.h b/libaom/av1/common/cdef_block.h
index 0e921e0..8321d48 100644
--- a/libaom/av1/common/cdef_block.h
+++ b/libaom/av1/common/cdef_block.h
@@ -38,7 +38,6 @@ DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]);
 typedef struct {
   uint8_t by;
   uint8_t bx;
-  uint8_t skip;
 } cdef_list;
 
 typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16,
diff --git a/libaom/av1/common/cfl.c b/libaom/av1/common/cfl.c
index 99410be..65e18e8 100644
--- a/libaom/av1/common/cfl.c
+++ b/libaom/av1/common/cfl.c
@@ -37,7 +37,7 @@ void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
   assert(pred_plane < CFL_PRED_PLANES);
   assert(width <= CFL_BUF_LINE);
 
-  if (get_bitdepth_data_path_index(xd)) {
+  if (is_cur_buf_hbd(xd)) {
     uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input);
     memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1);
     return;
@@ -69,7 +69,7 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
   assert(pred_plane < CFL_PRED_PLANES);
   assert(width <= CFL_BUF_LINE);
   assert(height <= CFL_BUF_LINE);
-  if (get_bitdepth_data_path_index(xd)) {
+  if (is_cur_buf_hbd(xd)) {
     uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
     cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride,
                          width, height);
@@ -196,7 +196,7 @@ void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
       cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
   assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
          CFL_BUF_SQUARE);
-  if (get_bitdepth_data_path_index(xd)) {
+  if (is_cur_buf_hbd(xd)) {
     uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
     get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3,
                                 xd->bd);
@@ -388,8 +388,7 @@ void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
     assert(!((row & 1) && tx_size_high[tx_size] != 4));
     sub8x8_adjust_offset(cfl, &row, &col);
   }
-  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size,
-            get_bitdepth_data_path_index(xd));
+  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd));
 }
 
 void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
@@ -405,5 +404,5 @@ void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
   const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
   tx_size = get_tx_size(width, height);
   cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
-            get_bitdepth_data_path_index(xd));
+            is_cur_buf_hbd(xd));
 }
diff --git a/libaom/av1/common/convolve.c b/libaom/av1/common/convolve.c
index 8ba3ed4..5a55ece 100644
--- a/libaom/av1/common/convolve.c
+++ b/libaom/av1/common/convolve.c
@@ -238,16 +238,16 @@ void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
   (void)conv_params;
 
   for (int y = 0; y < h; ++y) {
-    memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
+    memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
   }
 }
 
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
-                           int dst8_stride, int w, int h,
-                           const InterpFilterParams *filter_params_x,
-                           const InterpFilterParams *filter_params_y,
-                           const int subpel_x_q4, const int subpel_y_q4,
-                           ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
+                                uint8_t *dst8, int dst8_stride, int w, int h,
+                                const InterpFilterParams *filter_params_x,
+                                const InterpFilterParams *filter_params_y,
+                                const int subpel_x_q4, const int subpel_y_q4,
+                                ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
@@ -290,7 +290,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
       if (conv_params->do_average) {
         int32_t tmp = dst[y * dst_stride + x];
-        if (conv_params->use_jnt_comp_avg) {
+        if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
         } else {
@@ -308,12 +308,12 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
   }
 }
 
-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
-                          int dst8_stride, int w, int h,
-                          const InterpFilterParams *filter_params_x,
-                          const InterpFilterParams *filter_params_y,
-                          const int subpel_x_q4, const int subpel_y_q4,
-                          ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride,
+                               uint8_t *dst8, int dst8_stride, int w, int h,
+                               const InterpFilterParams *filter_params_x,
+                               const InterpFilterParams *filter_params_y,
+                               const int subpel_x_q4, const int subpel_y_q4,
+                               ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -341,7 +341,7 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
 
       if (conv_params->do_average) {
         int32_t tmp = dst[y * dst_stride + x];
-        if (conv_params->use_jnt_comp_avg) {
+        if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
         } else {
@@ -358,12 +358,12 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
   }
 }
 
-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
-                          int dst8_stride, int w, int h,
-                          const InterpFilterParams *filter_params_x,
-                          const InterpFilterParams *filter_params_y,
-                          const int subpel_x_q4, const int subpel_y_q4,
-                          ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride,
+                               uint8_t *dst8, int dst8_stride, int w, int h,
+                               const InterpFilterParams *filter_params_x,
+                               const InterpFilterParams *filter_params_y,
+                               const int subpel_x_q4, const int subpel_y_q4,
+                               ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -391,7 +391,7 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
 
       if (conv_params->do_average) {
         int32_t tmp = dst[y * dst_stride + x];
-        if (conv_params->use_jnt_comp_avg) {
+        if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
         } else {
@@ -408,12 +408,11 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
   }
 }
 
-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst8, int dst8_stride, int w, int h,
-                                const InterpFilterParams *filter_params_x,
-                                const InterpFilterParams *filter_params_y,
-                                const int subpel_x_q4, const int subpel_y_q4,
-                                ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_c(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int bits =
@@ -434,7 +433,7 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
 
       if (conv_params->do_average) {
         int32_t tmp = dst[y * dst_stride + x];
-        if (conv_params->use_jnt_comp_avg) {
+        if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
         } else {
@@ -511,7 +510,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
       if (conv_params->is_compound) {
         if (conv_params->do_average) {
           int32_t tmp = dst16[y * dst16_stride + x];
-          if (conv_params->use_jnt_comp_avg) {
+          if (conv_params->use_dist_wtd_comp_avg) {
             tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
             tmp = tmp >> DIST_PRECISION_BITS;
           } else {
@@ -632,7 +631,7 @@ void av1_highbd_convolve_2d_copy_sr_c(
   (void)bd;
 
   for (int y = 0; y < h; ++y) {
-    memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
+    memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
   }
 }
 
@@ -748,13 +747,11 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
   }
 }
 
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
-                                  uint16_t *dst16, int dst16_stride, int w,
-                                  int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
-                                  ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_2d_c(
+    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
+    int w, int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   int x, y, k;
   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -799,7 +796,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
       if (conv_params->do_average) {
         int32_t tmp = dst[y * dst_stride + x];
-        if (conv_params->use_jnt_comp_avg) {
+        if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
         } else {
@@ -817,13 +814,11 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
   }
 }
 
-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
-                                 uint16_t *dst16, int dst16_stride, int w,
-                                 int h,
-                                 const InterpFilterParams *filter_params_x,
-                                 const InterpFilterParams *filter_params_y,
-                                 const int subpel_x_q4, const int subpel_y_q4,
-                                 ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_x_c(
+    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
+    int w, int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -851,7 +846,7 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
 
       if (conv_params->do_average) {
         int32_t tmp = dst[y * dst_stride + x];
-        if (conv_params->use_jnt_comp_avg) {
+        if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
         } else {
@@ -868,13 +863,11 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
   }
 }
 
-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
-                                 uint16_t *dst16, int dst16_stride, int w,
-                                 int h,
-                                 const InterpFilterParams *filter_params_x,
-                                 const InterpFilterParams *filter_params_y,
-                                 const int subpel_x_q4, const int subpel_y_q4,
-                                 ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_y_c(
+    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
+    int w, int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -902,7 +895,7 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
 
       if (conv_params->do_average) {
         int32_t tmp = dst[y * dst_stride + x];
-        if (conv_params->use_jnt_comp_avg) {
+        if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
         } else {
@@ -919,7 +912,7 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
   }
 }
 
-void av1_highbd_jnt_convolve_2d_copy_c(
+void av1_highbd_dist_wtd_convolve_2d_copy_c(
     const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
     int w, int h, const InterpFilterParams *filter_params_x,
     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -943,7 +936,7 @@ void av1_highbd_jnt_convolve_2d_copy_c(
       res += round_offset;
       if (conv_params->do_average) {
         int32_t tmp = dst[y * dst_stride + x];
-        if (conv_params->use_jnt_comp_avg) {
+        if (conv_params->use_dist_wtd_comp_avg) {
           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
           tmp = tmp >> DIST_PRECISION_BITS;
         } else {
@@ -1019,7 +1012,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
       if (conv_params->is_compound) {
         if (conv_params->do_average) {
           int32_t tmp = dst16[y * dst16_stride + x];
-          if (conv_params->use_jnt_comp_avg) {
+          if (conv_params->use_dist_wtd_comp_avg) {
             tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
             tmp = tmp >> DIST_PRECISION_BITS;
           } else {
diff --git a/libaom/av1/common/convolve.h b/libaom/av1/common/convolve.h
index d0972db..e5479e6 100644
--- a/libaom/av1/common/convolve.h
+++ b/libaom/av1/common/convolve.h
@@ -26,7 +26,7 @@ typedef struct ConvolveParams {
   int round_1;
   int plane;
   int is_compound;
-  int use_jnt_comp_avg;
+  int use_dist_wtd_comp_avg;
   int fwd_offset;
   int bck_offset;
 } ConvolveParams;
diff --git a/libaom/av1/common/debugmodes.c b/libaom/av1/common/debugmodes.c
index 5242f19..b26c7dd 100644
--- a/libaom/av1/common/debugmodes.c
+++ b/libaom/av1/common/debugmodes.c
@@ -40,7 +40,7 @@ static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
       mi++;
     }
     fprintf(file, "\n");
-    mi += MAX_MIB_SIZE;
+    mi += cm->mi_stride - cols;
   }
   fprintf(file, "\n");
 }
@@ -68,7 +68,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
       mi++;
     }
     fprintf(mvs, "\n");
-    mi += MAX_MIB_SIZE;
+    mi += cm->mi_stride - cols;
   }
   fprintf(mvs, "\n");
 
@@ -82,7 +82,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
       mi++;
     }
     fprintf(mvs, "\n");
-    mi += MAX_MIB_SIZE;
+    mi += cm->mi_stride - cols;
   }
   fprintf(mvs, "\n");
 
diff --git a/libaom/av1/common/entropy.c b/libaom/av1/common/entropy.c
index 4f95ef6..f63ac98 100644
--- a/libaom/av1/common/entropy.c
+++ b/libaom/av1/common/entropy.c
@@ -101,7 +101,7 @@ void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) {
   RESET_CDF_COUNTER(fc->refmv_cdf, 2);
   RESET_CDF_COUNTER(fc->drl_cdf, 2);
   RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
-  RESET_CDF_COUNTER(fc->compound_type_cdf, COMPOUND_TYPES - 1);
+  RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES);
   RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16);
   RESET_CDF_COUNTER(fc->interintra_cdf, 2);
   RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2);
diff --git a/libaom/av1/common/entropy.h b/libaom/av1/common/entropy.h
index 991692c..41218d3 100644
--- a/libaom/av1/common/entropy.h
+++ b/libaom/av1/common/entropy.h
@@ -54,12 +54,12 @@ extern "C" {
 
 #define BASE_CONTEXT_POSITION_NUM 12
 
-typedef enum TX_CLASS {
+enum {
   TX_CLASS_2D = 0,
   TX_CLASS_HORIZ = 1,
   TX_CLASS_VERT = 2,
   TX_CLASSES = 3,
-} TX_CLASS;
+} UENUM1BYTE(TX_CLASS);
 
 #define DCT_MAX_VALUE 16384
 #define DCT_MAX_VALUE_HIGH10 65536
diff --git a/libaom/av1/common/entropymode.c b/libaom/av1/common/entropymode.c
index 51bbea7..90702ac 100644
--- a/libaom/av1/common/entropymode.c
+++ b/libaom/av1/common/entropymode.c
@@ -488,17 +488,17 @@ static const aom_cdf_prob
       { AOM_CDF2(16384) }
     };
 
-static const aom_cdf_prob
-    default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)] = {
-      { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
-      { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) },
-      { AOM_CDF2(9770) },  { AOM_CDF2(9100) },  { AOM_CDF2(8233) },
-      { AOM_CDF2(6172) },  { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
-      { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
-      { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
-      { AOM_CDF2(11820) }, { AOM_CDF2(7701) },  { AOM_CDF2(16384) },
-      { AOM_CDF2(16384) }
-    };
+static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
+    MASKED_COMPOUND_TYPES)] = {
+  { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+  { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) },
+  { AOM_CDF2(9770) },  { AOM_CDF2(9100) },  { AOM_CDF2(8233) },
+  { AOM_CDF2(6172) },  { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+  { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+  { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+  { AOM_CDF2(11820) }, { AOM_CDF2(7701) },  { AOM_CDF2(16384) },
+  { AOM_CDF2(16384) }
+};
 
 static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)] =
     { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
@@ -1072,9 +1072,9 @@ void av1_setup_frame_contexts(AV1_COMMON *cm) {
   // TODO(jack.haughton@argondesign.com): don't think this should be necessary,
   // but could do with fuller testing
   if (cm->large_scale_tile) {
-    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-      if (cm->current_frame.frame_refs[i].buf != NULL)
-        cm->current_frame.frame_refs[i].buf->frame_context = *cm->fc;
+    for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+      RefCntBuffer *const buf = get_ref_frame_buf(cm, i);
+      if (buf != NULL) buf->frame_context = *cm->fc;
     }
     for (int i = 0; i < FRAME_BUFFERS; ++i)
       cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc;
@@ -1086,10 +1086,8 @@ void av1_setup_past_independence(AV1_COMMON *cm) {
   // Features disabled, 0, with delta coding (Default state).
   av1_clearall_segfeatures(&cm->seg);
 
-  cm->current_frame_seg_map = cm->cur_frame->seg_map;
-
-  if (cm->current_frame_seg_map)
-    memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+  if (cm->cur_frame->seg_map)
+    memset(cm->cur_frame->seg_map, 0, (cm->mi_rows * cm->mi_cols));
 
   // reset mode ref deltas
   av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
@@ -1099,7 +1097,6 @@ void av1_setup_past_independence(AV1_COMMON *cm) {
   av1_default_coef_probs(cm);
   init_mode_probs(cm->fc);
   av1_init_mv_probs(cm);
-  av1_init_lv_map(cm);
   cm->fc->initialized = 1;
   av1_setup_frame_contexts(cm);
 
diff --git a/libaom/av1/common/entropymode.h b/libaom/av1/common/entropymode.h
index 7047f34..69b5218 100644
--- a/libaom/av1/common/entropymode.h
+++ b/libaom/av1/common/entropymode.h
@@ -92,7 +92,8 @@ typedef struct frame_contexts {
 
   aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS]
                                       [CDF_SIZE(INTER_COMPOUND_MODES)];
-  aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)];
+  aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL]
+                                [CDF_SIZE(MASKED_COMPOUND_TYPES)];
   aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)];
   aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)];
   aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)];
diff --git a/libaom/av1/common/entropymv.h b/libaom/av1/common/entropymv.h
index fa818a2..cddc807 100644
--- a/libaom/av1/common/entropymv.h
+++ b/libaom/av1/common/entropymv.h
@@ -30,12 +30,12 @@ void av1_init_mv_probs(struct AV1Common *cm);
 
 /* Symbols for coding which components are zero jointly */
 #define MV_JOINTS 4
-typedef enum {
+enum {
   MV_JOINT_ZERO = 0,   /* Zero vector */
   MV_JOINT_HNZVZ = 1,  /* Vert zero, hor nonzero */
   MV_JOINT_HZVNZ = 2,  /* Hor zero, vert nonzero */
   MV_JOINT_HNZVNZ = 3, /* Both components nonzero */
-} MV_JOINT_TYPE;
+} UENUM1BYTE(MV_JOINT_TYPE);
 
 static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) {
   return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ;
@@ -47,7 +47,7 @@ static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
 
 /* Symbols for coding magnitude class of nonzero components */
 #define MV_CLASSES 11
-typedef enum {
+enum {
   MV_CLASS_0 = 0,   /* (0, 2]     integer pel */
   MV_CLASS_1 = 1,   /* (2, 4]     integer pel */
   MV_CLASS_2 = 2,   /* (4, 8]     integer pel */
@@ -59,7 +59,7 @@ typedef enum {
   MV_CLASS_8 = 8,   /* (256, 512] integer pel */
   MV_CLASS_9 = 9,   /* (512, 1024] integer pel */
   MV_CLASS_10 = 10, /* (1024,2048] integer pel */
-} MV_CLASS_TYPE;
+} UENUM1BYTE(MV_CLASS_TYPE);
 
 #define CLASS0_BITS 1 /* bits at integer precision for class 0 */
 #define CLASS0_SIZE (1 << CLASS0_BITS)
@@ -91,11 +91,11 @@ typedef struct {
   nmv_component comps[2];
 } nmv_context;
 
-typedef enum {
+enum {
   MV_SUBPEL_NONE = -1,
   MV_SUBPEL_LOW_PRECISION = 0,
   MV_SUBPEL_HIGH_PRECISION,
-} MvSubpelPrecision;
+} SENUM1BYTE(MvSubpelPrecision);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/common/enums.h b/libaom/av1/common/enums.h
index eb17c58..fbacc89 100644
--- a/libaom/av1/common/enums.h
+++ b/libaom/av1/common/enums.h
@@ -16,6 +16,7 @@
 
 #include "aom/aom_codec.h"
 #include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -84,21 +85,12 @@ extern "C" {
 // Profile 2.  8-bit and 10-bit 4:2:2
 //            12-bit  4:0:0, 4:2:2 and 4:4:4
 // Since we have three bits for the profiles, it can be extended later.
-typedef enum BITSTREAM_PROFILE {
+enum {
   PROFILE_0,
   PROFILE_1,
   PROFILE_2,
   MAX_PROFILES,
-} BITSTREAM_PROFILE;
-
-#define LEVEL_MAJOR_BITS 3
-#define LEVEL_MINOR_BITS 2
-#define LEVEL_BITS (LEVEL_MAJOR_BITS + LEVEL_MINOR_BITS)
-
-#define LEVEL_MAJOR_MIN 2
-#define LEVEL_MAJOR_MAX ((1 << LEVEL_MAJOR_BITS) - 1 + LEVEL_MAJOR_MIN)
-#define LEVEL_MINOR_MIN 0
-#define LEVEL_MINOR_MAX ((1 << LEVEL_MINOR_BITS) - 1)
+} SENUM1BYTE(BITSTREAM_PROFILE);
 
 #define OP_POINTS_CNT_MINUS_1_BITS 5
 #define OP_POINTS_IDC_BITS 12
@@ -138,7 +130,7 @@ typedef enum ATTRIBUTE_PACKED {
 // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
 #define SQR_BLOCK_SIZES 6
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   PARTITION_NONE,
   PARTITION_HORZ,
   PARTITION_VERT,
@@ -152,7 +144,7 @@ typedef enum ATTRIBUTE_PACKED {
   EXT_PARTITION_TYPES,
   PARTITION_TYPES = PARTITION_SPLIT + 1,
   PARTITION_INVALID = 255
-} PARTITION_TYPE;
+} UENUM1BYTE(PARTITION_TYPE);
 
 typedef char PARTITION_CONTEXT;
 #define PARTITION_PLOFFSET 4  // number of probability models per block size
@@ -160,12 +152,7 @@ typedef char PARTITION_CONTEXT;
 #define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
 
 // block transform size
-#if defined(_MSC_VER)
-typedef uint8_t TX_SIZE;
-enum ATTRIBUTE_PACKED {
-#else
-typedef enum ATTRIBUTE_PACKED {
-#endif
+enum {
   TX_4X4,             // 4x4 transform
   TX_8X8,             // 8x8 transform
   TX_16X16,           // 16x16 transform
@@ -189,11 +176,7 @@ typedef enum ATTRIBUTE_PACKED {
   TX_SIZES = TX_4X8,  // Does NOT include rectangular transforms
   TX_SIZES_LARGEST = TX_64X64,
   TX_INVALID = 255  // Invalid transform size
-#if defined(_MSC_VER)
-};
-#else
-} TX_SIZE;
-#endif
+} UENUM1BYTE(TX_SIZE);
 
 #define TX_SIZE_LUMA_MIN (TX_4X4)
 /* We don't need to code a transform size unless the allowed size is at least
@@ -215,7 +198,7 @@ typedef enum ATTRIBUTE_PACKED {
 #define TX_PAD_HOR 4
 // Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability
 // check.
-#define TX_PAD_TOP 2
+#define TX_PAD_TOP 0
 #define TX_PAD_BOTTOM 4
 #define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
 // Pad 16 extra bytes to avoid reading overflow in SIMD optimization.
@@ -227,23 +210,23 @@ typedef enum ATTRIBUTE_PACKED {
 #define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
 
 // frame transform mode
-typedef enum ATTRIBUTE_PACKED {
+enum {
   ONLY_4X4,         // use only 4x4 transform
   TX_MODE_LARGEST,  // transform size is the largest possible for pu size
   TX_MODE_SELECT,   // transform specified for each block
   TX_MODES,
-} TX_MODE;
+} UENUM1BYTE(TX_MODE);
 
 // 1D tx types
-typedef enum ATTRIBUTE_PACKED {
+enum {
   DCT_1D,
   ADST_1D,
   FLIPADST_1D,
   IDTX_1D,
   TX_TYPES_1D,
-} TX_TYPE_1D;
+} UENUM1BYTE(TX_TYPE_1D);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   DCT_DCT,            // DCT in both horizontal and vertical
   ADST_DCT,           // ADST in vertical, DCT in horizontal
   DCT_ADST,           // DCT in vertical, ADST in horizontal
@@ -261,9 +244,9 @@ typedef enum ATTRIBUTE_PACKED {
   V_FLIPADST,         // FLIPADST in vertical, identity in horizontal
   H_FLIPADST,         // Identity in vertical, FLIPADST in horizontal
   TX_TYPES,
-} TX_TYPE;
+} UENUM1BYTE(TX_TYPE);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   REG_REG,
   REG_SMOOTH,
   REG_SHARP,
@@ -273,9 +256,9 @@ typedef enum ATTRIBUTE_PACKED {
   SHARP_REG,
   SHARP_SMOOTH,
   SHARP_SHARP,
-} DUAL_FILTER_TYPE;
+} UENUM1BYTE(DUAL_FILTER_TYPE);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   // DCT only
   EXT_TX_SET_DCTONLY,
   // DCT + Identity only
@@ -289,7 +272,7 @@ typedef enum ATTRIBUTE_PACKED {
   // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
   EXT_TX_SET_ALL16,
   EXT_TX_SET_TYPES
-} TxSetType;
+} UENUM1BYTE(TxSetType);
 
 #define IS_2D_TRANSFORM(tx_type) (tx_type < IDTX)
 
@@ -297,7 +280,7 @@ typedef enum ATTRIBUTE_PACKED {
 #define EXT_TX_SETS_INTER 4  // Sets of transform selections for INTER
 #define EXT_TX_SETS_INTRA 3  // Sets of transform selections for INTRA
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   AOM_LAST_FLAG = 1 << 0,
   AOM_LAST2_FLAG = 1 << 1,
   AOM_LAST3_FLAG = 1 << 2,
@@ -306,19 +289,15 @@ typedef enum ATTRIBUTE_PACKED {
   AOM_ALT2_FLAG = 1 << 5,
   AOM_ALT_FLAG = 1 << 6,
   AOM_REFFRAME_ALL = (1 << 7) - 1
-} AOM_REFFRAME;
+} UENUM1BYTE(AOM_REFFRAME);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   UNIDIR_COMP_REFERENCE,
   BIDIR_COMP_REFERENCE,
   COMP_REFERENCE_TYPES,
-} COMP_REFERENCE_TYPE;
+} UENUM1BYTE(COMP_REFERENCE_TYPE);
 
-typedef enum ATTRIBUTE_PACKED {
-  PLANE_TYPE_Y,
-  PLANE_TYPE_UV,
-  PLANE_TYPES
-} PLANE_TYPE;
+enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE);
 
 #define CFL_ALPHABET_SIZE_LOG2 4
 #define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
@@ -326,24 +305,20 @@ typedef enum ATTRIBUTE_PACKED {
 #define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2)
 #define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1))
 
-typedef enum ATTRIBUTE_PACKED {
-  CFL_PRED_U,
-  CFL_PRED_V,
-  CFL_PRED_PLANES
-} CFL_PRED_TYPE;
+enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   CFL_SIGN_ZERO,
   CFL_SIGN_NEG,
   CFL_SIGN_POS,
   CFL_SIGNS
-} CFL_SIGN_TYPE;
+} UENUM1BYTE(CFL_SIGN_TYPE);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   CFL_DISALLOWED,
   CFL_ALLOWED,
   CFL_ALLOWED_TYPES
-} CFL_ALLOWED_TYPE;
+} UENUM1BYTE(CFL_ALLOWED_TYPE);
 
 // CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid
 #define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1)
@@ -360,12 +335,12 @@ typedef enum ATTRIBUTE_PACKED {
 #define CFL_CONTEXT_V(js) \
   (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   PALETTE_MAP,
   COLOR_MAP_TYPES,
-} COLOR_MAP_TYPE;
+} UENUM1BYTE(COLOR_MAP_TYPE);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   TWO_COLORS,
   THREE_COLORS,
   FOUR_COLORS,
@@ -374,9 +349,9 @@ typedef enum ATTRIBUTE_PACKED {
   SEVEN_COLORS,
   EIGHT_COLORS,
   PALETTE_SIZES
-} PALETTE_SIZE;
+} UENUM1BYTE(PALETTE_SIZE);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   PALETTE_COLOR_ONE,
   PALETTE_COLOR_TWO,
   PALETTE_COLOR_THREE,
@@ -386,11 +361,11 @@ typedef enum ATTRIBUTE_PACKED {
   PALETTE_COLOR_SEVEN,
   PALETTE_COLOR_EIGHT,
   PALETTE_COLORS
-} PALETTE_COLOR;
+} UENUM1BYTE(PALETTE_COLOR);
 
 // Note: All directional predictors must be between V_PRED and D67_PRED (both
 // inclusive).
-typedef enum ATTRIBUTE_PACKED {
+enum {
   DC_PRED,        // Average of above and left pixels
   V_PRED,         // Vertical
   H_PRED,         // Horizontal
@@ -431,11 +406,11 @@ typedef enum ATTRIBUTE_PACKED {
   INTER_MODE_END = MB_MODE_COUNT,
   INTRA_MODES = PAETH_PRED + 1,  // PAETH_PRED has to be the last intra mode.
   INTRA_INVALID = MB_MODE_COUNT  // For uv_mode in inter blocks
-} PREDICTION_MODE;
+} UENUM1BYTE(PREDICTION_MODE);
 
 // TODO(ltrudeau) Do we really want to pack this?
 // TODO(ltrudeau) Do we match with PREDICTION_MODE?
-typedef enum ATTRIBUTE_PACKED {
+enum {
   UV_DC_PRED,        // Average of above and left pixels
   UV_V_PRED,         // Vertical
   UV_H_PRED,         // Horizontal
@@ -452,38 +427,71 @@ typedef enum ATTRIBUTE_PACKED {
   UV_CFL_PRED,       // Chroma-from-Luma
   UV_INTRA_MODES,
   UV_MODE_INVALID,  // For uv_mode in inter blocks
-} UV_PREDICTION_MODE;
+} UENUM1BYTE(UV_PREDICTION_MODE);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   SIMPLE_TRANSLATION,
   OBMC_CAUSAL,    // 2-sided OBMC
   WARPED_CAUSAL,  // 2-sided WARPED
   MOTION_MODES
-} MOTION_MODE;
+} UENUM1BYTE(MOTION_MODE);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   II_DC_PRED,
   II_V_PRED,
   II_H_PRED,
   II_SMOOTH_PRED,
   INTERINTRA_MODES
-} INTERINTRA_MODE;
+} UENUM1BYTE(INTERINTRA_MODE);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   COMPOUND_AVERAGE,
+  COMPOUND_DISTWTD,
   COMPOUND_WEDGE,
   COMPOUND_DIFFWTD,
   COMPOUND_TYPES,
-} COMPOUND_TYPE;
+  MASKED_COMPOUND_TYPES = 2,
+} UENUM1BYTE(COMPOUND_TYPE);
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   FILTER_DC_PRED,
   FILTER_V_PRED,
   FILTER_H_PRED,
   FILTER_D157_PRED,
   FILTER_PAETH_PRED,
   FILTER_INTRA_MODES,
-} FILTER_INTRA_MODE;
+} UENUM1BYTE(FILTER_INTRA_MODE);
+
+enum {
+  SEQ_LEVEL_2_0,
+  SEQ_LEVEL_2_1,
+  SEQ_LEVEL_2_2,
+  SEQ_LEVEL_2_3,
+  SEQ_LEVEL_3_0,
+  SEQ_LEVEL_3_1,
+  SEQ_LEVEL_3_2,
+  SEQ_LEVEL_3_3,
+  SEQ_LEVEL_4_0,
+  SEQ_LEVEL_4_1,
+  SEQ_LEVEL_4_2,
+  SEQ_LEVEL_4_3,
+  SEQ_LEVEL_5_0,
+  SEQ_LEVEL_5_1,
+  SEQ_LEVEL_5_2,
+  SEQ_LEVEL_5_3,
+  SEQ_LEVEL_6_0,
+  SEQ_LEVEL_6_1,
+  SEQ_LEVEL_6_2,
+  SEQ_LEVEL_6_3,
+  SEQ_LEVEL_7_0,
+  SEQ_LEVEL_7_1,
+  SEQ_LEVEL_7_2,
+  SEQ_LEVEL_7_3,
+  SEQ_LEVELS,
+  SEQ_LEVEL_MAX = 31
+} UENUM1BYTE(AV1_LEVEL);
+
+#define LEVEL_BITS 5
 
 #define DIRECTIONAL_MODES 8
 #define MAX_ANGLE_DELTA 3
@@ -540,7 +548,7 @@ typedef enum ATTRIBUTE_PACKED {
 typedef uint8_t TXFM_CONTEXT;
 
 // An enum for single reference types (and some derived values).
-enum ATTRIBUTE_PACKED {
+enum {
   NONE_FRAME = -1,
   INTRA_FRAME,
   LAST_FRAME,
@@ -572,14 +580,14 @@ enum ATTRIBUTE_PACKED {
 #define REF_FRAMES_LOG2 3
 
 // REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new
-// frame in cm->new_fb_idx, INTER_REFS_PER_FRAME for scaled references on the
-// encoder in the cpi->scaled_ref_idx array.
+// frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the
+// encoder in the cpi->scaled_ref_buf array.
 #define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME)
 
 #define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
 #define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   LAST_LAST2_FRAMES,      // { LAST_FRAME, LAST2_FRAME }
   LAST_LAST3_FRAMES,      // { LAST_FRAME, LAST3_FRAME }
   LAST_GOLDEN_FRAMES,     // { LAST_FRAME, GOLDEN_FRAME }
@@ -593,7 +601,7 @@ typedef enum ATTRIBUTE_PACKED {
   // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs
   //       that are explicitly signaled.
   UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1,
-} UNIDIR_COMP_REF;
+} UENUM1BYTE(UNIDIR_COMP_REF);
 
 #define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS)
 
@@ -608,14 +616,14 @@ typedef enum ATTRIBUTE_PACKED {
 // NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not defined as an enum.
 typedef int8_t MV_REFERENCE_FRAME;
 
-typedef enum ATTRIBUTE_PACKED {
+enum {
   RESTORE_NONE,
   RESTORE_WIENER,
   RESTORE_SGRPROJ,
   RESTORE_SWITCHABLE,
   RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
   RESTORE_TYPES = 4,
-} RestorationType;
+} UENUM1BYTE(RestorationType);
 
 #define SUPERRES_SCALE_BITS 3
 #define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
diff --git a/libaom/av1/common/filter.h b/libaom/av1/common/filter.h
index d7ef5c9..184f5b2 100644
--- a/libaom/av1/common/filter.h
+++ b/libaom/av1/common/filter.h
@@ -37,12 +37,12 @@ typedef enum ATTRIBUTE_PACKED {
   EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
 } InterpFilter;
 
-typedef enum {
+enum {
   USE_2_TAPS_ORIG = 0,  // This is used in temporal filtering.
   USE_2_TAPS,
   USE_4_TAPS,
   USE_8_TAPS,
-} SUBPEL_SEARCH_TYPE;
+} UENUM1BYTE(SUBPEL_SEARCH_TYPE);
 
 // Pack two InterpFilter's into a uint32_t: since there are at most 10 filters,
 // we can use 16 bits for each and have more than enough space. This reduces
diff --git a/libaom/av1/common/idct.c b/libaom/av1/common/idct.c
index 55925a5..bff438f 100644
--- a/libaom/av1/common/idct.c
+++ b/libaom/av1/common/idct.c
@@ -204,7 +204,7 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
   txfm_param->eob = eob;
   txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id];
   txfm_param->bd = xd->bd;
-  txfm_param->is_hbd = get_bitdepth_data_path_index(xd);
+  txfm_param->is_hbd = is_cur_buf_hbd(xd);
   txfm_param->tx_set_type = av1_get_ext_tx_set_type(
       txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
 }
diff --git a/libaom/av1/common/mv.h b/libaom/av1/common/mv.h
index 5b02251..d097f9e 100644
--- a/libaom/av1/common/mv.h
+++ b/libaom/av1/common/mv.h
@@ -56,13 +56,13 @@ typedef struct mv32 {
 #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
 
 /* clang-format off */
-typedef enum ATTRIBUTE_PACKED {
+enum {
   IDENTITY = 0,      // identity transformation, 0-parameter
   TRANSLATION = 1,   // translational motion 2-parameter
   ROTZOOM = 2,       // simplified affine with rotation + zoom only, 4-parameter
   AFFINE = 3,        // affine, 6-parameter
   TRANS_TYPES,
-} TransformationType;
+} UENUM1BYTE(TransformationType);
 /* clang-format on */
 
 // Number of types used for global motion (must be >= 3 and <= TRANS_TYPES)
@@ -87,18 +87,18 @@ static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
 //  z .  y'  =   m4 m5 m1 *  y
 //       1]      m6 m7 1)    1]
 typedef struct {
-  TransformationType wmtype;
   int32_t wmmat[8];
   int16_t alpha, beta, gamma, delta;
+  TransformationType wmtype;
   int8_t invalid;
 } WarpedMotionParams;
 
 /* clang-format off */
 static const WarpedMotionParams default_warp_params = {
-  IDENTITY,
   { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0,
     0 },
   0, 0, 0, 0,
+  IDENTITY,
   0,
 };
 /* clang-format on */
@@ -263,7 +263,7 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
   return res;
 }
 
-static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) {
+static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) {
   if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] &&
       gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) {
     return ((!gm->wmmat[1] && !gm->wmmat[0]) ? IDENTITY : TRANSLATION);
diff --git a/libaom/av1/common/mvref_common.c b/libaom/av1/common/mvref_common.c
index b3d9c2f..e38891f 100644
--- a/libaom/av1/common/mvref_common.c
+++ b/libaom/av1/common/mvref_common.c
@@ -347,8 +347,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
 
   if (rf[1] == NONE_FRAME) {
     int cur_frame_index = cm->cur_frame->order_hint;
-    const RefCntBuffer *const buf_0 =
-        cm->current_frame.frame_refs[FWD_RF_OFFSET(rf[0])].buf;
+    const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
     int frame0_index = buf_0->order_hint;
     int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info,
                                          cur_frame_index, frame0_index);
@@ -383,14 +382,12 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   } else {
     // Process compound inter mode
     int cur_frame_index = cm->cur_frame->order_hint;
-    const RefCntBuffer *const buf_0 =
-        cm->current_frame.frame_refs[FWD_RF_OFFSET(rf[0])].buf;
+    const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
     int frame0_index = buf_0->order_hint;
 
     int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info,
                                          cur_frame_index, frame0_index);
-    const RefCntBuffer *const buf_1 =
-        cm->current_frame.frame_refs[FWD_RF_OFFSET(rf[1])].buf;
+    const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]);
     int frame1_index = buf_1->order_hint;
     int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info,
                                          cur_frame_index, frame1_index);
@@ -824,7 +821,7 @@ void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   MV_REFERENCE_FRAME rf[2];
   av1_set_ref_frame(rf, ref_frame);
 
-  if (ref_frame < REF_FRAMES) {
+  if (global_mvs != NULL && ref_frame < REF_FRAMES) {
     if (ref_frame != INTRA_FRAME) {
       global_mvs[ref_frame] = gm_get_motion_vector(
           &cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize,
@@ -871,8 +868,7 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
 
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const RefCntBuffer *const buf =
-        cm->current_frame.frame_refs[ref_frame - LAST_FRAME].buf;
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
     if (buf != NULL)
       cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint;
   }
@@ -881,8 +877,7 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
 void av1_setup_frame_sign_bias(AV1_COMMON *cm) {
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const RefCntBuffer *const buf =
-        cm->current_frame.frame_refs[ref_frame - LAST_FRAME].buf;
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
     if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) {
       const int ref_order_hint = buf->order_hint;
       cm->ref_frame_sign_bias[ref_frame] =
@@ -942,13 +937,13 @@ static int motion_field_projection(AV1_COMMON *cm,
   TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
   int ref_offset[REF_FRAMES] = { 0 };
 
-  (void)dir;
-
   const RefCntBuffer *const start_frame_buf =
-      cm->current_frame.frame_refs[FWD_RF_OFFSET(start_frame)].buf;
+      get_ref_frame_buf(cm, start_frame);
   if (start_frame_buf == NULL) return 0;
 
-  if (start_frame_buf->intra_only) return 0;
+  if (start_frame_buf->frame_type == KEY_FRAME ||
+      start_frame_buf->frame_type == INTRA_ONLY_FRAME)
+    return 0;
 
   if (start_frame_buf->mi_rows != cm->mi_rows ||
       start_frame_buf->mi_cols != cm->mi_cols)
@@ -1029,7 +1024,7 @@ void av1_setup_motion_field(AV1_COMMON *cm) {
 
   for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     const int ref_idx = ref_frame - LAST_FRAME;
-    const RefCntBuffer *const buf = cm->current_frame.frame_refs[ref_idx].buf;
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
     int order_hint = 0;
 
     if (buf != NULL) order_hint = buf->order_hint;
@@ -1074,8 +1069,7 @@ void av1_setup_motion_field(AV1_COMMON *cm) {
       ref_stamp >= 0)
     if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp;
 
-  if (ref_stamp >= 0 && ref_buf[LAST2_FRAME - LAST_FRAME] != NULL)
-    if (motion_field_projection(cm, LAST2_FRAME, 2)) --ref_stamp;
+  if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2);
 }
 
 static INLINE void record_samples(MB_MODE_INFO *mbmi, int *pts, int *pts_inref,
@@ -1293,7 +1287,7 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
 
   // Identify the nearest forward and backward references.
   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-    const RefCntBuffer *const buf = cm->current_frame.frame_refs[i].buf;
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
     if (buf == NULL) continue;
 
     const int ref_order_hint = buf->order_hint;
@@ -1328,7 +1322,7 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
     // Identify the second nearest forward reference.
     ref_order_hints[1] = -1;
     for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-      const RefCntBuffer *const buf = cm->current_frame.frame_refs[i].buf;
+      const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
       if (buf == NULL) continue;
 
       const int ref_order_hint = buf->order_hint;
@@ -1352,38 +1346,31 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
 }
 
 typedef struct {
-  int map_idx;   // frame map index
-  int buf_idx;   // frame buffer index
-  int sort_idx;  // index based on the offset to be used for sorting
+  int map_idx;        // frame map index
+  RefCntBuffer *buf;  // frame buffer
+  int sort_idx;       // index based on the offset to be used for sorting
 } REF_FRAME_INFO;
 
+// Compares the sort_idx fields. If they are equal, then compares the map_idx
+// fields to break the tie. This ensures a stable sort.
 static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
   const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a;
   const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b;
 
-  if (info_a->sort_idx < info_b->sort_idx) return -1;
-  if (info_a->sort_idx > info_b->sort_idx) return 1;
-  return (info_a->map_idx < info_b->map_idx)
-             ? -1
-             : ((info_a->map_idx > info_b->map_idx) ? 1 : 0);
+  const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx;
+  if (sort_idx_diff != 0) return sort_idx_diff;
+  return info_a->map_idx - info_b->map_idx;
 }
 
-static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx,
+static void set_ref_frame_info(int *remapped_ref_idx, int frame_idx,
                                REF_FRAME_INFO *ref_info) {
   assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
 
-  const int buf_idx = ref_info->buf_idx;
-
-  cm->current_frame.frame_refs[frame_idx].buf =
-      &cm->buffer_pool->frame_bufs[buf_idx];
-  cm->current_frame.frame_refs[frame_idx].map_idx = ref_info->map_idx;
+  remapped_ref_idx[frame_idx] = ref_info->map_idx;
 }
 
-void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
-                        int gld_map_idx) {
-  BufferPool *const pool = cm->buffer_pool;
-  RefCntBuffer *const frame_bufs = pool->frame_bufs;
-
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+                        int lst_map_idx, int gld_map_idx) {
   int lst_frame_sort_idx = -1;
   int gld_frame_sort_idx = -1;
 
@@ -1402,15 +1389,14 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
     ref_frame_info[i].map_idx = map_idx;
     ref_frame_info[i].sort_idx = -1;
 
-    const int buf_idx = cm->ref_frame_map[map_idx];
-    ref_frame_info[i].buf_idx = buf_idx;
+    RefCntBuffer *const buf = cm->ref_frame_map[map_idx];
+    ref_frame_info[i].buf = buf;
 
-    assert(buf_idx < FRAME_BUFFERS);
-    if (buf_idx < 0) continue;
-    // TODO(zoeliu@google.com): To verify the checking on ref_count.
-    if (frame_bufs[buf_idx].ref_count <= 0) continue;
+    if (buf == NULL) continue;
+    // If this assertion fails, there is a reference leak.
+    assert(buf->ref_count > 0);
 
-    const int offset = (int)frame_bufs[buf_idx].order_hint;
+    const int offset = (int)buf->order_hint;
     ref_frame_info[i].sort_idx =
         (offset == -1) ? -1
                        : cur_frame_sort_idx +
@@ -1461,7 +1447,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
 
   // == ALTREF_FRAME ==
   if (bwd_start_idx <= bwd_end_idx) {
-    set_ref_frame_info(cm, ALTREF_FRAME - LAST_FRAME,
+    set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME,
                        &ref_frame_info[bwd_end_idx]);
     ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1;
     bwd_end_idx--;
@@ -1469,7 +1455,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
 
   // == BWDREF_FRAME ==
   if (bwd_start_idx <= bwd_end_idx) {
-    set_ref_frame_info(cm, BWDREF_FRAME - LAST_FRAME,
+    set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME,
                        &ref_frame_info[bwd_start_idx]);
     ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1;
     bwd_start_idx++;
@@ -1477,7 +1463,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
 
   // == ALTREF2_FRAME ==
   if (bwd_start_idx <= bwd_end_idx) {
-    set_ref_frame_info(cm, ALTREF2_FRAME - LAST_FRAME,
+    set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME,
                        &ref_frame_info[bwd_start_idx]);
     ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1;
   }
@@ -1487,13 +1473,15 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
   for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) {
     // == LAST_FRAME ==
     if (ref_frame_info[i].map_idx == lst_map_idx) {
-      set_ref_frame_info(cm, LAST_FRAME - LAST_FRAME, &ref_frame_info[i]);
+      set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME,
+                         &ref_frame_info[i]);
       ref_flag_list[LAST_FRAME - LAST_FRAME] = 1;
     }
 
     // == GOLDEN_FRAME ==
     if (ref_frame_info[i].map_idx == gld_map_idx) {
-      set_ref_frame_info(cm, GOLDEN_FRAME - LAST_FRAME, &ref_frame_info[i]);
+      set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME,
+                         &ref_frame_info[i]);
       ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1;
     }
   }
@@ -1525,7 +1513,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
     }
     if (fwd_start_idx > fwd_end_idx) break;
 
-    set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+    set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
                        &ref_frame_info[fwd_end_idx]);
     ref_flag_list[ref_frame - LAST_FRAME] = 1;
 
@@ -1536,7 +1524,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
   for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
     const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
     if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
-    set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+    set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
                        &ref_frame_info[fwd_start_idx]);
     ref_flag_list[ref_frame - LAST_FRAME] = 1;
   }
diff --git a/libaom/av1/common/mvref_common.h b/libaom/av1/common/mvref_common.h
index 2dbd12c..0aa9d38 100644
--- a/libaom/av1/common/mvref_common.h
+++ b/libaom/av1/common/mvref_common.h
@@ -70,18 +70,6 @@ static INLINE int_mv get_sub_block_pred_mv(const MB_MODE_INFO *candidate,
   return candidate->mv[which_mv];
 }
 
-// Performs mv sign inversion if indicated by the reference frame combination.
-static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
-                              const MV_REFERENCE_FRAME this_ref_frame,
-                              const int *ref_sign_bias) {
-  int_mv mv = mbmi->mv[ref];
-  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
-    mv.as_mv.row *= -1;
-    mv.as_mv.col *= -1;
-  }
-  return mv;
-}
-
 // Checks that the given mi_row, mi_col and search point
 // are inside the borders of the tile.
 static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
@@ -222,7 +210,8 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm);
 void av1_setup_frame_sign_bias(AV1_COMMON *cm);
 void av1_setup_skip_mode_allowed(AV1_COMMON *cm);
 void av1_setup_motion_field(AV1_COMMON *cm);
-void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, int gld_map_idx);
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+                        int lst_map_idx, int gld_map_idx);
 
 static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
   av1_zero(xd->neighbors_ref_counts);
@@ -255,6 +244,9 @@ void av1_copy_frame_mvs(const AV1_COMMON *const cm,
                         const MB_MODE_INFO *const mi, int mi_row, int mi_col,
                         int x_mis, int y_mis);
 
+// The global_mvs output parameter points to an array of REF_FRAMES elements.
+// The caller may pass a null global_mvs if it does not need the global_mvs
+// output.
 void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                       MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                       uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
diff --git a/libaom/av1/common/onyxc_int.h b/libaom/av1/common/onyxc_int.h
index 117afb6..8117dfc 100644
--- a/libaom/av1/common/onyxc_int.h
+++ b/libaom/av1/common/onyxc_int.h
@@ -79,14 +79,14 @@ extern "C" {
 #define TXCOEFF_TIMER 0
 #define TXCOEFF_COST_TIMER 0
 
-typedef enum {
+enum {
   SINGLE_REFERENCE = 0,
   COMPOUND_REFERENCE = 1,
   REFERENCE_MODE_SELECT = 2,
   REFERENCE_MODES = 3,
-} REFERENCE_MODE;
+} UENUM1BYTE(REFERENCE_MODE);
 
-typedef enum {
+enum {
   /**
    * Frame context updates are disabled
    */
@@ -96,7 +96,7 @@ typedef enum {
    * updates based on entropy/counts in the decoded frame
    */
   REFRESH_FRAME_CONTEXT_BACKWARD,
-} REFRESH_FRAME_CONTEXT_MODE;
+} UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE);
 
 #define MFMV_STACK_SIZE 3
 typedef struct {
@@ -109,24 +109,12 @@ typedef struct {
   MV_REFERENCE_FRAME ref_frame;
 } MV_REF;
 
-// FIXME(jack.haughton@argondesign.com): This enum was originally in
-// encoder/ratectrl.h, and is encoder specific. When we move to C++, this
-// should go back there and BufferPool should be templatized.
-typedef enum {
-  INTER_NORMAL = 0,
-  INTER_LOW = 1,
-  INTER_HIGH = 2,
-  GF_ARF_LOW = 3,
-  GF_ARF_STD = 4,
-  KF_STD = 5,
-  RATE_FACTOR_LEVELS = 6
-} RATE_FACTOR_LEVEL;
 
 typedef struct RefCntBuffer {
   // For a RefCntBuffer, the following are reference-holding variables:
   // - cm->ref_frame_map[]
-  // - cm->new_fb_idx
-  // - cm->scaled_ref_idx[] (encoder only)
+  // - cm->cur_frame
+  // - cm->scaled_ref_buf[] (encoder only)
   // - cm->next_ref_frame_map[] (decoder only)
   // - pbi->output_frame_index[] (decoder only)
   // With that definition, 'ref_count' is the number of reference-holding
@@ -136,8 +124,6 @@ typedef struct RefCntBuffer {
   // - Total 'n' of the variables / array elements above have value 'k' (that
   // is, they are pointing to buffer at index 'k').
   // Then, pool->frame_bufs[k].ref_count = n.
-  // TODO(david.turner@argondesign.com) Check whether this helpful comment is
-  // still correct after we finish restructuring
   int ref_count;
 
   unsigned int order_hint;
@@ -154,14 +140,17 @@ typedef struct RefCntBuffer {
   int height;
   WarpedMotionParams global_motion[REF_FRAMES];
   int showable_frame;  // frame can be used as show existing frame in future
-  int film_grain_params_present;
+  uint8_t film_grain_params_present;
   aom_film_grain_t film_grain_params;
   aom_codec_frame_buffer_t raw_frame_buffer;
   YV12_BUFFER_CONFIG buf;
   hash_table hash_table;
-  uint8_t intra_only;
   FRAME_TYPE frame_type;
 
+  // This is only used in the encoder but needs to be indexed per ref frame
+  // so it's extremely convenient to keep it here.
+  int interp_filter_selected[SWITCHABLE];
+
   // Inter frame reference frame delta for loop filter
   int8_t ref_deltas[REF_FRAMES];
 
@@ -169,7 +158,6 @@ typedef struct RefCntBuffer {
   int8_t mode_deltas[MAX_MODE_LF_DELTAS];
 
   FRAME_CONTEXT frame_context;
-  RATE_FACTOR_LEVEL frame_rf_level;
 } RefCntBuffer;
 
 typedef struct BufferPool {
@@ -195,18 +183,6 @@ typedef struct BufferPool {
 } BufferPool;
 
 typedef struct {
-  int base_ctx_table[2 /*row*/][2 /*col*/][3 /*sig_map*/]
-                    [BASE_CONTEXT_POSITION_NUM + 1];
-} LV_MAP_CTX_TABLE;
-typedef int BASE_CTX_TABLE[2 /*col*/][3 /*sig_map*/]
-                          [BASE_CONTEXT_POSITION_NUM + 1];
-
-typedef struct BitstreamLevel {
-  uint8_t major;
-  uint8_t minor;
-} BitstreamLevel;
-
-typedef struct {
   int cdef_pri_damping;
   int cdef_sec_damping;
   int nb_cdef_strengths;
@@ -230,11 +206,11 @@ typedef struct {
 
 typedef struct {
   int enable_order_hint;           // 0 - disable order hint, and related tools
-  int order_hint_bits_minus_1;
-                                   // jnt_comp, ref_frame_mvs, frame_sign_bias
-                                   // if 0, enable_jnt_comp and
-                                   // enable_ref_frame_mvs must be set zs 0.
-  int enable_jnt_comp;             // 0 - disable joint compound modes
+  int order_hint_bits_minus_1;     // dist_wtd_comp, ref_frame_mvs,
+                                   // frame_sign_bias
+                                   // if 0, enable_dist_wtd_comp and
+                                   // enable_ref_frame_mvs must be set as 0.
+  int enable_dist_wtd_comp;        // 0 - disable dist-wtd compound modes
                                    // 1 - enable it
   int enable_ref_frame_mvs;        // 0 - disable ref frame mvs
                                    // 1 - enable it
@@ -249,7 +225,7 @@ typedef struct SequenceHeader {
   int num_bits_height;
   int max_frame_width;
   int max_frame_height;
-  int frame_id_numbers_present_flag;
+  uint8_t frame_id_numbers_present_flag;
   int frame_id_length;
   int delta_frame_id_length;
   BLOCK_SIZE sb_size;  // Size of the superblock used for this frame
@@ -258,45 +234,44 @@ typedef struct SequenceHeader {
 
   OrderHintInfo order_hint_info;
 
-  int force_screen_content_tools;  // 0 - force off
-                                   // 1 - force on
-                                   // 2 - adaptive
-  int force_integer_mv;            // 0 - Not to force. MV can be in 1/4 or 1/8
-                                   // 1 - force to integer
-                                   // 2 - adaptive
-  int still_picture;               // Video is a single frame still picture
-  int reduced_still_picture_hdr;   // Use reduced header for still picture
-  int enable_filter_intra;         // enables/disables filterintra
-  int enable_intra_edge_filter;    // enables/disables corner/edge/upsampling
-  int enable_interintra_compound;  // enables/disables interintra_compound
-  int enable_masked_compound;      // enables/disables masked compound
-  int enable_dual_filter;          // 0 - disable dual interpolation filter
-                                   // 1 - enable vert/horiz filter selection
-  int enable_warped_motion;        // 0 - disable warped motion for sequence
-                                   // 1 - enable it for the sequence
-  int enable_superres;     // 0 - Disable superres for the sequence, and disable
-                           //     transmitting per-frame superres enabled flag.
-                           // 1 - Enable superres for the sequence, and also
-                           //     enable per-frame flag to denote if superres is
-                           //     enabled for that frame.
-  int enable_cdef;         // To turn on/off CDEF
-  int enable_restoration;  // To turn on/off loop restoration
+  uint8_t force_screen_content_tools;  // 0 - force off
+                                       // 1 - force on
+                                       // 2 - adaptive
+  uint8_t still_picture;               // Video is a single frame still picture
+  uint8_t reduced_still_picture_hdr;   // Use reduced header for still picture
+  uint8_t force_integer_mv;            // 0 - Don't force. MV can use subpel
+                                       // 1 - force to integer
+                                       // 2 - adaptive
+  uint8_t enable_filter_intra;         // enables/disables filterintra
+  uint8_t enable_intra_edge_filter;    // enables/disables edge upsampling
+  uint8_t enable_interintra_compound;  // enables/disables interintra_compound
+  uint8_t enable_masked_compound;      // enables/disables masked compound
+  uint8_t enable_dual_filter;          // 0 - disable dual interpolation filter
+                                       // 1 - enable vert/horz filter selection
+  uint8_t enable_warped_motion;        // 0 - disable warp for the sequence
+                                       // 1 - enable warp for the sequence
+  uint8_t enable_superres;             // 0 - Disable superres for the sequence
+                                       //     and no frame level superres flag
+                                       // 1 - Enable superres for the sequence
+                                       //     enable per-frame superres flag
+  uint8_t enable_cdef;                 // To turn on/off CDEF
+  uint8_t enable_restoration;          // To turn on/off loop restoration
   BITSTREAM_PROFILE profile;
 
   // Operating point info.
   int operating_points_cnt_minus_1;
   int operating_point_idc[MAX_NUM_OPERATING_POINTS];
-  int display_model_info_present_flag;
-  int decoder_model_info_present_flag;
-  BitstreamLevel level[MAX_NUM_OPERATING_POINTS];
+  uint8_t display_model_info_present_flag;
+  uint8_t decoder_model_info_present_flag;
+  AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS];
   uint8_t tier[MAX_NUM_OPERATING_POINTS];  // seq_tier in the spec. One bit: 0
                                            // or 1.
 
   // Color config.
   aom_bit_depth_t bit_depth;  // AOM_BITS_8 in profile 0 or 1,
                               // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
-  int use_highbitdepth;       // If true, we need to use 16bit frame buffers.
-  int monochrome;             // Monochorme video
+  uint8_t use_highbitdepth;   // If true, we need to use 16bit frame buffers.
+  uint8_t monochrome;         // Monochorme video
   aom_color_primaries_t color_primaries;
   aom_transfer_characteristics_t transfer_characteristics;
   aom_matrix_coefficients_t matrix_coefficients;
@@ -304,9 +279,8 @@ typedef struct SequenceHeader {
   int subsampling_x;          // Chroma subsampling for x
   int subsampling_y;          // Chroma subsampling for y
   aom_chroma_sample_position_t chroma_sample_position;
-  int separate_uv_delta_q;
-
-  int film_grain_params_present;
+  uint8_t separate_uv_delta_q;
+  uint8_t film_grain_params_present;
 } SequenceHeader;
 
 typedef struct {
@@ -318,16 +292,13 @@ typedef struct {
 
 typedef struct {
   FRAME_TYPE frame_type;
-  // Flag signaling that the frame is encoded using only INTRA modes.
-  uint8_t intra_only;
   REFERENCE_MODE reference_mode;
 
   unsigned int order_hint;
   unsigned int frame_number;
   SkipModeInfo skip_mode_info;
-  // Each Inter frame can reference INTER_REFS_PER_FRAME buffers. This maps each
-  // (inter) reference frame type to the corresponding reference buffer.
-  RefBuffer frame_refs[INTER_REFS_PER_FRAME];
+  int refresh_frame_flags;  // Which ref frames are overwritten by this frame
+  int frame_refs_short_signaling;
 } CurrentFrame;
 
 typedef struct AV1Common {
@@ -337,8 +308,6 @@ typedef struct AV1Common {
   int height;
   int render_width;
   int render_height;
-  int last_width;
-  int last_height;
   int timing_info_present;
   aom_timing_info_t timing_info;
   int buffer_removal_time_present;
@@ -347,49 +316,59 @@ typedef struct AV1Common {
   aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
   uint32_t frame_presentation_time;
 
-  int largest_tile_id;
-  size_t largest_tile_size;
   int context_update_tile_id;
 
   // Scale of the current frame with respect to itself.
   struct scale_factors sf_identity;
 
-  YV12_BUFFER_CONFIG *frame_to_show;
   RefCntBuffer *prev_frame;
 
   // TODO(hkuang): Combine this with cur_buf in macroblockd.
   RefCntBuffer *cur_frame;
 
-  // For decoder, ref_frame_map[i] maps reference type 'i' to actual index of
-  // the buffer in the buffer pool ‘cm->buffer_pool.frame_bufs’.
+  // For encoder, we have a two-level mapping from reference frame type to the
+  // corresponding buffer in the buffer pool:
+  // * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ...
+  // EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1)
+  // * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to
+  // the reference counted buffer structure RefCntBuffer, taken from the buffer
+  // pool cm->buffer_pool->frame_bufs.
+  //
+  // LAST_FRAME,                        ...,      EXTREF_FRAME
+  //      |                                           |
+  //      v                                           v
+  // remapped_ref_idx[LAST_FRAME - 1],  ...,  remapped_ref_idx[EXTREF_FRAME - 1]
+  //      |                                           |
+  //      v                                           v
+  // ref_frame_map[],                   ...,     ref_frame_map[]
+  //
+  // Note: INTRA_FRAME always refers to the current frame, so there's no need to
+  // have a remapped index for the same.
+  int remapped_ref_idx[REF_FRAMES];
+
+  struct scale_factors ref_scale_factors[REF_FRAMES];
+
+  // For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to
+  // the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
   // For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps
   // remapped reference index 'j' (that is, original reference type 'i') to
-  // actual index of the buffer in the buffer pool ‘cm->buffer_pool.frame_bufs’.
-  int ref_frame_map[REF_FRAMES];
+  // a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
+  RefCntBuffer *ref_frame_map[REF_FRAMES];
 
   // Prepare ref_frame_map for the next frame.
   // Only used in frame parallel decode.
-  int next_ref_frame_map[REF_FRAMES];
-
-  // Index to the 'new' frame (i.e. the frame currently being encoded or
-  // decoded) in the buffer pool 'cm->buffer_pool'.
-  int new_fb_idx;
-
+  RefCntBuffer *next_ref_frame_map[REF_FRAMES];
   FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
 
   int show_frame;
   int showable_frame;  // frame can be used as show existing frame in future
   int show_existing_frame;
-  // Flag for a frame used as a reference - not written to the bitstream
-  int is_reference_frame;
-  int reset_decoder_state;
 
-  uint8_t last_intra_only;
   uint8_t disable_cdf_update;
   int allow_high_precision_mv;
-  int cur_frame_force_integer_mv;  // 0 the default in AOM, 1 only integer
+  uint8_t cur_frame_force_integer_mv;  // 0 the default in AOM, 1 only integer
 
-  int allow_screen_content_tools;
+  uint8_t allow_screen_content_tools;
   int allow_intrabc;
   int allow_warped_motion;
 
@@ -437,6 +416,7 @@ typedef struct AV1Common {
   int qm_v;
   int min_qmlevel;
   int max_qmlevel;
+  int use_quant_b_adapt;
 
   /* We allocate a MB_MODE_INFO struct for each macroblock, together with
      an extra row on top and column on the left to simplify prediction. */
@@ -465,8 +445,6 @@ typedef struct AV1Common {
   int allow_ref_frame_mvs;
 
   uint8_t *last_frame_seg_map;
-  uint8_t *current_frame_seg_map;
-  int seg_map_alloc_size;
 
   InterpFilter interp_filter;
 
@@ -505,17 +483,11 @@ typedef struct AV1Common {
 
   FRAME_CONTEXT *fc;              /* this frame entropy */
   FRAME_CONTEXT *default_frame_context;
-  unsigned int frame_context_idx; /* Context to use/update */
-  int fb_of_context_type[REF_FRAMES];
   int primary_ref_frame;
 
-  aom_bit_depth_t dequant_bit_depth;  // bit_depth of current dequantizer
-
   int error_resilient_mode;
-  int force_primary_ref_none;
 
   int tile_cols, tile_rows;
-  int last_tile_cols, last_tile_rows;
 
   int max_tile_width_sb;
   int min_log2_tile_cols;
@@ -530,6 +502,7 @@ typedef struct AV1Common {
   int tile_col_start_sb[MAX_TILE_COLS + 1];  // valid for 0 <= i <= tile_cols
   int tile_row_start_sb[MAX_TILE_ROWS + 1];  // valid for 0 <= i <= tile_rows
   int tile_width, tile_height;               // In MI units
+  int min_inner_tile_width;                  // min width of non-rightmost tile
 
   unsigned int large_scale_tile;
   unsigned int single_tile_decoding;
@@ -555,8 +528,6 @@ typedef struct AV1Common {
   int current_frame_id;
   int ref_frame_id[REF_FRAMES];
   int valid_for_referencing[REF_FRAMES];
-  int invalid_delta_frame_id_minus_1;
-  LV_MAP_CTX_TABLE coeff_ctx_table;
   TPL_MV_REF *tpl_mvs;
   int tpl_mvs_mem_size;
   // TODO(jingning): This can be combined with sign_bias later.
@@ -564,7 +535,6 @@ typedef struct AV1Common {
 
   int is_annexb;
 
-  int frame_refs_short_signaling;
   int temporal_layer_id;
   int spatial_layer_id;
   unsigned int number_temporal_layers;
@@ -608,9 +578,8 @@ static void unlock_buffer_pool(BufferPool *const pool) {
 
 static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) {
   if (index < 0 || index >= REF_FRAMES) return NULL;
-  if (cm->ref_frame_map[index] < 0) return NULL;
-  assert(cm->ref_frame_map[index] < FRAME_BUFFERS);
-  return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
+  if (cm->ref_frame_map[index] == NULL) return NULL;
+  return &cm->ref_frame_map[index]->buf;
 }
 
 static INLINE int get_free_fb(AV1_COMMON *cm) {
@@ -646,38 +615,83 @@ static INLINE int get_free_fb(AV1_COMMON *cm) {
   return i;
 }
 
-// Modify 'idx_ptr' to reference the buffer at 'new_idx', and update the ref
+static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) {
+  // Release the previously-used frame-buffer
+  if (cm->cur_frame != NULL) {
+    --cm->cur_frame->ref_count;
+    cm->cur_frame = NULL;
+  }
+
+  // Assign a new framebuffer
+  const int new_fb_idx = get_free_fb(cm);
+  if (new_fb_idx == INVALID_IDX) return NULL;
+
+  cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx];
+  cm->cur_frame->buf.buf_8bit_valid = 0;
+  av1_zero(cm->cur_frame->interp_filter_selected);
+  return cm->cur_frame;
+}
+
+// Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref
 // counts accordingly.
-static INLINE void assign_frame_buffer(RefCntBuffer *bufs, int *idx_ptr,
-                                       int new_idx) {
-  const int old_idx = *idx_ptr;
-  if (old_idx >= 0) {
-    assert(bufs[old_idx].ref_count > 0);
-    // One less reference to the buffer at 'old_idx', so decrease ref count.
-    --bufs[old_idx].ref_count;
+static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr,
+                                       RefCntBuffer *rhs_ptr) {
+  RefCntBuffer *const old_ptr = *lhs_ptr;
+  if (old_ptr != NULL) {
+    assert(old_ptr->ref_count > 0);
+    // One less reference to the buffer at 'old_ptr', so decrease ref count.
+    --old_ptr->ref_count;
   }
 
-  *idx_ptr = new_idx;
-  // One more reference to the buffer at 'new_idx', so increase ref count.
-  ++bufs[new_idx].ref_count;
+  *lhs_ptr = rhs_ptr;
+  // One more reference to the buffer at 'rhs_ptr', so increase ref count.
+  ++rhs_ptr->ref_count;
 }
 
 static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
   return cm->current_frame.frame_type == KEY_FRAME ||
-      cm->current_frame.intra_only;
+      cm->current_frame.frame_type == INTRA_ONLY_FRAME;
 }
 
 static INLINE int frame_is_sframe(const AV1_COMMON *cm) {
   return cm->current_frame.frame_type == S_FRAME;
 }
 
-static INLINE RefCntBuffer *get_prev_frame(const AV1_COMMON *const cm) {
-  if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
-      cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) {
-    return NULL;
-  } else {
-    return cm->current_frame.frame_refs[cm->primary_ref_frame].buf;
-  }
+// These functions take a reference frame label between LAST_FRAME and
+// EXTREF_FRAME inclusive.  Note that this is different to the indexing
+// previously used by the frame_refs[] array.
+static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm,
+                                        const MV_REFERENCE_FRAME ref_frame) {
+  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
+             ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
+             : INVALID_IDX;
+}
+
+static INLINE RefCntBuffer *get_ref_frame_buf(
+    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
+  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
+}
+
+// Both const and non-const versions of this function are provided so that it
+// can be used with a const AV1_COMMON if needed.
+static INLINE const struct scale_factors *get_ref_scale_factors_const(
+    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
+  return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
+}
+
+static INLINE struct scale_factors *get_ref_scale_factors(
+    AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
+  return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
+}
+
+static INLINE RefCntBuffer *get_primary_ref_frame_buf(
+    const AV1_COMMON *const cm) {
+  if (cm->primary_ref_frame == PRIMARY_REF_NONE) return NULL;
+  const int map_idx = get_ref_frame_map_idx(cm, cm->primary_ref_frame + 1);
+  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
 }
 
 // Returns 1 if this frame might allow mvs from some reference frame.
@@ -1233,8 +1247,8 @@ static INLINE TX_SIZE get_tx_size(int width, int height) {
   return TX_4X4;
 }
 
-static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
-                                         TXFM_CONTEXT *left_ctx,
+static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx,
+                                         const TXFM_CONTEXT *const left_ctx,
                                          BLOCK_SIZE bsize, TX_SIZE tx_size) {
   const uint8_t txw = tx_size_wide[tx_size];
   const uint8_t txh = tx_size_high[tx_size];
@@ -1358,17 +1372,8 @@ static INLINE int is_coded_lossless(const AV1_COMMON *cm,
   return coded_lossless;
 }
 
-static INLINE int is_valid_seq_level_idx(uint8_t seq_level_idx) {
-  return seq_level_idx < 24 || seq_level_idx == 31;
-}
-
-static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) {
-  assert(bl.major >= LEVEL_MAJOR_MIN && bl.major <= LEVEL_MAJOR_MAX);
-  // Since bl.minor is unsigned a comparison will return a warning:
-  // comparison is always true due to limited range of data type
-  assert(LEVEL_MINOR_MIN == 0);
-  assert(bl.minor <= LEVEL_MINOR_MAX);
-  return ((bl.major - LEVEL_MAJOR_MIN) << LEVEL_MINOR_BITS) + bl.minor;
+static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) {
+  return seq_level_idx < SEQ_LEVELS || seq_level_idx == SEQ_LEVEL_MAX;
 }
 
 #ifdef __cplusplus
diff --git a/libaom/av1/common/pred_common.h b/libaom/av1/common/pred_common.h
index f667057..d9b30a9 100644
--- a/libaom/av1/common/pred_common.h
+++ b/libaom/av1/common/pred_common.h
@@ -48,20 +48,24 @@ static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
   int prev_l = -1;   // left segment_id
   int prev_u = -1;   // top segment_id
   if ((xd->up_available) && (xd->left_available)) {
-    prev_ul = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
-                             mi_row - 1, mi_col - 1);
+    prev_ul = get_segment_id(cm, cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 1,
+                             mi_col - 1);
   }
   if (xd->up_available) {
-    prev_u = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
-                            mi_row - 1, mi_col - 0);
+    prev_u = get_segment_id(cm, cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 1,
+                            mi_col - 0);
   }
   if (xd->left_available) {
-    prev_l = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
-                            mi_row - 0, mi_col - 1);
+    prev_l = get_segment_id(cm, cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 0,
+                            mi_col - 1);
   }
+  // This property follows from the fact that get_segment_id() returns a
+  // nonnegative value. This allows us to test for all edge cases with a simple
+  // prev_ul < 0 check.
+  assert(IMPLIES(prev_ul >= 0, prev_u >= 0 && prev_l >= 0));
 
   // Pick CDF index based on number of matching/out-of-bounds segment IDs.
-  if (prev_ul < 0 || prev_u < 0 || prev_l < 0) /* Edge case */
+  if (prev_ul < 0) /* Edge cases */
     *cdf_index = 0;
   else if ((prev_ul == prev_u) && (prev_ul == prev_l))
     *cdf_index = 2;
@@ -90,10 +94,8 @@ static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
 static INLINE int get_comp_index_context(const AV1_COMMON *cm,
                                          const MACROBLOCKD *xd) {
   MB_MODE_INFO *mbmi = xd->mi[0];
-  const RefCntBuffer *const bck_buf =
-      cm->current_frame.frame_refs[mbmi->ref_frame[0] - LAST_FRAME].buf;
-  const RefCntBuffer *const fwd_buf =
-      cm->current_frame.frame_refs[mbmi->ref_frame[1] - LAST_FRAME].buf;
+  const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
+  const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
   int bck_frame_index = 0, fwd_frame_index = 0;
   int cur_frame_index = cm->cur_frame->order_hint;
 
diff --git a/libaom/av1/common/reconinter.c b/libaom/av1/common/reconinter.c
index f338e1b..ea351cf 100644
--- a/libaom/av1/common/reconinter.c
+++ b/libaom/av1/common/reconinter.c
@@ -84,12 +84,11 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
   if (do_warp && xd->cur_frame_force_integer_mv == 0) {
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     const struct buf_2d *const pre_buf = &pd->pre[ref];
-    av1_warp_plane(&final_warp_params,
-                   xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+    av1_warp_plane(&final_warp_params, is_cur_buf_hbd(xd), xd->bd,
                    pre_buf->buf0, pre_buf->width, pre_buf->height,
                    pre_buf->stride, dst, p_col, p_row, w, h, dst_stride,
                    pd->subsampling_x, pd->subsampling_y, conv_params);
-  } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  } else if (is_cur_buf_hbd(xd)) {
     highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
                            w, h, conv_params, interp_filters, is_intrabc,
                            xd->bd);
@@ -568,14 +567,15 @@ static void build_masked_compound_no_round(
   const int subh = (2 << mi_size_high_log2[sb_type]) == h;
   const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
   const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+  if (is_cur_buf_hbd(xd)) {
     aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
                                   src1_stride, mask, block_size_wide[sb_type],
                                   w, h, subw, subh, conv_params, xd->bd);
-  else
+  } else {
     aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, block_size_wide[sb_type], w,
                                  h, subw, subh, conv_params);
+  }
 }
 
 void av1_make_masked_inter_predictor(
@@ -626,20 +626,20 @@ void av1_make_masked_inter_predictor(
                                  mi->sb_type, h, w, conv_params, xd);
 }
 
-void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
-                                int order_idx, int *fwd_offset, int *bck_offset,
-                                int *use_jnt_comp_avg, int is_compound) {
+void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
+                                     const MB_MODE_INFO *mbmi, int order_idx,
+                                     int *fwd_offset, int *bck_offset,
+                                     int *use_dist_wtd_comp_avg,
+                                     int is_compound) {
   assert(fwd_offset != NULL && bck_offset != NULL);
   if (!is_compound || mbmi->compound_idx) {
-    *use_jnt_comp_avg = 0;
+    *use_dist_wtd_comp_avg = 0;
     return;
   }
 
-  *use_jnt_comp_avg = 1;
-  const RefCntBuffer *const bck_buf =
-      cm->current_frame.frame_refs[mbmi->ref_frame[0] - LAST_FRAME].buf;
-  const RefCntBuffer *const fwd_buf =
-      cm->current_frame.frame_refs[mbmi->ref_frame[1] - LAST_FRAME].buf;
+  *use_dist_wtd_comp_avg = 1;
+  const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
+  const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
   const int cur_frame_index = cm->cur_frame->order_hint;
   int bck_frame_index = 0, fwd_frame_index = 0;
 
@@ -800,53 +800,6 @@ void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
   return;
 }
 
-struct obmc_check_mv_field_ctxt {
-  MB_MODE_INFO *current_mi;
-  int mv_field_check_result;
-};
-
-static INLINE void obmc_check_identical_mv(MACROBLOCKD *xd, int rel_mi_col,
-                                           uint8_t nb_mi_width,
-                                           MB_MODE_INFO *nb_mi, void *fun_ctxt,
-                                           const int num_planes) {
-  (void)xd;
-  (void)rel_mi_col;
-  (void)nb_mi_width;
-  (void)num_planes;
-  struct obmc_check_mv_field_ctxt *ctxt =
-      (struct obmc_check_mv_field_ctxt *)fun_ctxt;
-  const MB_MODE_INFO *current_mi = ctxt->current_mi;
-
-  if (ctxt->mv_field_check_result == 0) return;
-
-  if (nb_mi->ref_frame[0] != current_mi->ref_frame[0] ||
-      nb_mi->mv[0].as_int != current_mi->mv[0].as_int ||
-      nb_mi->interp_filters != current_mi->interp_filters) {
-    ctxt->mv_field_check_result = 0;
-  }
-  return;
-}
-
-// Check if the neighbors' motions used by obmc have same parameters as for
-// the current block. If all the parameters are identical, obmc will produce
-// the same prediction as from regular bmc, therefore we can skip the
-// overlapping operations for less complexity. The parameters checked include
-// reference frame, motion vector, and interpolation filter.
-int av1_check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                      int mi_row, int mi_col) {
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  struct obmc_check_mv_field_ctxt mv_field_check_ctxt = { xd->mi[0], 1 };
-
-  foreach_overlappable_nb_above(cm, xd, mi_col,
-                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
-                                obmc_check_identical_mv, &mv_field_check_ctxt);
-  foreach_overlappable_nb_left(cm, xd, mi_row,
-                               max_neighbor_obmc[mi_size_high_log2[bsize]],
-                               obmc_check_identical_mv, &mv_field_check_ctxt);
-
-  return mv_field_check_ctxt.mv_field_check_result;
-}
-
 struct obmc_inter_pred_ctxt {
   uint8_t **adjacent;
   int *adjacent_stride;
@@ -860,7 +813,7 @@ static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col,
   (void)above_mi;
   struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hbd = is_cur_buf_hbd(xd);
   const int overlap =
       AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
 
@@ -897,7 +850,7 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   const int overlap =
       AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hbd = is_cur_buf_hbd(xd);
 
   for (int plane = 0; plane < num_planes; ++plane) {
     const struct macroblockd_plane *pd = &xd->plane[plane];
@@ -968,15 +921,15 @@ void av1_setup_build_prediction_by_above_pred(
   for (int ref = 0; ref < num_refs; ++ref) {
     const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
 
-    const RefBuffer *const ref_buf =
-        &ctxt->cm->current_frame.frame_refs[frame - LAST_FRAME];
-
-    xd->block_refs[ref] = ref_buf;
-    if ((!av1_is_valid_scale(&ref_buf->sf)))
+    const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+    const struct scale_factors *const sf =
+        get_ref_scale_factors_const(ctxt->cm, frame);
+    xd->block_ref_scale_factors[ref] = sf;
+    if ((!av1_is_valid_scale(sf)))
       aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
-    av1_setup_pre_planes(xd, ref, &ref_buf->buf->buf, ctxt->mi_row,
-                         above_mi_col, &ref_buf->sf, num_planes);
+    av1_setup_pre_planes(xd, ref, &ref_buf->buf, ctxt->mi_row, above_mi_col, sf,
+                         num_planes);
   }
 
   xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
@@ -1006,15 +959,16 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
   for (int ref = 0; ref < num_refs; ++ref) {
     const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
 
-    const RefBuffer *const ref_buf =
-        &ctxt->cm->current_frame.frame_refs[frame - LAST_FRAME];
+    const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+    const struct scale_factors *const ref_scale_factors =
+        get_ref_scale_factors_const(ctxt->cm, frame);
 
-    xd->block_refs[ref] = ref_buf;
-    if ((!av1_is_valid_scale(&ref_buf->sf)))
+    xd->block_ref_scale_factors[ref] = ref_scale_factors;
+    if ((!av1_is_valid_scale(ref_scale_factors)))
       aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
-    av1_setup_pre_planes(xd, ref, &ref_buf->buf->buf, left_mi_row, ctxt->mi_col,
-                         &ref_buf->sf, num_planes);
+    av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, ctxt->mi_col,
+                         ref_scale_factors, num_planes);
   }
 
   xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
@@ -1081,12 +1035,13 @@ static void build_smooth_interintra_mask(uint8_t *mask, int stride,
   }
 }
 
-static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
-                               int wedge_index, int wedge_sign,
-                               BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
-                               uint8_t *comppred, int compstride,
-                               const uint8_t *interpred, int interstride,
-                               const uint8_t *intrapred, int intrastride) {
+static void combine_interintra(INTERINTRA_MODE mode,
+                               int8_t use_wedge_interintra, int wedge_index,
+                               int wedge_sign, BLOCK_SIZE bsize,
+                               BLOCK_SIZE plane_bsize, uint8_t *comppred,
+                               int compstride, const uint8_t *interpred,
+                               int interstride, const uint8_t *intrapred,
+                               int intrastride) {
   const int bw = block_size_wide[plane_bsize];
   const int bh = block_size_high[plane_bsize];
 
@@ -1110,7 +1065,7 @@ static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
 }
 
 static void combine_interintra_highbd(
-    INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index,
+    INTERINTRA_MODE mode, int8_t use_wedge_interintra, int wedge_index,
     int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
     uint8_t *comppred8, int compstride, const uint8_t *interpred8,
     int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
@@ -1140,8 +1095,8 @@ static void combine_interintra_highbd(
 void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
                                                MACROBLOCKD *xd,
                                                BLOCK_SIZE bsize, int plane,
-                                               BUFFER_SET *ctx, uint8_t *dst,
-                                               int dst_stride) {
+                                               const BUFFER_SET *ctx,
+                                               uint8_t *dst, int dst_stride) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int ssx = xd->plane[plane].subsampling_x;
   const int ssy = xd->plane[plane].subsampling_y;
@@ -1164,7 +1119,7 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
   const int ssx = xd->plane[plane].subsampling_x;
   const int ssy = xd->plane[plane].subsampling_y;
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     combine_interintra_highbd(
         xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
         xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
@@ -1183,9 +1138,9 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
 // build interintra_predictors for one plane
 void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          uint8_t *pred, int stride,
-                                         BUFFER_SET *ctx, int plane,
+                                         const BUFFER_SET *ctx, int plane,
                                          BLOCK_SIZE bsize) {
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
     av1_build_intra_predictors_for_interintra(
         cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
@@ -1204,7 +1159,8 @@ void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
 void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                           uint8_t *upred, uint8_t *vpred,
                                           int ustride, int vstride,
-                                          BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+                                          const BUFFER_SET *ctx,
+                                          BLOCK_SIZE bsize) {
   av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
   av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
 }
diff --git a/libaom/av1/common/reconinter.h b/libaom/av1/common/reconinter.h
index b773679..9d562f9 100644
--- a/libaom/av1/common/reconinter.h
+++ b/libaom/av1/common/reconinter.h
@@ -47,7 +47,7 @@ extern "C" {
 #define WEDGE_NONE -1
 
 // Angles are with respect to horizontal anti-clockwise
-typedef enum {
+enum {
   WEDGE_HORIZONTAL = 0,
   WEDGE_VERTICAL = 1,
   WEDGE_OBLIQUE27 = 2,
@@ -55,7 +55,7 @@ typedef enum {
   WEDGE_OBLIQUE117 = 4,
   WEDGE_OBLIQUE153 = 5,
   WEDGE_DIRECTIONS
-} WedgeDirectionType;
+} UENUM1BYTE(WedgeDirectionType);
 
 // 3-tuple: {direction, x_offset, y_offset}
 typedef struct {
@@ -161,14 +161,13 @@ static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
 void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi);
 int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
                                const struct macroblockd_plane *pd, int dir);
-int av1_check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                      int mi_row, int mi_col);
 
 static INLINE int is_interinter_compound_used(COMPOUND_TYPE type,
                                               BLOCK_SIZE sb_type) {
   const int comp_allowed = is_comp_ref_allowed(sb_type);
   switch (type) {
     case COMPOUND_AVERAGE:
+    case COMPOUND_DISTWTD:
     case COMPOUND_DIFFWTD: return comp_allowed;
     case COMPOUND_WEDGE:
       return comp_allowed && wedge_params_lookup[sb_type].bits > 0;
@@ -247,13 +246,14 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
   return clamped_mv;
 }
 
-static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
-                                       const struct scale_factors *sf) {
+static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset,
+                                           int stride,
+                                           const struct scale_factors *sf) {
   const int x =
       sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS : x_offset;
   const int y =
       sf ? sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS : y_offset;
-  return y * stride + x;
+  return (int64_t)y * stride + x;
 }
 
 static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize,
@@ -335,25 +335,28 @@ const uint8_t *av1_get_compound_type_mask(
 // build interintra_predictors for one plane
 void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          uint8_t *pred, int stride,
-                                         BUFFER_SET *ctx, int plane,
+                                         const BUFFER_SET *ctx, int plane,
                                          BLOCK_SIZE bsize);
 
 void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                           uint8_t *upred, uint8_t *vpred,
                                           int ustride, int vstride,
-                                          BUFFER_SET *ctx, BLOCK_SIZE bsize);
+                                          const BUFFER_SET *ctx,
+                                          BLOCK_SIZE bsize);
 
 void av1_build_intra_predictors_for_interintra(
     const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
-    BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
+    const BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
 
 void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                             const uint8_t *inter_pred, int inter_stride,
                             const uint8_t *intra_pred, int intra_stride);
 
-void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
-                                int order_idx, int *fwd_offset, int *bck_offset,
-                                int *use_jnt_comp_avg, int is_compound);
+void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
+                                     const MB_MODE_INFO *mbmi, int order_idx,
+                                     int *fwd_offset, int *bck_offset,
+                                     int *use_dist_wtd_comp_avg,
+                                     int is_compound);
 int av1_allow_warp(const MB_MODE_INFO *const mbmi,
                    const WarpTypesAllowed *const warp_types,
                    const WarpedMotionParams *const gm_params,
diff --git a/libaom/av1/common/reconintra.c b/libaom/av1/common/reconintra.c
index df69d6b..559e499 100644
--- a/libaom/av1/common/reconintra.c
+++ b/libaom/av1/common/reconintra.c
@@ -1510,7 +1510,7 @@ void av1_predict_intra_block(
                                xd->color_index_map_offset[plane != 0];
     const uint16_t *const palette =
         mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE;
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
       uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
       for (r = 0; r < txhpx; ++r) {
         for (c = 0; c < txwpx; ++c) {
@@ -1569,7 +1569,7 @@ void av1_predict_intra_block(
       tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y);
 
   const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     build_intra_predictors_high(
         xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
         filter_intra_mode, tx_size, disable_edge_filter,
diff --git a/libaom/av1/common/resize.c b/libaom/av1/common/resize.c
index d668eae..8b24ed0 100644
--- a/libaom/av1/common/resize.c
+++ b/libaom/av1/common/resize.c
@@ -431,6 +431,7 @@ static int32_t get_upscale_convolve_x0(int in_length, int out_length,
   return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
 }
 
+#ifndef __clang_analyzer__
 static void down2_symeven(const uint8_t *const input, int length,
                           uint8_t *output) {
   // Actual filter len = 2 * filter_len_half.
@@ -485,6 +486,7 @@ static void down2_symeven(const uint8_t *const input, int length,
     }
   }
 }
+#endif
 
 static void down2_symodd(const uint8_t *const input, int length,
                          uint8_t *output) {
@@ -850,6 +852,7 @@ static void highbd_interpolate(const uint16_t *const input, int in_length,
                           &interp_filters[0][0], SUBPEL_TAPS);
 }
 
+#ifndef __clang_analyzer__
 static void highbd_down2_symeven(const uint16_t *const input, int length,
                                  uint16_t *output, int bd) {
   // Actual filter len = 2 * filter_len_half.
@@ -957,6 +960,7 @@ static void highbd_down2_symodd(const uint16_t *const input, int length,
     }
   }
 }
+#endif
 
 static void highbd_resize_multistep(const uint16_t *const input, int length,
                                     uint16_t *output, int olength,
diff --git a/libaom/av1/common/restoration.c b/libaom/av1/common/restoration.c
index c62862b..9e472b8 100644
--- a/libaom/av1/common/restoration.c
+++ b/libaom/av1/common/restoration.c
@@ -1099,7 +1099,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
   const int frame_height = frame->crop_heights[0];
   if (aom_realloc_frame_buffer(
           lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
-          seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS,
+          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
           cm->byte_alignment, NULL, NULL, NULL) < 0)
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate restoration dst buffer");
diff --git a/libaom/av1/common/restoration.h b/libaom/av1/common/restoration.h
index d834f92..6d6ba37 100644
--- a/libaom/av1/common/restoration.h
+++ b/libaom/av1/common/restoration.h
@@ -22,6 +22,8 @@
 extern "C" {
 #endif
 
+// Border for Loop restoration buffer
+#define AOM_RESTORATION_FRAME_BORDER 32
 #define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
 #define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
 
diff --git a/libaom/av1/common/scale.c b/libaom/av1/common/scale.c
index c525fe2..bac7bd9 100644
--- a/libaom/av1/common/scale.c
+++ b/libaom/av1/common/scale.c
@@ -97,13 +97,13 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
   // subpel_x_q4 != 0 && subpel_y_q4 != 0
   sf->convolve[1][1][0] = av1_convolve_2d_sr;
   // subpel_x_q4 == 0 && subpel_y_q4 == 0
-  sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
+  sf->convolve[0][0][1] = av1_dist_wtd_convolve_2d_copy;
   // subpel_x_q4 == 0
-  sf->convolve[0][1][1] = av1_jnt_convolve_y;
+  sf->convolve[0][1][1] = av1_dist_wtd_convolve_y;
   // subpel_y_q4 == 0
-  sf->convolve[1][0][1] = av1_jnt_convolve_x;
+  sf->convolve[1][0][1] = av1_dist_wtd_convolve_x;
   // subpel_x_q4 != 0 && subpel_y_q4 != 0
-  sf->convolve[1][1][1] = av1_jnt_convolve_2d;
+  sf->convolve[1][1][1] = av1_dist_wtd_convolve_2d;
   // AV1 High BD convolve functions
   // Special case convolve functions should produce the same result as
   // av1_highbd_convolve_2d.
@@ -116,11 +116,11 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
   // subpel_x_q4 != 0 && subpel_y_q4 != 0
   sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr;
   // subpel_x_q4 == 0 && subpel_y_q4 == 0
-  sf->highbd_convolve[0][0][1] = av1_highbd_jnt_convolve_2d_copy;
+  sf->highbd_convolve[0][0][1] = av1_highbd_dist_wtd_convolve_2d_copy;
   // subpel_x_q4 == 0
-  sf->highbd_convolve[0][1][1] = av1_highbd_jnt_convolve_y;
+  sf->highbd_convolve[0][1][1] = av1_highbd_dist_wtd_convolve_y;
   // subpel_y_q4 == 0
-  sf->highbd_convolve[1][0][1] = av1_highbd_jnt_convolve_x;
+  sf->highbd_convolve[1][0][1] = av1_highbd_dist_wtd_convolve_x;
   // subpel_x_q4 != 0 && subpel_y_q4 != 0
-  sf->highbd_convolve[1][1][1] = av1_highbd_jnt_convolve_2d;
+  sf->highbd_convolve[1][1][1] = av1_highbd_dist_wtd_convolve_2d;
 }
diff --git a/libaom/av1/common/scan.h b/libaom/av1/common/scan.h
index 233dc0e..f9c3392 100644
--- a/libaom/av1/common/scan.h
+++ b/libaom/av1/common/scan.h
@@ -25,14 +25,14 @@ extern "C" {
 
 #define MAX_NEIGHBORS 2
 
-typedef enum SCAN_MODE {
+enum {
   SCAN_MODE_ZIG_ZAG,
   SCAN_MODE_COL_DIAG,
   SCAN_MODE_ROW_DIAG,
   SCAN_MODE_COL_1D,
   SCAN_MODE_ROW_1D,
   SCAN_MODES
-} SCAN_MODE;
+} UENUM1BYTE(SCAN_MODE);
 
 extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
 extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
diff --git a/libaom/av1/common/seg_common.h b/libaom/av1/common/seg_common.h
index 8c35bba..fa7894c 100644
--- a/libaom/av1/common/seg_common.h
+++ b/libaom/av1/common/seg_common.h
@@ -24,7 +24,7 @@ extern "C" {
 #define SEG_TEMPORAL_PRED_CTXS 3
 #define SPATIAL_PREDICTION_PROBS 3
 
-typedef enum {
+enum {
   SEG_LVL_ALT_Q,       // Use alternate Quantizer ....
   SEG_LVL_ALT_LF_Y_V,  // Use alternate loop filter value on y plane vertical
   SEG_LVL_ALT_LF_Y_H,  // Use alternate loop filter value on y plane horizontal
@@ -34,7 +34,7 @@ typedef enum {
   SEG_LVL_SKIP,        // Optional Segment (0,0) + skip mode
   SEG_LVL_GLOBALMV,
   SEG_LVL_MAX
-} SEG_LVL_FEATURES;
+} UENUM1BYTE(SEG_LVL_FEATURES);
 
 struct segmentation {
   uint8_t enabled;
diff --git a/libaom/av1/common/tile_common.c b/libaom/av1/common/tile_common.c
index 1b41348..02f50f5 100644
--- a/libaom/av1/common/tile_common.c
+++ b/libaom/av1/common/tile_common.c
@@ -51,6 +51,10 @@ void av1_calculate_tile_cols(AV1_COMMON *const cm) {
   int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
   int i;
 
+  // This will be overridden if there is at least two columns of tiles
+  // (otherwise there is no inner tile width)
+  cm->min_inner_tile_width = -1;
+
   if (cm->uniform_tile_spacing_flag) {
     int start_sb;
     int size_sb = ALIGN_POWER_OF_TWO(sb_cols, cm->log2_tile_cols);
@@ -67,18 +71,29 @@ void av1_calculate_tile_cols(AV1_COMMON *const cm) {
 
     cm->tile_width = size_sb << cm->seq_params.mib_size_log2;
     cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
+    if (cm->tile_cols > 1) {
+      cm->min_inner_tile_width = cm->tile_width;
+    }
   } else {
     int max_tile_area_sb = (sb_rows * sb_cols);
     int widest_tile_sb = 1;
+    int narrowest_inner_tile_sb = 65536;
     cm->log2_tile_cols = tile_log2(1, cm->tile_cols);
     for (i = 0; i < cm->tile_cols; i++) {
       int size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
       widest_tile_sb = AOMMAX(widest_tile_sb, size_sb);
+      // ignore the rightmost tile in frame for determining the narrowest
+      if (i < cm->tile_cols - 1)
+        narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb);
     }
     if (cm->min_log2_tiles) {
       max_tile_area_sb >>= (cm->min_log2_tiles + 1);
     }
     cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
+    if (cm->tile_cols > 1) {
+      cm->min_inner_tile_width = narrowest_inner_tile_sb
+                                 << cm->seq_params.mib_size_log2;
+    }
   }
 }
 
@@ -143,30 +158,6 @@ int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) {
   return sb_cols;
 }
 
-int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) {
-  // Round the frame up to a whole number of max superblocks
-  mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2);
-
-  // Divide by the signalled number of tiles, rounding up to the multiple of
-  // the max superblock size. To do this, shift right (and round up) to get the
-  // tile size in max super-blocks and then shift left again to convert it to
-  // mi units.
-  const int shift = log2_tile_num + MAX_MIB_SIZE_LOG2;
-  const int max_sb_tile_size =
-      ALIGN_POWER_OF_TWO(mi_frame_size, shift) >> shift;
-  const int mi_tile_size = max_sb_tile_size << MAX_MIB_SIZE_LOG2;
-
-  // The actual number of tiles is the ceiling of the frame size in mi units
-  // divided by mi_size. This is at most 1 << log2_tile_num but might be
-  // strictly less if max_sb_tile_size got rounded up significantly.
-  if (ntiles) {
-    *ntiles = (mi_frame_size + mi_tile_size - 1) / mi_tile_size;
-    assert(*ntiles <= (1 << log2_tile_num));
-  }
-
-  return mi_tile_size;
-}
-
 AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
                                int is_uv) {
   AV1PixelRect r;
@@ -205,3 +196,34 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
 
   return r;
 }
+
+void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) {
+  if (cm->uniform_tile_spacing_flag) {
+    *w = cm->tile_width;
+    *h = cm->tile_height;
+  } else {
+    for (int i = 0; i < cm->tile_cols; ++i) {
+      const int tile_width_sb =
+          cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
+      const int tile_w = tile_width_sb * cm->seq_params.mib_size;
+      assert(i == 0 || tile_w == *w);  // ensure all tiles have same dimension
+      *w = tile_w;
+    }
+
+    for (int i = 0; i < cm->tile_rows; ++i) {
+      const int tile_height_sb =
+          cm->tile_row_start_sb[i + 1] - cm->tile_row_start_sb[i];
+      const int tile_h = tile_height_sb * cm->seq_params.mib_size;
+      assert(i == 0 || tile_h == *h);  // ensure all tiles have same dimension
+      *h = tile_h;
+    }
+  }
+}
+
+int is_min_tile_width_satisfied(const AV1_COMMON *cm) {
+  // Disable check if there is a single tile col in the frame
+  if (cm->tile_cols == 1) return 1;
+
+  return ((cm->min_inner_tile_width << MI_SIZE_LOG2) >=
+          (64 << av1_superres_scaled(cm)));
+}
diff --git a/libaom/av1/common/tile_common.h b/libaom/av1/common/tile_common.h
index c03553d..a235f2d 100644
--- a/libaom/av1/common/tile_common.h
+++ b/libaom/av1/common/tile_common.h
@@ -25,7 +25,6 @@ struct AV1Common;
 typedef struct TileInfo {
   int mi_row_start, mi_row_end;
   int mi_col_start, mi_col_end;
-  int tg_horz_boundary;
   int tile_row;
   int tile_col;
 } TileInfo;
@@ -37,12 +36,6 @@ void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
 
 void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
 void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
-void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
-                         int *max_log2_tile_cols);
-
-// Calculate the correct tile size (width or height) for (1 << log2_tile_num)
-// tiles horizontally or vertically in the frame.
-int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles);
 
 int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
 int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
@@ -61,10 +54,14 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info,
 #define MAX_TILE_WIDTH (4096)        // Max Tile width in pixels
 #define MAX_TILE_AREA (4096 * 2304)  // Maximum tile area in pixels
 
+void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h);
 void av1_get_tile_limits(struct AV1Common *const cm);
 void av1_calculate_tile_cols(struct AV1Common *const cm);
 void av1_calculate_tile_rows(struct AV1Common *const cm);
 
+// Checks if the minimum tile_width requirement is satisfied
+int is_min_tile_width_satisfied(const struct AV1Common *cm);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/common/txb_common.c b/libaom/av1/common/txb_common.c
index c96d37c..cb92bd8 100644
--- a/libaom/av1/common/txb_common.c
+++ b/libaom/av1/common/txb_common.c
@@ -453,23 +453,6 @@ const int8_t *av1_nz_map_ctx_offset[19] = {
   av1_nz_map_ctx_offset_64x32,  // TX_64x16
 };
 
-void av1_init_lv_map(AV1_COMMON *cm) {
-  LV_MAP_CTX_TABLE *coeff_ctx_table = &cm->coeff_ctx_table;
-  for (int row = 0; row < 2; ++row) {
-    for (int col = 0; col < 2; ++col) {
-      for (int sig_mag = 0; sig_mag < 3; ++sig_mag) {
-        for (int count = 0; count < BASE_CONTEXT_POSITION_NUM + 1; ++count) {
-          if (row == 0 && col == 0 && count > 5) continue;
-          if ((row == 0 || col == 0) && count > 8) continue;
-
-          coeff_ctx_table->base_ctx_table[row][col][sig_mag][count] =
-              get_base_ctx_from_count_mag(row, col, count, sig_mag);
-        }
-      }
-    }
-  }
-}
-
 const int16_t k_eob_group_start[12] = { 0,  1,  2,  3,   5,   9,
                                         17, 33, 65, 129, 257, 513 };
 const int16_t k_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
diff --git a/libaom/av1/common/txb_common.h b/libaom/av1/common/txb_common.h
index 698e95b..8a3932d 100644
--- a/libaom/av1/common/txb_common.h
+++ b/libaom/av1/common/txb_common.h
@@ -159,6 +159,19 @@ static INLINE int get_br_ctx_2d(const uint8_t *const levels,
   return mag + 14;
 }
 
+static AOM_FORCE_INLINE int get_br_ctx_eob(const int c,  // raster order
+                                           const int bwl,
+                                           const TX_CLASS tx_class) {
+  const int row = c >> bwl;
+  const int col = c - (row << bwl);
+  if (c == 0) return 0;
+  if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) ||
+      (tx_class == TX_CLASS_HORIZ && col == 0) ||
+      (tx_class == TX_CLASS_VERT && row == 0))
+    return 7;
+  return 14;
+}
+
 static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels,
                                        const int c,  // raster order
                                        const int bwl, const TX_CLASS tx_class) {
@@ -272,12 +285,10 @@ static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats(
       const int row = coeff_idx >> bwl;
       const int col = coeff_idx - (row << bwl);
       return ctx + nz_map_ctx_offset_1d[col];
-      break;
     }
     case TX_CLASS_VERT: {
       const int row = coeff_idx >> bwl;
       return ctx + nz_map_ctx_offset_1d[row];
-      break;
     }
     default: break;
   }
@@ -421,6 +432,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
 #undef MAX_TX_SIZE_UNIT
 }
 
-void av1_init_lv_map(AV1_COMMON *cm);
-
 #endif  // AOM_AV1_COMMON_TXB_COMMON_H_
diff --git a/libaom/av1/common/warped_motion.c b/libaom/av1/common/warped_motion.c
index 4144c43..e232e10 100644
--- a/libaom/av1/common/warped_motion.c
+++ b/libaom/av1/common/warped_motion.c
@@ -485,7 +485,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
               uint16_t *dst16 =
                   &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
               int32_t tmp32 = *p;
-              if (conv_params->use_jnt_comp_avg) {
+              if (conv_params->use_dist_wtd_comp_avg) {
                 tmp32 = tmp32 * conv_params->fwd_offset +
                         sum * conv_params->bck_offset;
                 tmp32 = tmp32 >> DIST_PRECISION_BITS;
@@ -563,7 +563,7 @@ static int64_t highbd_warp_error(
   uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
 
   ConvolveParams conv_params = get_conv_params(0, 0, bd);
-  conv_params.use_jnt_comp_avg = 0;
+  conv_params.use_dist_wtd_comp_avg = 0;
   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
       // avoid warping extra 8x8 blocks in the padded region of the frame
@@ -773,7 +773,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
               uint8_t *dst8 =
                   &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
               int32_t tmp32 = *p;
-              if (conv_params->use_jnt_comp_avg) {
+              if (conv_params->use_dist_wtd_comp_avg) {
                 tmp32 = tmp32 * conv_params->fwd_offset +
                         sum * conv_params->bck_offset;
                 tmp32 = tmp32 >> DIST_PRECISION_BITS;
@@ -846,7 +846,7 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
   int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
   uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
   ConvolveParams conv_params = get_conv_params(0, 0, 8);
-  conv_params.use_jnt_comp_avg = 0;
+  conv_params.use_dist_wtd_comp_avg = 0;
 
   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
diff --git a/libaom/av1/common/x86/av1_convolve_scale_sse4.c b/libaom/av1/common/x86/av1_convolve_scale_sse4.c
index d9fb537..8f44238 100644
--- a/libaom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/libaom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -175,7 +175,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
       if (conv_params->is_compound) {
         if (conv_params->do_average) {
           const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
-          if (conv_params->use_jnt_comp_avg) {
+          if (conv_params->use_dist_wtd_comp_avg) {
             const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
             const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
             const __m128i shifted_32 =
@@ -207,7 +207,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
       if (conv_params->is_compound) {
         if (conv_params->do_average) {
           int32_t tmp = dst16[y * dst16_stride + x];
-          if (conv_params->use_jnt_comp_avg) {
+          if (conv_params->use_dist_wtd_comp_avg) {
             tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
             tmp = tmp >> DIST_PRECISION_BITS;
           } else {
@@ -408,7 +408,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
           __m128i p_32 =
               _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
 
-          if (conv_params->use_jnt_comp_avg) {
+          if (conv_params->use_dist_wtd_comp_avg) {
             shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                     _mm_mullo_epi32(shifted, wt1));
             shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
@@ -443,7 +443,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
       if (conv_params->is_compound) {
         if (conv_params->do_average) {
           int32_t tmp = dst16[y * dst16_stride + x];
-          if (conv_params->use_jnt_comp_avg) {
+          if (conv_params->use_dist_wtd_comp_avg) {
             tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
             tmp = tmp >> DIST_PRECISION_BITS;
           } else {
diff --git a/libaom/av1/common/x86/av1_inv_txfm_ssse3.c b/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
index 9841bf3..de0a561 100644
--- a/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -2920,8 +2920,18 @@ void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                             const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   if (!txfm_param->lossless) {
-    av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
-                                   txfm_param->tx_size, txfm_param->eob);
+    switch (txfm_param->tx_size) {
+      case TX_4X16:
+      case TX_16X4:
+        // TODO(http://crbug.com/aomedia/2350): the ssse3 versions cause test
+        // vector mismatches.
+        av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+        break;
+      default:
+        av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
+                                       txfm_param->tx_size, txfm_param->eob);
+        break;
+    }
   } else {
     av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
   }
diff --git a/libaom/av1/common/x86/av1_inv_txfm_ssse3.h b/libaom/av1/common/x86/av1_inv_txfm_ssse3.h
index 66bd339..7d5055d 100644
--- a/libaom/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/libaom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -72,13 +72,13 @@ static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
 }
 
 // 1D itx types
-typedef enum ATTRIBUTE_PACKED {
+enum {
   IDCT_1D,
   IADST_1D,
   IFLIPADST_1D = IADST_1D,
   IIDENTITY_1D,
   ITX_TYPES_1D,
-} ITX_TYPE_1D;
+} UENUM1BYTE(ITX_TYPE_1D);
 
 static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
   IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
diff --git a/libaom/av1/common/x86/av1_txfm_sse4.c b/libaom/av1/common/x86/av1_txfm_sse4.c
index 90b9879..65ccd19 100644
--- a/libaom/av1/common/x86/av1_txfm_sse4.c
+++ b/libaom/av1/common/x86/av1_txfm_sse4.c
@@ -9,7 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "av1/common/av1_txfm.h"
 #include "av1/common/x86/av1_txfm_sse4.h"
diff --git a/libaom/av1/common/x86/convolve_2d_avx2.c b/libaom/av1/common/x86/convolve_2d_avx2.c
index 0acafd0..ae12a60 100644
--- a/libaom/av1/common/x86/convolve_2d_avx2.c
+++ b/libaom/av1/common/x86/convolve_2d_avx2.c
@@ -27,31 +27,15 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params) {
   const int bd = 8;
-
-  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
-  int im_h = h + filter_params_y->taps - 1;
   int im_stride = 8;
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
+  int i, is_horiz_4tap = 0, is_vert_4tap = 0;
+  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
   const int bits =
       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
 
-  __m256i filt[4], coeffs_h[4], coeffs_v[4];
-
   assert(conv_params->round_0 > 0);
 
-  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
-
   const __m256i round_const_h = _mm256_set1_epi16(
       ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
   const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
@@ -65,58 +49,96 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
       ((1 << (offset_bits - conv_params->round_1)) >> 1));
   const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
 
-  for (j = 0; j < w; j += 8) {
-    for (i = 0; i < im_h; i += 2) {
-      __m256i data = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+  __m256i filt[4], coeffs_h[4], coeffs_v[4];
+
+  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
 
-      // Load the next line
-      if (i + 1 < im_h)
+  // Condition for checking valid horz_filt taps
+  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0)))
+    is_horiz_4tap = 1;
+
+  // Condition for checking valid vert_filt taps
+  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_v[0], coeffs_v[3]), 0)))
+    is_vert_4tap = 1;
+
+  // horz_filt as 4 tap and vert_filt as 8 tap
+  if (is_horiz_4tap) {
+    int im_h = h + filter_params_y->taps - 1;
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const int fo_horiz = 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+    // horz-filter
+    for (int j = 0; j < w; j += 8) {
+      for (i = 0; i < (im_h - 2); i += 2) {
+        __m256i data = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+        // Load the next line
         data = _mm256_inserti128_si256(
             data,
             _mm_loadu_si128(
                 (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
             1);
+        __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt);
 
-      __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
+        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
+                               round_shift_h);
+        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+      }
 
+      __m256i data_1 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+      __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt);
       res =
           _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
-
       _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
-    }
 
-    /* Vertical filter */
-    {
+      // vert filter
+      CONVOLVE_SR_VERTICAL_FILTER_8TAP;
+    }
+  } else if (is_vert_4tap) {
+    int im_h = h + 3;
+    const int fo_vert = 1;
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    for (int j = 0; j < w; j += 8) {
+      // horz_filter
+      CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
+      // vert_filter
+      __m256i s[6];
       __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
       __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
       __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
       __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
-      __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
-      __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
 
-      __m256i s[8];
       s[0] = _mm256_unpacklo_epi16(src_0, src_1);
       s[1] = _mm256_unpacklo_epi16(src_2, src_3);
-      s[2] = _mm256_unpacklo_epi16(src_4, src_5);
-
-      s[4] = _mm256_unpackhi_epi16(src_0, src_1);
-      s[5] = _mm256_unpackhi_epi16(src_2, src_3);
-      s[6] = _mm256_unpackhi_epi16(src_4, src_5);
+      s[3] = _mm256_unpackhi_epi16(src_0, src_1);
+      s[4] = _mm256_unpackhi_epi16(src_2, src_3);
 
       for (i = 0; i < h; i += 2) {
         const int16_t *data = &im_block[i * im_stride];
 
-        const __m256i s6 =
-            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
-        const __m256i s7 =
-            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+        const __m256i s4 =
+            _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+        const __m256i s5 =
+            _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
 
-        s[3] = _mm256_unpacklo_epi16(s6, s7);
-        s[7] = _mm256_unpackhi_epi16(s6, s7);
+        s[2] = _mm256_unpacklo_epi16(s4, s5);
+        s[5] = _mm256_unpackhi_epi16(s4, s5);
 
-        __m256i res_a = convolve(s, coeffs_v);
-        __m256i res_b = convolve(s + 4, coeffs_v);
+        __m256i res_a = convolve_4tap(s, coeffs_v + 1);
+        __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1);
 
         // Combine V round and 2F-H-V round into a single rounding
         res_a =
@@ -154,13 +176,25 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
 
         s[0] = s[1];
         s[1] = s[2];
-        s[2] = s[3];
-
+        s[3] = s[4];
         s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
       }
     }
+  } else {
+    int j;
+    int im_h = h + filter_params_y->taps - 1;
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    for (j = 0; j < w; j += 8) {
+      CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
+
+      CONVOLVE_SR_VERTICAL_FILTER_8TAP;
+    }
   }
 }
 
@@ -195,20 +229,20 @@ void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
 
   if (w == 2) {
     do {
-      memcpy(dst, src, 2 * sizeof(*src));
+      memmove(dst, src, 2 * sizeof(*src));
       src += src_stride;
       dst += dst_stride;
-      memcpy(dst, src, 2 * sizeof(*src));
+      memmove(dst, src, 2 * sizeof(*src));
       src += src_stride;
       dst += dst_stride;
       h -= 2;
     } while (h);
   } else if (w == 4) {
     do {
-      memcpy(dst, src, 4 * sizeof(*src));
+      memmove(dst, src, 4 * sizeof(*src));
       src += src_stride;
       dst += dst_stride;
-      memcpy(dst, src, 4 * sizeof(*src));
+      memmove(dst, src, 4 * sizeof(*src));
       src += src_stride;
       dst += dst_stride;
       h -= 2;
diff --git a/libaom/av1/common/x86/convolve_2d_sse2.c b/libaom/av1/common/x86/convolve_2d_sse2.c
index b1a62a4..369922b 100644
--- a/libaom/av1/common/x86/convolve_2d_sse2.c
+++ b/libaom/av1/common/x86/convolve_2d_sse2.c
@@ -255,20 +255,20 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
 
   if (w == 2) {
     do {
-      memcpy(dst, src, 2 * sizeof(*src));
+      memmove(dst, src, 2 * sizeof(*src));
       src += src_stride;
       dst += dst_stride;
-      memcpy(dst, src, 2 * sizeof(*src));
+      memmove(dst, src, 2 * sizeof(*src));
       src += src_stride;
       dst += dst_stride;
       h -= 2;
     } while (h);
   } else if (w == 4) {
     do {
-      memcpy(dst, src, 4 * sizeof(*src));
+      memmove(dst, src, 4 * sizeof(*src));
       src += src_stride;
       dst += dst_stride;
-      memcpy(dst, src, 4 * sizeof(*src));
+      memmove(dst, src, 4 * sizeof(*src));
       src += src_stride;
       dst += dst_stride;
       h -= 2;
@@ -354,12 +354,11 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
   }
 }
 
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
-                                   uint8_t *dst0, int dst_stride0, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
-                                   ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_sse2(
+    const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -371,7 +370,7 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const __m128i zero = _mm_setzero_si128();
   const __m128i left_shift = _mm_cvtsi32_si128(bits);
   int i, j;
@@ -411,14 +410,14 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
           const __m128i data_ref_0_hi =
               _mm_loadu_si128((__m128i *)(&dst[j + 8]));
 
-          const __m128i comp_avg_res_lo =
-              comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg);
+          const __m128i comp_avg_res_lo = comp_avg(
+              &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg);
 
           const __m128i round_result_lo = convolve_rounding(
               &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
 
-          const __m128i comp_avg_res_hi =
-              comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg);
+          const __m128i comp_avg_res_hi = comp_avg(
+              &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg);
 
           const __m128i round_result_hi = convolve_rounding(
               &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
@@ -449,7 +448,7 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
           const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
 
           const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
 
           const __m128i round_result = convolve_rounding(
               &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/convolve_avx2.c b/libaom/av1/common/x86/convolve_avx2.c
index 0e91ea9..21b9fe4 100644
--- a/libaom/av1/common/x86/convolve_avx2.c
+++ b/libaom/av1/common/x86/convolve_avx2.c
@@ -23,153 +23,239 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const uint8_t *const src_ptr = src - fo_vert * src_stride;
-
+  int i, j, is_vert_4tap = 0;
   // right shift is F-1 because we are already dividing
   // filter co-efficients by 2
   const int right_shift_bits = (FILTER_BITS - 1);
   const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
   const __m256i right_shift_const =
       _mm256_set1_epi16((1 << right_shift_bits) >> 1);
-  __m256i coeffs[4], s[8];
 
   assert(conv_params->round_0 <= FILTER_BITS);
   assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
          ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
 
-  prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
-
   (void)filter_params_x;
   (void)subpel_x_q4;
   (void)conv_params;
+  __m256i coeffs[4], s[8];
+  __m128i d[6];
 
-  for (j = 0; j < w; j += 16) {
-    const uint8_t *data = &src_ptr[j];
-    __m256i src6;
-
-    // Load lines a and b. Line a to lower 128, line b to upper 128
-    const __m256i src_01a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
-        0x20);
-
-    const __m256i src_12a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
-        0x20);
-
-    const __m256i src_23a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
-        0x20);
-
-    const __m256i src_34a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
-        0x20);
-
-    const __m256i src_45a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-        0x20);
-
-    src6 = _mm256_castsi128_si256(
-        _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
-    const __m256i src_56a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-        src6, 0x20);
-
-    s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
-    s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
-    s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
-    s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
-    s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
-    s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
-    for (i = 0; i < h; i += 2) {
-      data = &src_ptr[i * src_stride + j];
-      const __m256i src_67a = _mm256_permute2x128_si256(
-          src6,
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-          0x20);
+  prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
 
-      src6 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
-      const __m256i src_78a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-          src6, 0x20);
+  // Condition for checking valid vert_filt taps
+  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+    is_vert_4tap = 1;
+
+  // vert_filt as 4 tap
+  if (is_vert_4tap) {
+    const int fo_vert = 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride;
+    for (j = 0; j < w; j += 16) {
+      const uint8_t *data = &src_ptr[j];
+      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+
+      // Load lines a and b. Line a to lower 128, line b to upper 128
+      const __m256i src_01a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+      const __m256i src_12a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
 
-      s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
-      s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+      const __m256i src_23a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
 
-      const __m256i res_lo = convolve_lowbd(s, coeffs);
+      const __m256i src_34a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
 
-      /* rounding code */
-      // shift by F - 1
-      const __m256i res_16b_lo = _mm256_sra_epi16(
-          _mm256_add_epi16(res_lo, right_shift_const), right_shift);
-      // 8 bit conversion and saturation to uint8
-      __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
 
-      if (w - j > 8) {
-        const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
 
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+        const __m256i src_45a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+        const __m256i src_56a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
+
+        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
         /* rounding code */
         // shift by F - 1
-        const __m256i res_16b_hi = _mm256_sra_epi16(
-            _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+        const __m256i res_16b_lo = _mm256_sra_epi16(
+            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
         // 8 bit conversion and saturation to uint8
-        __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
-
-        __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
-
-        const __m128i res_0 = _mm256_castsi256_si128(res_a);
-        const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
-
-        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
-        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                         res_1);
-      } else {
-        const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
-        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
-        if (w - j > 4) {
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+        if (w - j > 8) {
+          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
+
+          /* rounding code */
+          // shift by F - 1
+          const __m256i res_16b_hi = _mm256_sra_epi16(
+              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+          const __m128i res_0 = _mm256_castsi256_si128(res_a);
+          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                            res_1);
-        } else if (w - j > 2) {
-          xx_storel_32(&dst[i * dst_stride + j], res_0);
-          xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
         } else {
-          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
-          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
-          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+          if (w - j > 4) {
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_1);
+          } else if (w - j > 2) {
+            xx_storel_32(&dst[i * dst_stride + j], res_0);
+            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+          } else {
+            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+            __m128i *const p_1 =
+                (__m128i *)&dst[i * dst_stride + j + dst_stride];
+            *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+            *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+          }
         }
+        s[0] = s[1];
+        s[1] = s[2];
+
+        s[3] = s[4];
+        s[4] = s[5];
       }
+    }
+  } else {
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+    for (j = 0; j < w; j += 16) {
+      const uint8_t *data = &src_ptr[j];
+      __m256i src6;
+
+      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+      // Load lines a and b. Line a to lower 128, line b to upper 128
+      const __m256i src_01a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
 
-      s[0] = s[1];
-      s[1] = s[2];
-      s[2] = s[3];
+      const __m256i src_12a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+      const __m256i src_23a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+      const __m256i src_34a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+      const __m256i src_45a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+      src6 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+      const __m256i src_56a =
+          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
+
+      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+        const __m256i src_67a = _mm256_permute2x128_si256(
+            src6,
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            0x20);
+
+        src6 = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+        const __m256i src_78a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            src6, 0x20);
+
+        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+        const __m256i res_lo = convolve_lowbd(s, coeffs);
 
-      s[4] = s[5];
-      s[5] = s[6];
-      s[6] = s[7];
+        /* rounding code */
+        // shift by F - 1
+        const __m256i res_16b_lo = _mm256_sra_epi16(
+            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+        if (w - j > 8) {
+          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+          /* rounding code */
+          // shift by F - 1
+          const __m256i res_16b_hi = _mm256_sra_epi16(
+              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+          const __m128i res_0 = _mm256_castsi256_si128(res_a);
+          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_1);
+        } else {
+          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+          if (w - j > 4) {
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_1);
+          } else if (w - j > 2) {
+            xx_storel_32(&dst[i * dst_stride + j], res_0);
+            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+          } else {
+            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+            __m128i *const p_1 =
+                (__m128i *)&dst[i * dst_stride + j + dst_stride];
+            *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+            *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+          }
+        }
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
     }
   }
 }
@@ -180,26 +266,14 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
-  int i, j;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint8_t *const src_ptr = src - fo_horiz;
   const int bits = FILTER_BITS - conv_params->round_0;
 
-  __m256i filt[4], coeffs[4];
-
-  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
-
   const __m256i round_0_const =
       _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
   const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
   const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
   const __m128i round_shift = _mm_cvtsi32_si128(bits);
-
+  int i, is_horiz_4tap = 0;
   (void)filter_params_y;
   (void)subpel_y_q4;
 
@@ -208,51 +282,101 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
   assert(conv_params->round_0 > 0);
 
-  if (w <= 8) {
-    for (i = 0; i < h; i += 2) {
-      const __m256i data = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
-          _mm256_castsi128_si256(_mm_loadu_si128(
-              (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
-          0x20);
-
-      __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
-
-      res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
-                                 round_0_shift);
-
-      res_16b =
-          _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift);
-
-      /* rounding code */
-      // 8 bit conversion and saturation to uint8
-      __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
-
-      const __m128i res_0 = _mm256_castsi256_si128(res_8b);
-      const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
-      if (w > 4) {
-        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
-        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
-      } else if (w > 2) {
-        xx_storel_32(&dst[i * dst_stride], res_0);
-        xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
-      } else {
-        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
-        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
-        *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
-        *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+  __m256i coeffs[4], filt[4];
+  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+
+  // Condition for checking valid horz_filt taps
+  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+    is_horiz_4tap = 1;
+
+  // horz_filt as 4 tap
+  if (is_horiz_4tap) {
+    const int fo_horiz = 1;
+    const uint8_t *const src_ptr = src - fo_horiz;
+    if (w <= 8) {
+      for (i = 0; i < h; i += 2) {
+        const __m256i data = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+            _mm256_castsi128_si256(_mm_loadu_si128(
+                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+            0x20);
+
+        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                   round_0_shift);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+                                   round_shift);
+
+        /* rounding code */
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+        if (w > 4) {
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+        } else if (w > 2) {
+          xx_storel_32(&dst[i * dst_stride], res_0);
+          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+        } else {
+          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+        }
+      }
+    } else {
+      for (i = 0; i < h; ++i) {
+        for (int j = 0; j < w; j += 16) {
+          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+          // 18 19 20 21 22 23
+          const __m256i data = _mm256_inserti128_si256(
+              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+              1);
+
+          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+
+          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                     round_0_shift);
+
+          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+                                     round_shift);
+
+          /* rounding code */
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+          // Store values into the destination buffer
+          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+          __m128i res = _mm256_castsi256_si128(res_8b);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+        }
       }
     }
   } else {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 16) {
-        // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18
-        // 19 20 21 22 23
-        const __m256i data = _mm256_inserti128_si256(
-            _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
-            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
-            1);
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_horiz;
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    if (w <= 8) {
+      for (i = 0; i < h; i += 2) {
+        const __m256i data = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+            _mm256_castsi128_si256(_mm_loadu_si128(
+                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+            0x20);
 
         __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
 
@@ -266,11 +390,49 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
         // 8 bit conversion and saturation to uint8
         __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
 
-        // Store values into the destination buffer
-        // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-        res_8b = _mm256_permute4x64_epi64(res_8b, 216);
-        __m128i res = _mm256_castsi256_si128(res_8b);
-        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+        if (w > 4) {
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+        } else if (w > 2) {
+          xx_storel_32(&dst[i * dst_stride], res_0);
+          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+        } else {
+          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+        }
+      }
+    } else {
+      for (i = 0; i < h; ++i) {
+        for (int j = 0; j < w; j += 16) {
+          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+          // 18 19 20 21 22 23
+          const __m256i data = _mm256_inserti128_si256(
+              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+              1);
+
+          __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                     round_0_shift);
+
+          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+                                     round_shift);
+
+          /* rounding code */
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+          // Store values into the destination buffer
+          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+          __m128i res = _mm256_castsi256_si128(res_8b);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+        }
       }
     }
   }
diff --git a/libaom/av1/common/x86/highbd_convolve_2d_avx2.c b/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
index ae68f0b..357df12 100644
--- a/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -238,10 +238,10 @@ void av1_highbd_convolve_2d_copy_sr_avx2(
 
   if (w == 2) {
     do {
-      memcpy(dst, src, 2 * sizeof(*src));
+      memmove(dst, src, 2 * sizeof(*src));
       src += src_stride;
       dst += dst_stride;
-      memcpy(dst, src, 2 * sizeof(*src));
+      memmove(dst, src, 2 * sizeof(*src));
       src += src_stride;
       dst += dst_stride;
       h -= 2;
diff --git a/libaom/av1/common/x86/highbd_convolve_2d_sse4.c b/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
index 3f8dafb..3c1d5d1 100644
--- a/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -21,7 +21,7 @@
 #include "aom_dsp/x86/convolve_sse4_1.h"
 #include "av1/common/convolve.h"
 
-void av1_highbd_jnt_convolve_2d_copy_sse4_1(
+void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -37,7 +37,7 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
   const __m128i left_shift = _mm_cvtsi32_si128(bits);
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int w0 = conv_params->fwd_offset;
   const int w1 = conv_params->bck_offset;
   const __m128i wt0 = _mm_set1_epi32(w0);
@@ -75,15 +75,17 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
           const __m128i res_unsigned_lo =
               _mm_add_epi32(res_32b_lo, offset_const);
 
-          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
-              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+          const __m128i comp_avg_res_lo =
+              highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+                                     &wt1, use_dist_wtd_comp_avg);
 
           const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
           const __m128i res_unsigned_hi =
               _mm_add_epi32(res_32b_hi, offset_const);
 
-          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
-              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+          const __m128i comp_avg_res_hi =
+              highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+                                     &wt1, use_dist_wtd_comp_avg);
 
           const __m128i round_result_lo = highbd_convolve_rounding_sse2(
               &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -132,9 +134,9 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
               _mm_add_epi32(res_32b_hi, offset_const);
 
           const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
-              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
           const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
-              &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+              &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg);
 
           const __m128i round_result_lo = highbd_convolve_rounding_sse2(
               &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -166,7 +168,7 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
   }
 }
 
-void av1_highbd_jnt_convolve_2d_sse4_1(
+void av1_highbd_dist_wtd_convolve_2d_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -179,7 +181,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
   int im_stride = MAX_SB_SIZE;
   int i, j;
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
@@ -359,8 +361,9 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
 
             const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);
 
-            const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
-                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+            const __m128i comp_avg_res =
+                highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0,
+                                       &wt1, use_dist_wtd_comp_avg);
 
             const __m128i round_result = highbd_convolve_rounding_sse2(
                 &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -391,10 +394,12 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
             const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
             const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);
 
-            const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
-                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
-            const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
-                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+            const __m128i comp_avg_res_lo =
+                highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+                                       &wt1, use_dist_wtd_comp_avg);
+            const __m128i comp_avg_res_hi =
+                highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+                                       &wt1, use_dist_wtd_comp_avg);
 
             const __m128i round_result_lo =
                 highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
diff --git a/libaom/av1/common/x86/highbd_inv_txfm_avx2.c b/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
index 5418057..fe22465 100644
--- a/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -4309,213 +4309,17 @@ void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
       highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
                                              stride, tx_type, tx_size, eob, bd);
       break;
-    default: assert(0); break;
-  }
-}
-
-void av1_highbd_inv_txfm_add_16x16_avx2(const tran_low_t *input, uint8_t *dest,
-                                        int stride,
-                                        const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 tx_type, bd);
-      break;
-    default:
-      av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
-                                              txfm_param->tx_size,
-                                              txfm_param->eob, bd);
-      break;
-  }
-}
-
-void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest,
-                                        int stride,
-                                        const TxfmParam *txfm_param) {
-  const int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
-                                              txfm_param->tx_size,
-                                              txfm_param->eob, bd);
-      break;
-      // Assembly version doesn't support IDTX, so use C version for it.
-    case IDTX:
-      av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 tx_type, bd);
-      break;
-
-    default: assert(0);
-  }
-}
-
-void av1_highbd_inv_txfm_add_16x32_avx2(const tran_low_t *input, uint8_t *dest,
-                                        int stride,
-                                        const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
-                                              txfm_param->tx_size,
-                                              txfm_param->eob, bd);
-      break;
-    case IDTX:
-      av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 txfm_param->tx_type, txfm_param->bd);
-      break;
-    default: assert(0);
-  }
-}
-
-void av1_highbd_inv_txfm_add_32x16_avx2(const tran_low_t *input, uint8_t *dest,
-                                        int stride,
-                                        const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
-                                              txfm_param->tx_size,
-                                              txfm_param->eob, bd);
-      break;
-    case IDTX:
-      av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 txfm_param->tx_type, txfm_param->bd);
-      break;
-    default: assert(0);
-  }
-}
-void av1_highbd_inv_txfm_add_8x8_avx2(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
     case IDTX:
-      av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    default:
-      av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
-                                              txfm_param->tx_size,
-                                              txfm_param->eob, bd);
-      break;
-  }
-}
-void av1_highbd_inv_txfm_add_8x32_avx2(const tran_low_t *input, uint8_t *dest,
-                                       int stride,
-                                       const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
-                                              txfm_param->tx_size,
-                                              txfm_param->eob, bd);
-      break;
-    case IDTX:
-      av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                txfm_param->tx_type, txfm_param->bd);
-      break;
-    default: assert(0);
-  }
-}
-
-void av1_highbd_inv_txfm_add_32x8_avx2(const tran_low_t *input, uint8_t *dest,
-                                       int stride,
-                                       const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
-                                              txfm_param->tx_size,
-                                              txfm_param->eob, bd);
-      break;
-    case IDTX:
-      av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                txfm_param->tx_type, txfm_param->bd);
-      break;
-    default: assert(0);
-  }
-}
-void av1_highbd_inv_txfm_add_16x8_avx2(const tran_low_t *input, uint8_t *dest,
-                                       int stride,
-                                       const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
-    case V_DCT:
     case H_DCT:
-    case V_ADST:
     case H_ADST:
-    case V_FLIPADST:
     case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                txfm_param->tx_type, txfm_param->bd);
-      break;
-    default:
-      av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
-                                              txfm_param->tx_size,
-                                              txfm_param->eob, bd);
-      break;
-  }
-}
-
-void av1_highbd_inv_txfm_add_8x16_avx2(const tran_low_t *input, uint8_t *dest,
-                                       int stride,
-                                       const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
     case V_DCT:
-    case H_DCT:
     case V_ADST:
-    case H_ADST:
     case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                txfm_param->tx_type, txfm_param->bd);
-      break;
-    default:
-      av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
-                                              txfm_param->tx_size,
-                                              txfm_param->eob, bd);
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
+                                                tx_size, eob, bd);
       break;
+    default: assert(0); break;
   }
 }
 void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
@@ -4523,33 +4327,12 @@ void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
-    case TX_32X32:
-      av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param);
-      break;
-    case TX_16X16:
-      av1_highbd_inv_txfm_add_16x16_avx2(input, dest, stride, txfm_param);
-      break;
-    case TX_8X8:
-      av1_highbd_inv_txfm_add_8x8_avx2(input, dest, stride, txfm_param);
-      break;
     case TX_4X8:
       av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
       break;
     case TX_8X4:
       av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
       break;
-    case TX_8X16:
-      av1_highbd_inv_txfm_add_8x16_avx2(input, dest, stride, txfm_param);
-      break;
-    case TX_16X8:
-      av1_highbd_inv_txfm_add_16x8_avx2(input, dest, stride, txfm_param);
-      break;
-    case TX_16X32:
-      av1_highbd_inv_txfm_add_16x32_avx2(input, dest, stride, txfm_param);
-      break;
-    case TX_32X16:
-      av1_highbd_inv_txfm_add_32x16_avx2(input, dest, stride, txfm_param);
-      break;
     case TX_4X4:
       av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
       break;
@@ -4559,21 +4342,10 @@ void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
     case TX_4X16:
       av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
       break;
-    case TX_8X32:
-      av1_highbd_inv_txfm_add_8x32_avx2(input, dest, stride, txfm_param);
-      break;
-    case TX_32X8:
-      av1_highbd_inv_txfm_add_32x8_avx2(input, dest, stride, txfm_param);
-      break;
-    case TX_64X64:
-    case TX_32X64:
-    case TX_64X32:
-    case TX_16X64:
-    case TX_64X16:
+    default:
       av1_highbd_inv_txfm2d_add_universe_avx2(
           input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
           txfm_param->eob, txfm_param->bd);
       break;
-    default: assert(0 && "Invalid transform size"); break;
   }
 }
diff --git a/libaom/av1/common/x86/highbd_inv_txfm_sse4.c b/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
index 12c6350..8a8641d 100644
--- a/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -583,7 +583,66 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
   _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
   _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
 }
+static void highbd_clamp_epi32_sse4_1(const __m128i *in, __m128i *out,
+                                      const __m128i *clamp_lo,
+                                      const __m128i *clamp_hi, int size) {
+  __m128i a0, a1;
+  for (int i = 0; i < size; i += 4) {
+    a0 = _mm_max_epi32(in[i], *clamp_lo);
+    out[i] = _mm_min_epi32(a0, *clamp_hi);
+
+    a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
+    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
+
+    a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
+    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
+
+    a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
+    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
+  }
+}
+static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  (void)bit;
+  (void)out_shift;
+  __m128i v[4];
+  __m128i fact = _mm_set1_epi32(NewSqrt2);
+  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+  __m128i a0, a1;
+
+  a0 = _mm_mullo_epi32(in[0], fact);
+  a1 = _mm_mullo_epi32(in[1], fact);
+  a0 = _mm_add_epi32(a0, offset);
+  a1 = _mm_add_epi32(a1, offset);
+  out[0] = _mm_srai_epi32(a0, NewSqrt2Bits);
+  out[1] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+  a0 = _mm_mullo_epi32(in[2], fact);
+  a1 = _mm_mullo_epi32(in[3], fact);
+  a0 = _mm_add_epi32(a0, offset);
+  a1 = _mm_add_epi32(a1, offset);
+  out[2] = _mm_srai_epi32(a0, NewSqrt2Bits);
+  out[3] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+  if (!do_cols) {
+    const int log_range = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
+  }
+
+  // Transpose for 4x4
+  v[0] = _mm_unpacklo_epi32(out[0], out[1]);
+  v[1] = _mm_unpackhi_epi32(out[0], out[1]);
+  v[2] = _mm_unpacklo_epi32(out[2], out[3]);
+  v[3] = _mm_unpackhi_epi32(out[2], out[3]);
 
+  out[0] = _mm_unpacklo_epi64(v[0], v[2]);
+  out[1] = _mm_unpackhi_epi64(v[0], v[2]);
+  out[2] = _mm_unpacklo_epi64(v[1], v[3]);
+  out[3] = _mm_unpackhi_epi64(v[1], v[3]);
+}
 void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
   __m128i in[4];
@@ -646,6 +705,48 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
       iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
+    case IDTX:
+      load_buffer_4x4(coeff, in);
+      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case V_DCT:
+      load_buffer_4x4(coeff, in);
+      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case H_DCT:
+      load_buffer_4x4(coeff, in);
+      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case V_ADST:
+      load_buffer_4x4(coeff, in);
+      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case H_ADST:
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case V_FLIPADST:
+      load_buffer_4x4(coeff, in);
+      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+      break;
+    case H_FLIPADST:
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+      break;
     default: assert(0);
   }
 }
@@ -1116,6 +1217,61 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                      &clamp_hi_out, out_shift);
   }
 }
+static void shift_sse4_1(const __m128i *in, __m128i *out,
+                         const __m128i *clamp_lo, const __m128i *clamp_hi,
+                         int shift, int size) {
+  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
+  __m128i shift_vec = _mm_cvtsi32_si128(shift);
+  __m128i a0, a1;
+  for (int i = 0; i < size; i += 4) {
+    a0 = _mm_add_epi32(in[i], offset);
+    a1 = _mm_add_epi32(in[i + 1], offset);
+    a0 = _mm_sra_epi32(a0, shift_vec);
+    a1 = _mm_sra_epi32(a1, shift_vec);
+    a0 = _mm_max_epi32(a0, *clamp_lo);
+    a1 = _mm_max_epi32(a1, *clamp_lo);
+    out[i] = _mm_min_epi32(a0, *clamp_hi);
+    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
+
+    a0 = _mm_add_epi32(in[i + 2], offset);
+    a1 = _mm_add_epi32(in[i + 3], offset);
+    a0 = _mm_sra_epi32(a0, shift_vec);
+    a1 = _mm_sra_epi32(a1, shift_vec);
+    a0 = _mm_max_epi32(a0, *clamp_lo);
+    a1 = _mm_max_epi32(a1, *clamp_lo);
+    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
+    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
+  }
+}
+
+static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  (void)bit;
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i v[8];
+  v[0] = _mm_add_epi32(in[0], in[0]);
+  v[1] = _mm_add_epi32(in[1], in[1]);
+  v[2] = _mm_add_epi32(in[2], in[2]);
+  v[3] = _mm_add_epi32(in[3], in[3]);
+  v[4] = _mm_add_epi32(in[4], in[4]);
+  v[5] = _mm_add_epi32(in[5], in[5]);
+  v[6] = _mm_add_epi32(in[6], in[6]);
+  v[7] = _mm_add_epi32(in[7], in[7]);
+
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+    shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 8);
+  } else {
+    highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 8);
+  }
+}
 
 static void round_shift_8x8(__m128i *in, int shift) {
   round_shift_4x4(&in[0], shift);
@@ -3000,7 +3156,59 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     }
   }
 }
+static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                               int bd, int out_shift) {
+  (void)bit;
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i v[16];
+  __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
+  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+  __m128i a0, a1, a2, a3;
+
+  for (int i = 0; i < 16; i += 8) {
+    a0 = _mm_mullo_epi32(in[i], fact);
+    a1 = _mm_mullo_epi32(in[i + 1], fact);
+    a0 = _mm_add_epi32(a0, offset);
+    a1 = _mm_add_epi32(a1, offset);
+    v[i] = _mm_srai_epi32(a0, NewSqrt2Bits);
+    v[i + 1] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+    a2 = _mm_mullo_epi32(in[i + 2], fact);
+    a3 = _mm_mullo_epi32(in[i + 3], fact);
+    a2 = _mm_add_epi32(a2, offset);
+    a3 = _mm_add_epi32(a3, offset);
+    v[i + 2] = _mm_srai_epi32(a2, NewSqrt2Bits);
+    v[i + 3] = _mm_srai_epi32(a3, NewSqrt2Bits);
+
+    a0 = _mm_mullo_epi32(in[i + 4], fact);
+    a1 = _mm_mullo_epi32(in[i + 5], fact);
+    a0 = _mm_add_epi32(a0, offset);
+    a1 = _mm_add_epi32(a1, offset);
+    v[i + 4] = _mm_srai_epi32(a0, NewSqrt2Bits);
+    v[i + 5] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+    a2 = _mm_mullo_epi32(in[i + 6], fact);
+    a3 = _mm_mullo_epi32(in[i + 7], fact);
+    a2 = _mm_add_epi32(a2, offset);
+    a3 = _mm_add_epi32(a3, offset);
+    v[i + 6] = _mm_srai_epi32(a2, NewSqrt2Bits);
+    v[i + 7] = _mm_srai_epi32(a3, NewSqrt2Bits);
+  }
+
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
 
+    shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 16);
+  } else {
+    highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 16);
+  }
+}
 static INLINE void idct64_stage8_sse4_1(
     __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
     const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
@@ -5020,207 +5228,23 @@ void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
   switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
     case IDTX:
-      av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    default:
-      av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                    tx_type, bd);
-      break;
-  }
-}
-
-void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest,
-                                         int stride,
-                                         const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
-    case V_DCT:
     case H_DCT:
-    case V_ADST:
     case H_ADST:
-    case V_FLIPADST:
     case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                txfm_param->tx_type, txfm_param->bd);
-      break;
-    default:
-      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
-                                                txfm_param->tx_size,
-                                                txfm_param->eob, bd);
-      break;
-  }
-}
-
-void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest,
-                                         int stride,
-                                         const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
     case V_DCT:
-    case H_DCT:
     case V_ADST:
-    case H_ADST:
     case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                txfm_param->tx_type, txfm_param->bd);
-      break;
-    default:
       av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                 txfm_param->tx_size,
                                                 txfm_param->eob, bd);
       break;
-  }
-}
-
-void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
-                                          uint8_t *dest, int stride,
-                                          const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 tx_type, bd);
-      break;
     default:
-      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
-                                                txfm_param->tx_size,
-                                                txfm_param->eob, bd);
-      break;
-  }
-}
-
-void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
-                                          uint8_t *dest, int stride,
-                                          const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
-                                                txfm_param->tx_size,
-                                                txfm_param->eob, bd);
-      break;
-      // Assembly version doesn't support IDTX, so use C version for it.
-    case IDTX:
-      av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 tx_type, bd);
-      break;
-    default: assert(0);
-  }
-}
-
-void av1_highbd_inv_txfm_add_16x32_sse4_1(const tran_low_t *input,
-                                          uint8_t *dest, int stride,
-                                          const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
-                                                txfm_param->tx_size,
-                                                txfm_param->eob, bd);
-      break;
-    case IDTX:
-      av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 txfm_param->tx_type, txfm_param->bd);
-      break;
-    default: assert(0);
-  }
-}
-
-void av1_highbd_inv_txfm_add_32x16_sse4_1(const tran_low_t *input,
-                                          uint8_t *dest, int stride,
-                                          const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
-                                                txfm_param->tx_size,
-                                                txfm_param->eob, bd);
-      break;
-    case IDTX:
-      av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 txfm_param->tx_type, txfm_param->bd);
-      break;
-    default: assert(0);
-  }
-}
-
-void av1_highbd_inv_txfm_add_8x32_sse4_1(const tran_low_t *input, uint8_t *dest,
-                                         int stride,
-                                         const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
-                                                txfm_param->tx_size,
-                                                txfm_param->eob, bd);
-      break;
-    case IDTX:
-      av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                txfm_param->tx_type, txfm_param->bd);
-      break;
-    default: assert(0);
-  }
-}
-
-void av1_highbd_inv_txfm_add_32x8_sse4_1(const tran_low_t *input, uint8_t *dest,
-                                         int stride,
-                                         const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
-                                                txfm_param->tx_size,
-                                                txfm_param->eob, bd);
-      break;
-    case IDTX:
-      av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                txfm_param->tx_type, txfm_param->bd);
+      av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                    tx_type, bd);
       break;
-    default: assert(0);
   }
 }
-
 void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
                                         int stride,
                                         const TxfmParam *txfm_param) {
@@ -5235,53 +5259,271 @@ void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
     av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
     return;
   }
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    default:
-      av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                    tx_type, bd);
-      break;
-  }
+  av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                                bd);
 }
+static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                               int bd, int out_shift) {
+  (void)bit;
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i v[32];
+  for (int i = 0; i < 32; i += 16) {
+    v[i] = _mm_slli_epi32(in[i], 2);
+    v[i + 1] = _mm_slli_epi32(in[i + 1], 2);
+    v[i + 2] = _mm_slli_epi32(in[i + 2], 2);
+    v[i + 3] = _mm_slli_epi32(in[i + 3], 2);
+    v[i + 4] = _mm_slli_epi32(in[i + 4], 2);
+    v[i + 5] = _mm_slli_epi32(in[i + 5], 2);
+    v[i + 6] = _mm_slli_epi32(in[i + 6], 2);
+    v[i + 7] = _mm_slli_epi32(in[i + 7], 2);
+    v[i + 8] = _mm_slli_epi32(in[i + 8], 2);
+    v[i + 9] = _mm_slli_epi32(in[i + 9], 2);
+    v[i + 10] = _mm_slli_epi32(in[i + 10], 2);
+    v[i + 11] = _mm_slli_epi32(in[i + 11], 2);
+    v[i + 12] = _mm_slli_epi32(in[i + 12], 2);
+    v[i + 13] = _mm_slli_epi32(in[i + 13], 2);
+    v[i + 14] = _mm_slli_epi32(in[i + 14], 2);
+    v[i + 15] = _mm_slli_epi32(in[i + 15], 2);
+  }
 
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 32);
+  } else {
+    highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 32);
+  }
+}
 static const transform_1d_sse4_1
     highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
       {
           { idct4x4_sse4_1, NULL, NULL, NULL },
           { iadst4x4_sse4_1, NULL, NULL, NULL },
-          { NULL, NULL, NULL, NULL },
+          { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
       },
       { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
         { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
-        { NULL, NULL, NULL, NULL } },
+        { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
       {
           { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
             NULL },
           { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
             NULL },
-          { NULL, NULL, NULL, NULL },
+          { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
       },
       { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
           idct32x32_sse4_1 },
         { NULL, NULL, NULL, NULL },
-        { NULL, NULL, NULL, NULL } },
+        { iidentity32_sse4_1, NULL, NULL, NULL } },
       { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
           idct64x64_sse4_1 },
         { NULL, NULL, NULL, NULL },
         { NULL, NULL, NULL, NULL } }
     };
+static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
+                                                    uint16_t *output,
+                                                    int stride, TX_TYPE tx_type,
+                                                    TX_SIZE tx_size, int eob,
+                                                    const int bd) {
+  __m128i buf1[64];
+  int eobx, eoby;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int buf_size_w_div4 = input_stride >> 2;
+  const int buf_size_h_div8 = (eoby + 8) >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
+    __m128i buf0[16];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < buf_size_w_div4; ++j) {
+      __m128i *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
+                                           NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+    __m128i *_buf1 = buf1 + i * 4;
+
+    for (int j = 0; j < buf_size_w_div4; ++j) {
+      _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+      _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+      _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+      _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+    }
+  }
+  for (int i = 0; i < buf_size_w_div4; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+                                    buf1 + i * txfm_size_row, txfm_size_row,
+                                    -shift[1]);
+  }
+
+  // write to buffer
+  for (int i = 0; i < (txfm_size_col >> 3); i++) {
+    highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
+                                   stride, ud_flip, txfm_size_row, bd);
+  }
+}
+static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
+                                                    uint16_t *output,
+                                                    int stride, TX_TYPE tx_type,
+                                                    TX_SIZE tx_size, int eob,
+                                                    const int bd) {
+  __m128i buf1[64];
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int buf_size_w_div8 = input_stride >> 2;
+  const int row_max = AOMMIN(32, txfm_size_row);
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < (row_max >> 2); ++i) {
+    __m128i buf0[16];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
+      __m128i *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_sse4_1(
+          buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+    __m128i *_buf1 = buf1 + i * 4;
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                      buf0[4 * j],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(
+            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+      }
+    }
+  }
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+                                    buf1 + i * txfm_size_row, txfm_size_row,
+                                    -shift[1]);
+  }
+
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+                                     output + 8 * i, stride, ud_flip,
+                                     txfm_size_row, bd);
+    }
+  }
+}
+static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
+                                              uint16_t *output, int stride,
+                                              TX_TYPE tx_type, TX_SIZE tx_size,
+                                              int eob, const int bd) {
+  (void)eob;
+  __m128i buf1[64 * 4];
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int row_max = AOMMIN(32, txfm_size_row);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+  for (int i = 0; i < (row_max >> 2); ++i) {
+    __m128i buf0[32];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < (input_stride >> 2); ++j) {
+      __m128i *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
+                                           NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+    __m128i *_buf1 = buf1 + i * 4;
+    for (int j = 0; j < (input_stride >> 2); ++j) {
+      _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+      _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+      _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+      _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+    }
+  }
+  for (int i = 0; i < (input_stride >> 2); i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+                                    buf1 + i * txfm_size_row, txfm_size_row,
+                                    -shift[1]);
+  }
 
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+                                     output + 8 * i, stride, 0, txfm_size_row,
+                                     bd);
+    }
+  }
+}
 static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
                                                     uint16_t *output,
                                                     int stride, TX_TYPE tx_type,
@@ -5613,6 +5855,24 @@ void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
           input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
           bd);
       break;
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      highbd_inv_txfm2d_add_h_identity_ssse41(
+          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+          bd);
+      break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      highbd_inv_txfm2d_add_v_identity_ssse41(
+          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+          bd);
+      break;
+    case IDTX:
+      highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
+                                        stride, tx_type, tx_size, eob, bd);
+      break;
     default: assert(0); break;
   }
 }
@@ -5623,26 +5883,9 @@ void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest,
   int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const TX_SIZE tx_size = txfm_param->tx_size;
-  const int32_t *src = cast_to_int32(input);
   int eob = txfm_param->eob;
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    default:
-      highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
-                                      tx_type, tx_size, eob, bd);
-      break;
-  }
+  highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  tx_type, tx_size, eob, bd);
 }
 
 void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5651,26 +5894,9 @@ void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
   int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const TX_SIZE tx_size = txfm_param->tx_size;
-  const int32_t *src = cast_to_int32(input);
   int eob = txfm_param->eob;
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    default:
-      highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
-                                      tx_type, tx_size, eob, bd);
-      break;
-  }
+  highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  tx_type, tx_size, eob, bd);
 }
 
 void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5679,26 +5905,9 @@ void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest,
   int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const TX_SIZE tx_size = txfm_param->tx_size;
-  const int32_t *src = cast_to_int32(input);
   int eob = txfm_param->eob;
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                                bd);
-      break;
-    default:
-      highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest),
-                                        stride, tx_type, tx_size, eob, bd);
-      break;
-  }
+  highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                    tx_type, tx_size, eob, bd);
 }
 
 void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5707,26 +5916,9 @@ void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest,
   int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const TX_SIZE tx_size = txfm_param->tx_size;
-  const int32_t *src = cast_to_int32(input);
   int eob = txfm_param->eob;
-  switch (tx_type) {
-      // Assembly version doesn't support some transform types, so use C version
-      // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                                bd);
-      break;
-    default:
-      highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest),
-                                        stride, tx_type, tx_size, eob, bd);
-      break;
-  }
+  highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                    tx_type, tx_size, eob, bd);
 }
 
 void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5734,57 +5926,16 @@ void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
-    case TX_32X32:
-      av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
-      break;
-    case TX_16X16:
-      av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
-      break;
-    case TX_8X8:
-      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
-      break;
     case TX_4X8:
       av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
       break;
     case TX_8X4:
       av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
       break;
-    case TX_8X16:
-      av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
-      break;
-    case TX_16X8:
-      av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
-      break;
-    case TX_16X32:
-      av1_highbd_inv_txfm_add_16x32_sse4_1(input, dest, stride, txfm_param);
-      break;
-    case TX_32X16:
-      av1_highbd_inv_txfm_add_32x16_sse4_1(input, dest, stride, txfm_param);
-      break;
-    case TX_4X4:
-      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
-      break;
-    case TX_16X4:
-      av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
-      break;
-    case TX_4X16:
-      av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
-      break;
-    case TX_8X32:
-      av1_highbd_inv_txfm_add_8x32_sse4_1(input, dest, stride, txfm_param);
-      break;
-    case TX_32X8:
-      av1_highbd_inv_txfm_add_32x8_sse4_1(input, dest, stride, txfm_param);
-      break;
-    case TX_64X64:
-    case TX_32X64:
-    case TX_64X32:
-    case TX_16X64:
-    case TX_64X16:
-      av1_highbd_inv_txfm2d_add_universe_sse4_1(
-          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
-          txfm_param->eob, txfm_param->bd);
+    default:
+      // TODO(http://crbug.com/aomedia/2350): the remaining sse4_1 versions
+      // cause test vector mismatches.
+      av1_highbd_inv_txfm_add_c(input, dest, stride, txfm_param);
       break;
-    default: assert(0 && "Invalid transform size"); break;
   }
 }
diff --git a/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c b/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
index e298cf6..c5040c4 100644
--- a/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -22,7 +22,7 @@
 #include "aom_dsp/aom_filter.h"
 #include "av1/common/convolve.h"
 
-void av1_highbd_jnt_convolve_2d_copy_avx2(
+void av1_highbd_dist_wtd_convolve_2d_copy_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -38,7 +38,7 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
   const __m128i left_shift = _mm_cvtsi32_si128(bits);
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int w0 = conv_params->fwd_offset;
   const int w1 = conv_params->bck_offset;
   const __m256i wt0 = _mm256_set1_epi32(w0);
@@ -78,15 +78,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
           const __m256i res_unsigned_lo =
               _mm256_add_epi32(res_32b_lo, offset_const);
 
-          const __m256i comp_avg_res_lo = highbd_comp_avg(
-              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+          const __m256i comp_avg_res_lo =
+              highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+                              use_dist_wtd_comp_avg);
 
           const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
           const __m256i res_unsigned_hi =
               _mm256_add_epi32(res_32b_hi, offset_const);
 
-          const __m256i comp_avg_res_hi = highbd_comp_avg(
-              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+          const __m256i comp_avg_res_hi =
+              highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+                              use_dist_wtd_comp_avg);
 
           const __m256i round_result_lo = highbd_convolve_rounding(
               &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -135,8 +137,9 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
             const __m256i res_unsigned_lo =
                 _mm256_add_epi32(res_32b, offset_const);
 
-            const __m256i comp_avg_res = highbd_comp_avg(
-                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+            const __m256i comp_avg_res =
+                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+                                use_dist_wtd_comp_avg);
 
             const __m256i round_result = highbd_convolve_rounding(
                 &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -179,15 +182,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
             const __m256i res_unsigned_lo =
                 _mm256_add_epi32(res_32b_lo, offset_const);
 
-            const __m256i comp_avg_res_lo = highbd_comp_avg(
-                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+            const __m256i comp_avg_res_lo =
+                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+                                use_dist_wtd_comp_avg);
 
             const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
             const __m256i res_unsigned_hi =
                 _mm256_add_epi32(res_32b_hi, offset_const);
 
-            const __m256i comp_avg_res_hi = highbd_comp_avg(
-                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+            const __m256i comp_avg_res_hi =
+                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+                                use_dist_wtd_comp_avg);
 
             const __m256i round_result_lo =
                 highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
@@ -223,7 +228,7 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
   }
 }
 
-void av1_highbd_jnt_convolve_2d_avx2(
+void av1_highbd_dist_wtd_convolve_2d_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -244,7 +249,7 @@ void av1_highbd_jnt_convolve_2d_avx2(
 
   __m256i s[8], coeffs_y[4], coeffs_x[4];
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
 
   const int w0 = conv_params->fwd_offset;
   const int w1 = conv_params->bck_offset;
@@ -364,8 +369,9 @@ void av1_highbd_jnt_convolve_2d_avx2(
 
             const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
 
-            const __m256i comp_avg_res = highbd_comp_avg(
-                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+            const __m256i comp_avg_res =
+                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+                                use_dist_wtd_comp_avg);
 
             const __m256i round_result = highbd_convolve_rounding(
                 &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -409,10 +415,12 @@ void av1_highbd_jnt_convolve_2d_avx2(
             const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
             const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
 
-            const __m256i comp_avg_res_lo = highbd_comp_avg(
-                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
-            const __m256i comp_avg_res_hi = highbd_comp_avg(
-                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+            const __m256i comp_avg_res_lo =
+                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+                                use_dist_wtd_comp_avg);
+            const __m256i comp_avg_res_hi =
+                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+                                use_dist_wtd_comp_avg);
 
             const __m256i round_result_lo =
                 highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
@@ -456,7 +464,7 @@ void av1_highbd_jnt_convolve_2d_avx2(
   }
 }
 
-void av1_highbd_jnt_convolve_x_avx2(
+void av1_highbd_dist_wtd_convolve_x_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -473,7 +481,7 @@ void av1_highbd_jnt_convolve_x_avx2(
   __m256i s[4], coeffs_x[4];
 
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int w0 = conv_params->fwd_offset;
   const int w1 = conv_params->bck_offset;
   const __m256i wt0 = _mm256_set1_epi32(w0);
@@ -548,7 +556,7 @@ void av1_highbd_jnt_convolve_x_avx2(
           const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
 
           const __m256i comp_avg_res = highbd_comp_avg(
-              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
 
           const __m256i round_result = highbd_convolve_rounding(
               &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -588,10 +596,12 @@ void av1_highbd_jnt_convolve_x_avx2(
           const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
           const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
 
-          const __m256i comp_avg_res_lo = highbd_comp_avg(
-              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
-          const __m256i comp_avg_res_hi = highbd_comp_avg(
-              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+          const __m256i comp_avg_res_lo =
+              highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+                              use_dist_wtd_comp_avg);
+          const __m256i comp_avg_res_hi =
+              highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+                              use_dist_wtd_comp_avg);
 
           const __m256i round_result_lo = highbd_convolve_rounding(
               &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -623,7 +633,7 @@ void av1_highbd_jnt_convolve_x_avx2(
   }
 }
 
-void av1_highbd_jnt_convolve_y_avx2(
+void av1_highbd_dist_wtd_convolve_y_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -640,7 +650,7 @@ void av1_highbd_jnt_convolve_y_avx2(
   int i, j;
   __m256i s[8], coeffs_y[4];
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
 
   const int w0 = conv_params->fwd_offset;
   const int w1 = conv_params->bck_offset;
@@ -753,8 +763,9 @@ void av1_highbd_jnt_convolve_y_avx2(
 
             const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
 
-            const __m256i comp_avg_res = highbd_comp_avg(
-                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+            const __m256i comp_avg_res =
+                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+                                use_dist_wtd_comp_avg);
 
             const __m256i round_result = highbd_convolve_rounding(
                 &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -799,10 +810,12 @@ void av1_highbd_jnt_convolve_y_avx2(
             const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
             const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
 
-            const __m256i comp_avg_res_lo = highbd_comp_avg(
-                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
-            const __m256i comp_avg_res_hi = highbd_comp_avg(
-                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+            const __m256i comp_avg_res_lo =
+                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+                                use_dist_wtd_comp_avg);
+            const __m256i comp_avg_res_hi =
+                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+                                use_dist_wtd_comp_avg);
 
             const __m256i round_result_lo =
                 highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
diff --git a/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c b/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
index 1a29985..7fea36a 100644
--- a/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -17,7 +17,7 @@
 #include "aom_dsp/x86/convolve_sse2.h"
 #include "aom_dsp/x86/convolve_sse4_1.h"
 
-void av1_highbd_jnt_convolve_y_sse4_1(
+void av1_highbd_dist_wtd_convolve_y_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -33,7 +33,7 @@ void av1_highbd_jnt_convolve_y_sse4_1(
   assert(bits >= 0);
   int i, j;
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
 
   const int w0 = conv_params->fwd_offset;
   const int w1 = conv_params->bck_offset;
@@ -121,10 +121,12 @@ void av1_highbd_jnt_convolve_y_sse4_1(
             const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
             const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
 
-            const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1(
-                &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg);
-            const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1(
-                &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg);
+            const __m128i comp_avg_res_0 =
+                highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0,
+                                       &wt1, use_dist_wtd_comp_avg);
+            const __m128i comp_avg_res_1 =
+                highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0,
+                                       &wt1, use_dist_wtd_comp_avg);
 
             const __m128i round_result_0 =
                 highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const,
@@ -186,16 +188,16 @@ void av1_highbd_jnt_convolve_y_sse4_1(
 
             const __m128i comp_avg_res_lo_0 =
                 highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0,
-                                       &wt0, &wt1, use_jnt_comp_avg);
+                                       &wt0, &wt1, use_dist_wtd_comp_avg);
             const __m128i comp_avg_res_lo_1 =
                 highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1,
-                                       &wt0, &wt1, use_jnt_comp_avg);
+                                       &wt0, &wt1, use_dist_wtd_comp_avg);
             const __m128i comp_avg_res_hi_0 =
                 highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0,
-                                       &wt0, &wt1, use_jnt_comp_avg);
+                                       &wt0, &wt1, use_dist_wtd_comp_avg);
             const __m128i comp_avg_res_hi_1 =
                 highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1,
-                                       &wt0, &wt1, use_jnt_comp_avg);
+                                       &wt0, &wt1, use_dist_wtd_comp_avg);
 
             const __m128i round_result_lo_0 =
                 highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const,
@@ -257,7 +259,7 @@ void av1_highbd_jnt_convolve_y_sse4_1(
   }
 }
 
-void av1_highbd_jnt_convolve_x_sse4_1(
+void av1_highbd_dist_wtd_convolve_x_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
     int h, const InterpFilterParams *filter_params_x,
     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -274,7 +276,7 @@ void av1_highbd_jnt_convolve_x_sse4_1(
   __m128i s[4], coeffs_x[4];
 
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int w0 = conv_params->fwd_offset;
   const int w1 = conv_params->bck_offset;
   const __m128i wt0 = _mm_set1_epi32(w0);
@@ -339,7 +341,7 @@ void av1_highbd_jnt_convolve_x_sse4_1(
           const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
 
           const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
-              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
           const __m128i round_result = highbd_convolve_rounding_sse2(
               &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
 
@@ -359,10 +361,12 @@ void av1_highbd_jnt_convolve_x_sse4_1(
           const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
           const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
 
-          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
-              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
-          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
-              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+          const __m128i comp_avg_res_lo =
+              highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+                                     &wt1, use_dist_wtd_comp_avg);
+          const __m128i comp_avg_res_hi =
+              highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+                                     &wt1, use_dist_wtd_comp_avg);
 
           const __m128i round_result_lo = highbd_convolve_rounding_sse2(
               &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/highbd_warp_plane_sse4.c b/libaom/av1/common/x86/highbd_warp_plane_sse4.c
index 4bcab05..3765c5e 100644
--- a/libaom/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/libaom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -537,7 +537,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
             __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
             __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));
 
-            if (conv_params->use_jnt_comp_avg) {
+            if (conv_params->use_dist_wtd_comp_avg) {
               res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                      _mm_mullo_epi32(res_lo, wt1));
               res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
@@ -570,7 +570,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                   (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
               __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));
 
-              if (conv_params->use_jnt_comp_avg) {
+              if (conv_params->use_dist_wtd_comp_avg) {
                 res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
                                        _mm_mullo_epi32(res_hi, wt1));
                 res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
diff --git a/libaom/av1/common/x86/jnt_convolve_avx2.c b/libaom/av1/common/x86/jnt_convolve_avx2.c
index 9f2e2b4..23cd6ab 100644
--- a/libaom/av1/common/x86/jnt_convolve_avx2.c
+++ b/libaom/av1/common/x86/jnt_convolve_avx2.c
@@ -35,22 +35,20 @@ static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
       _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
 }
 
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
-                             int dst_stride0, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_q4, const int subpel_y_q4,
-                             ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst0, int dst_stride0, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int bd = 8;
-  int i, j;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint8_t *const src_ptr = src - fo_horiz;
+  int i, j, is_horiz_4tap = 0;
   const int bits = FILTER_BITS - conv_params->round_1;
   const __m256i wt = unpack_weights_avx2(conv_params);
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int offset_0 =
       bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -58,18 +56,10 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const int rounding_shift =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
-  __m256i filt[4], coeffs[4];
 
   assert(bits >= 0);
   assert(conv_params->round_0 > 0);
 
-  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
-
   const __m256i round_const =
       _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
   const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
@@ -77,68 +67,136 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   (void)filter_params_y;
   (void)subpel_y_q4;
 
-  for (i = 0; i < h; i += 2) {
-    const uint8_t *src_data = src_ptr + i * src_stride;
-    CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
-    for (j = 0; j < w; j += 8) {
-      const __m256i data =
-          load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+  __m256i filt[4], coeffs[4];
 
-      __m256i res = convolve_lowbd_x(data, coeffs, filt);
+  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
 
-      res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
 
-      res = _mm256_slli_epi16(res, bits);
+  // Condition for checking valid horz_filt taps
+  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+    is_horiz_4tap = 1;
 
-      const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+  // horz_filt as 4 tap
+  if (is_horiz_4tap) {
+    const int fo_horiz = 1;
+    const uint8_t *const src_ptr = src - fo_horiz;
+    for (i = 0; i < h; i += 2) {
+      const uint8_t *src_data = src_ptr + i * src_stride;
+      CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+      for (j = 0; j < w; j += 8) {
+        const __m256i data =
+            load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
 
-      // Accumulate values into the destination buffer
-      if (do_average) {
-        const __m256i data_ref_0 =
-            load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
-        const __m256i comp_avg_res =
-            comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+        __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+        res = _mm256_slli_epi16(res, bits);
 
-        const __m256i round_result = convolve_rounding(
-            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
 
-        const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
-        const __m128i res_0 = _mm256_castsi256_si128(res_8);
-        const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+        // Accumulate values into the destination buffer
+        if (do_average) {
+          const __m256i data_ref_0 =
+              load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+          const __m256i comp_avg_res =
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+          const __m256i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
 
-        if (w > 4) {
-          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-          _mm_storel_epi64(
-              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+          const __m128i res_0 = _mm256_castsi256_si128(res_8);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+          if (w > 4) {
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storel_epi64(
+                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+          } else {
+            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+                _mm_cvtsi128_si32(res_0);
+            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+                _mm_cvtsi128_si32(res_1);
+          }
         } else {
-          *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
-          *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
-              _mm_cvtsi128_si32(res_1);
+          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                          res_1);
         }
-      } else {
-        const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
-        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+      }
+    }
+  } else {
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_horiz;
+
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+    for (i = 0; i < h; i += 2) {
+      const uint8_t *src_data = src_ptr + i * src_stride;
+      CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+      for (j = 0; j < w; j += 8) {
+        const __m256i data =
+            load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+
+        __m256i res = convolve_lowbd_x(data, coeffs, filt);
+
+        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+
+        res = _mm256_slli_epi16(res, bits);
+
+        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+        // Accumulate values into the destination buffer
+        if (do_average) {
+          const __m256i data_ref_0 =
+              load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+          const __m256i comp_avg_res =
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+          const __m256i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+          const __m128i res_0 = _mm256_castsi256_si128(res_8);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+          if (w > 4) {
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storel_epi64(
+                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+          } else {
+            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+                _mm_cvtsi128_si32(res_0);
+            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+                _mm_cvtsi128_si32(res_1);
+          }
+        } else {
+          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
 
-        const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
-        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                        res_1);
+          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                          res_1);
+        }
       }
     }
   }
 }
 
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
-                             int dst_stride0, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_q4, const int subpel_y_q4,
-                             ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst0, int dst_stride0, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int bd = 8;
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const uint8_t *const src_ptr = src - fo_vert * src_stride;
+  int i, j, is_vert_4tap = 0;
   // +1 to compensate for dividing the filter coeffs by 2
   const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
   const __m256i round_const =
@@ -146,7 +204,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
   const __m256i wt = unpack_weights_avx2(conv_params);
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int offset_0 =
       bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -168,195 +226,389 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   (void)filter_params_x;
   (void)subpel_x_q4;
 
-  for (j = 0; j < w; j += 16) {
-    const uint8_t *data = &src_ptr[j];
-    __m256i src6;
-    // Load lines a and b. Line a to lower 128, line b to upper 128
-    {
-      __m256i src_ab[7];
-      __m256i src_a[7];
-      src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
-      for (int kk = 0; kk < 6; ++kk) {
-        data += src_stride;
-        src_a[kk + 1] =
-            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
-        src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+  // Condition for checking valid vert_filt taps
+  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+    is_vert_4tap = 1;
+
+  if (is_vert_4tap) {
+    const int fo_vert = 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride;
+    for (j = 0; j < w; j += 16) {
+      const uint8_t *data = &src_ptr[j];
+      __m256i src4;
+      // Load lines a and b. Line a to lower 128, line b to upper 128
+      {
+        __m256i src_ab[4];
+        __m256i src_a[5];
+        src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+        for (int kk = 0; kk < 4; ++kk) {
+          data += src_stride;
+          src_a[kk + 1] =
+              _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+          src_ab[kk] =
+              _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+        }
+        src4 = src_a[4];
+        s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+        s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+
+        s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+        s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
       }
-      src6 = src_a[6];
-      s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
-      s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
-      s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
-      s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
-      s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
-      s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
-    }
 
-    for (i = 0; i < h; i += 2) {
-      data = &src_ptr[(i + 7) * src_stride + j];
-      const __m256i src7 =
-          _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
-      const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[(i + 5) * src_stride + j];
+        const __m256i src5 =
+            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+        const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20);
 
-      src6 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)(data + src_stride)));
-      const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
+        src4 = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + src_stride)));
+        const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20);
 
-      s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
-      s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
 
-      __m256i res_lo = convolve_lowbd(s, coeffs);
+        __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
 
-      res_lo = _mm256_add_epi16(res_lo, offset_const_1);
+        res_lo = _mm256_add_epi16(res_lo, offset_const_1);
 
-      const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
-      const __m256i res_lo_0_shift =
-          _mm256_slli_epi32(res_lo_0_32b, left_shift);
-      const __m256i res_lo_0_round = _mm256_sra_epi32(
-          _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+        const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+        const __m256i res_lo_0_shift =
+            _mm256_slli_epi32(res_lo_0_32b, left_shift);
+        const __m256i res_lo_0_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
 
-      const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
-      const __m256i res_lo_1_shift =
-          _mm256_slli_epi32(res_lo_1_32b, left_shift);
-      const __m256i res_lo_1_round = _mm256_sra_epi32(
-          _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+        const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+        const __m256i res_lo_1_shift =
+            _mm256_slli_epi32(res_lo_1_32b, left_shift);
+        const __m256i res_lo_1_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
 
-      const __m256i res_lo_round =
-          _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
+        const __m256i res_lo_round =
+            _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
 
-      const __m256i res_lo_unsigned =
-          _mm256_add_epi16(res_lo_round, offset_const_2);
+        const __m256i res_lo_unsigned =
+            _mm256_add_epi16(res_lo_round, offset_const_2);
 
-      if (w - j < 16) {
-        if (do_average) {
-          const __m256i data_ref_0 = load_line2_avx2(
-              &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
-          const __m256i comp_avg_res =
-              comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+        if (w - j < 16) {
+          if (do_average) {
+            const __m256i data_ref_0 =
+                load_line2_avx2(&dst[i * dst_stride + j],
+                                &dst[i * dst_stride + j + dst_stride]);
+            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+                                                  &wt, use_dist_wtd_comp_avg);
 
-          const __m256i round_result = convolve_rounding(
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+            const __m256i round_result = convolve_rounding(
+                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
 
-          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
-          const __m128i res_0 = _mm256_castsi256_si128(res_8);
-          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+            const __m256i res_8 =
+                _mm256_packus_epi16(round_result, round_result);
+            const __m128i res_0 = _mm256_castsi256_si128(res_8);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
 
-          if (w - j > 4) {
-            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_storel_epi64(
-                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+            if (w - j > 4) {
+              _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+              _mm_storel_epi64(
+                  (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+                  res_1);
+            } else {
+              *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+                  _mm_cvtsi128_si32(res_0);
+              *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+                  _mm_cvtsi128_si32(res_1);
+            }
           } else {
-            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
-                _mm_cvtsi128_si32(res_0);
-            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
-                _mm_cvtsi128_si32(res_1);
+            const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+            const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_1);
           }
         } else {
-          const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+          __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
 
-          const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                          res_1);
+          res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+          const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+          const __m256i res_hi_0_shift =
+              _mm256_slli_epi32(res_hi_0_32b, left_shift);
+          const __m256i res_hi_0_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+          const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+          const __m256i res_hi_1_shift =
+              _mm256_slli_epi32(res_hi_1_32b, left_shift);
+          const __m256i res_hi_1_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+          const __m256i res_hi_round =
+              _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+          const __m256i res_hi_unsigned =
+              _mm256_add_epi16(res_hi_round, offset_const_2);
+
+          if (do_average) {
+            const __m256i data_ref_0_lo =
+                load_line2_avx2(&dst[i * dst_stride + j],
+                                &dst[i * dst_stride + j + dst_stride]);
+
+            const __m256i data_ref_0_hi =
+                load_line2_avx2(&dst[i * dst_stride + j + 8],
+                                &dst[i * dst_stride + j + 8 + dst_stride]);
+
+            const __m256i comp_avg_res_lo = comp_avg(
+                &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+            const __m256i comp_avg_res_hi = comp_avg(
+                &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+            const __m256i round_result_lo =
+                convolve_rounding(&comp_avg_res_lo, &offset_const,
+                                  &rounding_const, rounding_shift);
+
+            const __m256i round_result_hi =
+                convolve_rounding(&comp_avg_res_hi, &offset_const,
+                                  &rounding_const, rounding_shift);
+
+            const __m256i res_8 =
+                _mm256_packus_epi16(round_result_lo, round_result_hi);
+            const __m128i res_0 = _mm256_castsi256_si128(res_8);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_store_si128(
+                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+          } else {
+            const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+            const __m128i res_lo_1 =
+                _mm256_extracti128_si256(res_lo_unsigned, 1);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_lo_1);
+
+            const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+                            res_hi_0);
+
+            const __m128i res_hi_1 =
+                _mm256_extracti128_si256(res_hi_unsigned, 1);
+            _mm_store_si128(
+                (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+                res_hi_1);
+          }
+        }
+        s[0] = s[1];
+        s[1] = s[2];
+
+        s[3] = s[4];
+        s[4] = s[5];
+      }
+    }
+  } else {
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride;
+    for (j = 0; j < w; j += 16) {
+      const uint8_t *data = &src_ptr[j];
+      __m256i src6;
+      // Load lines a and b. Line a to lower 128, line b to upper 128
+      {
+        __m256i src_ab[7];
+        __m256i src_a[7];
+        src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+        for (int kk = 0; kk < 6; ++kk) {
+          data += src_stride;
+          src_a[kk + 1] =
+              _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+          src_ab[kk] =
+              _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
         }
-      } else {
-        __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+        src6 = src_a[6];
+        s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+        s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+        s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+        s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+        s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+        s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+      }
 
-        res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[(i + 7) * src_stride + j];
+        const __m256i src7 =
+            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+        const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
 
-        const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
-        const __m256i res_hi_0_shift =
-            _mm256_slli_epi32(res_hi_0_32b, left_shift);
-        const __m256i res_hi_0_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+        src6 = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + src_stride)));
+        const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
 
-        const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
-        const __m256i res_hi_1_shift =
-            _mm256_slli_epi32(res_hi_1_32b, left_shift);
-        const __m256i res_hi_1_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
 
-        const __m256i res_hi_round =
-            _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+        __m256i res_lo = convolve_lowbd(s, coeffs);
 
-        const __m256i res_hi_unsigned =
-            _mm256_add_epi16(res_hi_round, offset_const_2);
+        res_lo = _mm256_add_epi16(res_lo, offset_const_1);
 
-        if (do_average) {
-          const __m256i data_ref_0_lo = load_line2_avx2(
-              &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+        const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+        const __m256i res_lo_0_shift =
+            _mm256_slli_epi32(res_lo_0_32b, left_shift);
+        const __m256i res_lo_0_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
 
-          const __m256i data_ref_0_hi =
-              load_line2_avx2(&dst[i * dst_stride + j + 8],
-                              &dst[i * dst_stride + j + 8 + dst_stride]);
+        const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+        const __m256i res_lo_1_shift =
+            _mm256_slli_epi32(res_lo_1_32b, left_shift);
+        const __m256i res_lo_1_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
 
-          const __m256i comp_avg_res_lo =
-              comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+        const __m256i res_lo_round =
+            _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
 
-          const __m256i comp_avg_res_hi =
-              comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg);
+        const __m256i res_lo_unsigned =
+            _mm256_add_epi16(res_lo_round, offset_const_2);
 
-          const __m256i round_result_lo = convolve_rounding(
-              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+        if (w - j < 16) {
+          if (do_average) {
+            const __m256i data_ref_0 =
+                load_line2_avx2(&dst[i * dst_stride + j],
+                                &dst[i * dst_stride + j + dst_stride]);
+            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+                                                  &wt, use_dist_wtd_comp_avg);
 
-          const __m256i round_result_hi = convolve_rounding(
-              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+            const __m256i round_result = convolve_rounding(
+                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
 
-          const __m256i res_8 =
-              _mm256_packus_epi16(round_result_lo, round_result_hi);
-          const __m128i res_0 = _mm256_castsi256_si128(res_8);
-          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+            const __m256i res_8 =
+                _mm256_packus_epi16(round_result, round_result);
+            const __m128i res_0 = _mm256_castsi256_si128(res_8);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
 
-          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-          _mm_store_si128(
-              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+            if (w - j > 4) {
+              _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+              _mm_storel_epi64(
+                  (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+                  res_1);
+            } else {
+              *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+                  _mm_cvtsi128_si32(res_0);
+              *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+                  _mm_cvtsi128_si32(res_1);
+            }
+          } else {
+            const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
 
+            const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_1);
+          }
         } else {
-          const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+          __m256i res_hi = convolve_lowbd(s + 4, coeffs);
 
-          const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                          res_lo_1);
+          res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+          const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+          const __m256i res_hi_0_shift =
+              _mm256_slli_epi32(res_hi_0_32b, left_shift);
+          const __m256i res_hi_0_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+          const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+          const __m256i res_hi_1_shift =
+              _mm256_slli_epi32(res_hi_1_32b, left_shift);
+          const __m256i res_hi_1_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+          const __m256i res_hi_round =
+              _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+          const __m256i res_hi_unsigned =
+              _mm256_add_epi16(res_hi_round, offset_const_2);
+
+          if (do_average) {
+            const __m256i data_ref_0_lo =
+                load_line2_avx2(&dst[i * dst_stride + j],
+                                &dst[i * dst_stride + j + dst_stride]);
 
-          const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0);
+            const __m256i data_ref_0_hi =
+                load_line2_avx2(&dst[i * dst_stride + j + 8],
+                                &dst[i * dst_stride + j + 8 + dst_stride]);
 
-          const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1);
-          _mm_store_si128(
-              (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1);
+            const __m256i comp_avg_res_lo = comp_avg(
+                &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+            const __m256i comp_avg_res_hi = comp_avg(
+                &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+            const __m256i round_result_lo =
+                convolve_rounding(&comp_avg_res_lo, &offset_const,
+                                  &rounding_const, rounding_shift);
+
+            const __m256i round_result_hi =
+                convolve_rounding(&comp_avg_res_hi, &offset_const,
+                                  &rounding_const, rounding_shift);
+
+            const __m256i res_8 =
+                _mm256_packus_epi16(round_result_lo, round_result_hi);
+            const __m128i res_0 = _mm256_castsi256_si128(res_8);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_store_si128(
+                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+          } else {
+            const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+            const __m128i res_lo_1 =
+                _mm256_extracti128_si256(res_lo_unsigned, 1);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_lo_1);
+
+            const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+                            res_hi_0);
+
+            const __m128i res_hi_1 =
+                _mm256_extracti128_si256(res_hi_unsigned, 1);
+            _mm_store_si128(
+                (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+                res_hi_1);
+          }
         }
-      }
-      s[0] = s[1];
-      s[1] = s[2];
-      s[2] = s[3];
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
 
-      s[4] = s[5];
-      s[5] = s[6];
-      s[6] = s[7];
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
     }
   }
 }
 
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
-                              int dst_stride0, int w, int h,
-                              const InterpFilterParams *filter_params_x,
-                              const InterpFilterParams *filter_params_y,
-                              const int subpel_x_q4, const int subpel_y_q4,
-                              ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
+                                   uint8_t *dst0, int dst_stride0, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int bd = 8;
 
   DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
-  int im_h = h + filter_params_y->taps - 1;
+
   int im_stride = 8;
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+  int i, is_horiz_4tap = 0, is_vert_4tap = 0;
   const __m256i wt = unpack_weights_avx2(conv_params);
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int offset_0 =
       bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -364,18 +616,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const int rounding_shift =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
-  __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];
 
   assert(conv_params->round_0 > 0);
 
-  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
   const __m256i round_const_h = _mm256_set1_epi16(
       ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
   const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
@@ -385,9 +628,29 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
       (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
   const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
 
-  for (j = 0; j < w; j += 8) {
-    /* Horizontal filter */
-    {
+  __m256i filt[4], coeffs_x[4], coeffs_y[4];
+
+  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  // Condition for checking valid horz_filt taps
+  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
+    is_horiz_4tap = 1;
+
+  // Condition for checking valid vert_filt taps
+  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
+    is_vert_4tap = 1;
+
+  if (is_horiz_4tap) {
+    int im_h = h + filter_params_y->taps - 1;
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const int fo_horiz = 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+    for (int j = 0; j < w; j += 8) {
+      /* Horizontal filter */
       const uint8_t *src_h = src_ptr + j;
       for (i = 0; i < im_h; i += 2) {
         __m256i data =
@@ -396,49 +659,59 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
           data = _mm256_inserti128_si256(
               data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
         src_h += (src_stride << 1);
-        __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
+        __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
 
         res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
                                round_shift_h);
 
         _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
       }
+      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
     }
+  } else if (is_vert_4tap) {
+    int im_h = h + 3;
+    const int fo_vert = 1;
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    for (int j = 0; j < w; j += 8) {
+      /* Horizontal filter */
+      const uint8_t *src_h = src_ptr + j;
+      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
 
-    /* Vertical filter */
-    {
+      /* Vertical filter */
+      __m256i s[6];
       __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
       __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
       __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
       __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
-      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
-      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
 
       s[0] = _mm256_unpacklo_epi16(s0, s1);
       s[1] = _mm256_unpacklo_epi16(s2, s3);
-      s[2] = _mm256_unpacklo_epi16(s4, s5);
 
-      s[4] = _mm256_unpackhi_epi16(s0, s1);
-      s[5] = _mm256_unpackhi_epi16(s2, s3);
-      s[6] = _mm256_unpackhi_epi16(s4, s5);
+      s[3] = _mm256_unpackhi_epi16(s0, s1);
+      s[4] = _mm256_unpackhi_epi16(s2, s3);
 
       for (i = 0; i < h; i += 2) {
         const int16_t *data = &im_block[i * im_stride];
 
-        const __m256i s6 =
-            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
-        const __m256i s7 =
-            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+        const __m256i s4 =
+            _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+        const __m256i s5 =
+            _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
 
-        s[3] = _mm256_unpacklo_epi16(s6, s7);
-        s[7] = _mm256_unpackhi_epi16(s6, s7);
+        s[2] = _mm256_unpacklo_epi16(s4, s5);
+        s[5] = _mm256_unpackhi_epi16(s4, s5);
 
-        const __m256i res_a = convolve(s, coeffs_y);
+        const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
         const __m256i res_a_round = _mm256_sra_epi32(
             _mm256_add_epi32(res_a, round_const_v), round_shift_v);
 
         if (w - j > 4) {
-          const __m256i res_b = convolve(s + 4, coeffs_y);
+          const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
           const __m256i res_b_round = _mm256_sra_epi32(
               _mm256_add_epi32(res_b, round_const_v), round_shift_v);
           const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
@@ -448,8 +721,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
             const __m256i data_ref_0 =
                 load_line2_avx2(&dst[i * dst_stride + j],
                                 &dst[i * dst_stride + j + dst_stride]);
-            const __m256i comp_avg_res =
-                comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+                                                  &wt, use_dist_wtd_comp_avg);
 
             const __m256i round_result = convolve_rounding(
                 &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -479,8 +752,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
                 load_line2_avx2(&dst[i * dst_stride + j],
                                 &dst[i * dst_stride + j + dst_stride]);
 
-            const __m256i comp_avg_res =
-                comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+                                                  &wt, use_dist_wtd_comp_avg);
 
             const __m256i round_result = convolve_rounding(
                 &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -504,25 +777,36 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
                             res_1);
           }
         }
-
         s[0] = s[1];
         s[1] = s[2];
-        s[2] = s[3];
-
+        s[3] = s[4];
         s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
       }
     }
+  } else {
+    int im_h = h + filter_params_y->taps - 1;
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    for (int j = 0; j < w; j += 8) {
+      /* Horizontal filter */
+      const uint8_t *src_h = src_ptr + j;
+      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+
+      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+    }
   }
 }
 
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
-                                   uint8_t *dst0, int dst_stride0, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
-                                   ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_avx2(
+    const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -535,7 +819,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
   const __m128i left_shift = _mm_cvtsi32_si128(bits);
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const __m256i wt = unpack_weights_avx2(conv_params);
   const __m256i zero = _mm256_setzero_si256();
 
@@ -562,7 +846,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
               _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
 
           const __m256i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
 
           const __m256i round_result = convolve_rounding(
               &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -600,7 +884,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
           const __m256i data_ref_0 = load_line2_avx2(
               &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
           const __m256i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
 
           const __m256i round_result = convolve_rounding(
               &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/jnt_convolve_sse2.c b/libaom/av1/common/x86/jnt_convolve_sse2.c
index 7f5677b..641cd02 100644
--- a/libaom/av1/common/x86/jnt_convolve_sse2.c
+++ b/libaom/av1/common/x86/jnt_convolve_sse2.c
@@ -16,12 +16,12 @@
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
 
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
-                             int dst_stride0, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_q4, const int subpel_y_q4,
-                             ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst0, int dst_stride0, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
   const int dst_stride = conv_params->dst_stride;
@@ -37,7 +37,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const __m128i wt1 = _mm_set1_epi16(w1);
   const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int offset_0 =
       bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -77,7 +77,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
         const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
 
         const __m128i comp_avg_res =
-            comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
 
         const __m128i round_result = convolve_rounding(
             &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -134,7 +134,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
               _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
 
           const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
 
           const __m128i round_result = convolve_rounding(
               &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -150,12 +150,12 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
   }
 }
 
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
-                             int dst_stride0, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_q4, const int subpel_y_q4,
-                             ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst0, int dst_stride0, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
   const int dst_stride = conv_params->dst_stride;
@@ -167,7 +167,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
   const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int offset_0 =
       bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -225,7 +225,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
         const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
 
         const __m128i comp_avg_res =
-            comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
 
         const __m128i round_result = convolve_rounding(
             &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -254,7 +254,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
         const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
 
         const __m128i comp_avg_res =
-            comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
 
         const __m128i round_result = convolve_rounding(
             &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -331,7 +331,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
               _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
 
           const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
 
           const __m128i round_result = convolve_rounding(
               &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -360,7 +360,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
               _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
 
           const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
 
           const __m128i round_result = convolve_rounding(
               &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -384,12 +384,12 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
   }
 }
 
-void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
-                              int dst_stride0, int w, int h,
-                              const InterpFilterParams *filter_params_x,
-                              const InterpFilterParams *filter_params_y,
-                              const int subpel_x_q4, const int subpel_y_q4,
-                              ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride,
+                                   uint8_t *dst0, int dst_stride0, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int bd = 8;
@@ -402,7 +402,7 @@ void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
 
   const __m128i zero = _mm_setzero_si128();
@@ -594,7 +594,7 @@ void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
               _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
 
           const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
 
           const __m128i round_result = convolve_rounding(
               &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/jnt_convolve_ssse3.c b/libaom/av1/common/x86/jnt_convolve_ssse3.c
index 8227727..9aeab29 100644
--- a/libaom/av1/common/x86/jnt_convolve_ssse3.c
+++ b/libaom/av1/common/x86/jnt_convolve_ssse3.c
@@ -16,12 +16,11 @@
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
 
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
-                               uint8_t *dst0, int dst_stride0, int w, int h,
-                               const InterpFilterParams *filter_params_x,
-                               const InterpFilterParams *filter_params_y,
-                               const int subpel_x_q4, const int subpel_y_q4,
-                               ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_ssse3(
+    const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int bd = 8;
@@ -34,7 +33,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int do_average = conv_params->do_average;
-  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
 
   const __m128i zero = _mm_setzero_si128();
@@ -211,7 +210,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
               _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
 
           const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
 
           const __m128i round_result = convolve_rounding(
               &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/warp_plane_sse4.c b/libaom/av1/common/x86/warp_plane_sse4.c
index b810cea..4532d17 100644
--- a/libaom/av1/common/x86/warp_plane_sse4.c
+++ b/libaom/av1/common/x86/warp_plane_sse4.c
@@ -577,7 +577,7 @@ static INLINE void store_vertical_filter_output(
       __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
       const __m128i p_16 = _mm_loadl_epi64(p);
 
-      if (conv_params->use_jnt_comp_avg) {
+      if (conv_params->use_dist_wtd_comp_avg) {
         const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
         const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
         const __m128i shifted_32 =
@@ -610,7 +610,7 @@ static INLINE void store_vertical_filter_output(
             (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
         const __m128i p4_16 = _mm_loadl_epi64(p4);
 
-        if (conv_params->use_jnt_comp_avg) {
+        if (conv_params->use_dist_wtd_comp_avg) {
           const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
           const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
           const __m128i shifted_32 =
diff --git a/libaom/av1/common/x86/wiener_convolve_avx2.c b/libaom/av1/common/x86/wiener_convolve_avx2.c
index 1f13e2f..87a6e12 100644
--- a/libaom/av1/common/x86/wiener_convolve_avx2.c
+++ b/libaom/av1/common/x86/wiener_convolve_avx2.c
@@ -17,7 +17,6 @@
 #include "av1/common/convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
 
@@ -26,207 +25,236 @@
 // on the left.
 // A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
 // loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
-
-// Exploiting the range of wiener filter coefficients,
-// horizontal filtering can be done in 16 bit intermediate precision.
-// The details are as follows :
-// Consider the horizontal wiener filter coefficients of the following form :
-//      [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0]
-// Subtracting  2^(FILTER_BITS) from the centre tap we get the following  :
-//      [C0, C1, C2,     -2 * (C0 + C1 + C2),             C2, C1, C0]
-// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3
-// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit
-// precision. Finally, after rounding the above result by round_0, we multiply
-// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the
-// horizontal filter output.
-
 void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h,
                                       const ConvolveParams *conv_params) {
+  const int bd = 8;
   assert(x_step_q4 == 16 && y_step_q4 == 16);
   assert(!(w & 7));
   (void)x_step_q4;
   (void)y_step_q4;
 
-  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
-  int im_h = h + SUBPEL_TAPS - 2;
-  int im_stride = 8;
-  memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE);
-  int i, j;
-  const int center_tap = (SUBPEL_TAPS - 1) / 2;
+  DECLARE_ALIGNED(32, uint16_t,
+                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+  int intermediate_height = h + SUBPEL_TAPS - 2;
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
+  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
   const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
 
-  __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;
-
-  assert(conv_params->round_0 > 0);
-
-  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
-  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
-  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
-
-  filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);
-
-  const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
-  const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);
-
-  // coeffs 0 1 0 1 0 1 0 1
-  coeffs_h[0] =
-      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
-  // coeffs 2 3 2 3 2 3 2 3
-  coeffs_h[1] =
-      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
-  // coeffs 4 5 4 5 4 5 4 5
-  coeffs_h[2] =
-      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
-  // coeffs 6 7 6 7 6 7 6 7
-  coeffs_h[3] =
-      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
-
-  const __m256i round_const_h =
-      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
-  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);
+  const __m128i zero_128 = _mm_setzero_si128();
+  const __m256i zero_256 = _mm256_setzero_si256();
 
   // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i zero_128 = _mm_setzero_si128();
-  const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
-  const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
-
-  const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
-
-  // coeffs 0 1 0 1 0 1 0 1
-  coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
-  // coeffs 2 3 2 3 2 3 2 3
-  coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
-  // coeffs 4 5 4 5 4 5 4 5
-  coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
-  // coeffs 6 7 6 7 6 7 6 7
-  coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);
-
-  const __m256i round_const_v =
-      _mm256_set1_epi32((1 << (conv_params->round_1 - 1)));
-  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
-
-  for (j = 0; j < w; j += 8) {
-    for (i = 0; i < im_h; i += 2) {
-      __m256i data = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
-
-      // Load the next line
-      if (i + 1 < im_h)
-        data = _mm256_inserti128_si256(
-            data,
-            _mm_loadu_si128(
-                (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
-            1);
-
-      __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
-
-      res =
-          _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
-
-      __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);
-
-      // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to
-      // the result
-      data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
-      res = _mm256_add_epi16(res, data_0);
-
-      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+  const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+
+  const __m256i clamp_low = zero_256;
+  const __m256i clamp_high =
+      _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
+  /* Horizontal filter */
+  {
+    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+    const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+    const __m256i round_const = _mm256_set1_epi32(
+        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+    for (int i = 0; i < intermediate_height; ++i) {
+      for (int j = 0; j < w; j += 16) {
+        const uint8_t *data_ij = src_ptr + i * src_stride + j;
+
+        // Load 8-bit src data
+        const __m128i data_0 = xx_loadu_128(data_ij + 0);
+        const __m128i data_1 = xx_loadu_128(data_ij + 1);
+        const __m128i data_2 = xx_loadu_128(data_ij + 2);
+        const __m128i data_3 = xx_loadu_128(data_ij + 3);
+        const __m128i data_4 = xx_loadu_128(data_ij + 4);
+        const __m128i data_5 = xx_loadu_128(data_ij + 5);
+        const __m128i data_6 = xx_loadu_128(data_ij + 6);
+        const __m128i data_7 = xx_loadu_128(data_ij + 7);
+
+        // (Zero-)Extend 8-bit data to 16-bit data
+        const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
+        const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
+        const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
+        const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
+        const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
+        const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
+        const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
+        const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
+
+        // Multiply src data by filter coeffs and sum pairs
+        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+        // Calculate scalar product for even- and odd-indices separately,
+        // increasing to 32-bit precision
+        const __m256i res_even_sum = _mm256_add_epi32(
+            _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+        const __m256i res_odd_sum = _mm256_add_epi32(
+            _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+
+        const __m256i res_even = _mm256_srai_epi32(
+            _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+        const __m256i res_odd = _mm256_srai_epi32(
+            _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+        // Reduce to 16-bit precision and pack even- and odd-index results
+        // back into one register. The _mm256_packs_epi32 intrinsic returns
+        // a register with the pixels ordered as follows:
+        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+        const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+        const __m256i res_clamped =
+            _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
+
+        // Store in a temporary array
+        yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
+      }
     }
+  }
 
-    /* Vertical filter */
-    {
-      __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
-      __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
-      __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
-      __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
-      __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
-      __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
-
-      __m256i s[8];
-      s[0] = _mm256_unpacklo_epi16(src_0, src_1);
-      s[1] = _mm256_unpacklo_epi16(src_2, src_3);
-      s[2] = _mm256_unpacklo_epi16(src_4, src_5);
-
-      s[4] = _mm256_unpackhi_epi16(src_0, src_1);
-      s[5] = _mm256_unpackhi_epi16(src_2, src_3);
-      s[6] = _mm256_unpackhi_epi16(src_4, src_5);
-
-      for (i = 0; i < h - 1; i += 2) {
-        const int16_t *data = &im_block[i * im_stride];
-
-        const __m256i s6 =
-            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
-        const __m256i s7 =
-            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
-
-        s[3] = _mm256_unpacklo_epi16(s6, s7);
-        s[7] = _mm256_unpackhi_epi16(s6, s7);
-
-        __m256i res_a = convolve(s, coeffs_v);
-        __m256i res_b = convolve(s + 4, coeffs_v);
-
-        const __m256i res_a_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
-        const __m256i res_b_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_b, round_const_v), round_shift_v);
-
-        /* rounding code */
-        // 16 bit conversion
-        const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
-        // 8 bit conversion and saturation to uint8
-        const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
-
-        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
-        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
-
-        // Store values into the destination buffer
-        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
-
-        _mm_storel_epi64(p_0, res_0);
-        _mm_storel_epi64(p_1, res_1);
-
-        s[0] = s[1];
-        s[1] = s[2];
-        s[2] = s[3];
-
-        s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
-      }
-      if (h - i) {
-        s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
-        s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
-        s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);
-
-        const int16_t *data = &im_block[i * im_stride];
-        const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
-        const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
-
-        __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
-        __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);
-
-        s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
-        __m256i convolveres = convolve(s, coeffs_v);
-
-        const __m256i res_round = _mm256_sra_epi32(
-            _mm256_add_epi32(convolveres, round_const_v), round_shift_v);
-
-        /* rounding code */
-        // 16 bit conversion
-        __m128i reslo = _mm256_castsi256_si128(res_round);
-        __m128i reshi = _mm256_extracti128_si256(res_round, 1);
-        const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);
-
-        // 8 bit conversion and saturation to uint8
-        const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
-        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-        _mm_storel_epi64(p_0, res_8b);
+  /* Vertical filter */
+  {
+    // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
+    const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+    // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
+    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
+    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
+    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
+    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
+    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+    // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
+    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+    // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
+    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+    // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
+    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+    // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
+    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+    // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
+    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+    const __m256i round_const =
+        _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                          (1 << (bd + conv_params->round_1 - 1)));
+
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 16) {
+        const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
+
+        // Load 16-bit data from the output of the horizontal filter in
+        // which the pixels are ordered as follows:
+        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+        const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);
+        const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
+        const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
+        const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
+        const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
+        const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
+        const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
+        const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
+
+        // Filter the even-indices, increasing to 32-bit precision
+        const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
+        const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+        const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+        const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+        const __m256i res_even = _mm256_add_epi32(
+            _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+        // Filter the odd-indices, increasing to 32-bit precision
+        const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+        const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+        const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+        const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+        const __m256i res_odd = _mm256_add_epi32(
+            _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+        // Pixels are currently in the following order:
+        // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+        // res_odd order:  [ 15 13 11 9 ] [ 7 5 3 1 ]
+        //
+        // Rearrange the pixels into the following order:
+        // res_lo order: [ 11 10  9  8 ] [ 3 2 1 0 ]
+        // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+        const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+        const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+        const __m256i res_lo_round = _mm256_srai_epi32(
+            _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+        const __m256i res_hi_round = _mm256_srai_epi32(
+            _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+        // Reduce to 16-bit precision and pack into the correct order:
+        // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+        const __m256i res_16bit =
+            _mm256_packs_epi32(res_lo_round, res_hi_round);
+
+        // Reduce to 8-bit precision. This messes up the order:
+        // [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
+        // [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
+        const __m256i res_8bit =
+            _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);
+
+        // Swap the two central 32-bit values to get the order:
+        // [ - - - - - - - - - - - - - - - - ]
+        // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
+        const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);
+
+        // Store the lower 128-bit lane in the dst array
+        xx_storeu_128(dst + i * dst_stride + j,
+                      _mm256_castsi256_si128(res_8bit2));
       }
     }
   }
diff --git a/libaom/av1/decoder/decodeframe.c b/libaom/av1/decoder/decodeframe.c
index a30b267..b7fc370 100644
--- a/libaom/av1/decoder/decodeframe.c
+++ b/libaom/av1/decoder/decodeframe.c
@@ -64,6 +64,9 @@
 
 #define ACCT_STR __func__
 
+#define AOM_MIN_THREADS_PER_TILE 1
+#define AOM_MAX_THREADS_PER_TILE 2
+
 // This is needed by ext_tile related unit tests.
 #define EXT_TILE_DEBUG 1
 #define MC_TEMP_BUF_PELS                       \
@@ -153,13 +156,10 @@ static void inverse_transform_block(MACROBLOCKD *xd, int plane,
                                     const TX_SIZE tx_size, uint8_t *dst,
                                     int stride, int reduced_tx_set) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const dqcoeff = pd->dqcoeff;
+  tran_low_t *const dqcoeff = pd->dqcoeff_block + xd->cb_offset[plane];
   eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
   uint16_t scan_line = eob_data->max_scan_line;
   uint16_t eob = eob_data->eob;
-
-  memcpy(dqcoeff, pd->dqcoeff_block + xd->cb_offset[plane],
-         (scan_line + 1) * sizeof(dqcoeff[0]));
   av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, stride,
                               eob, reduced_tx_set);
   memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
@@ -696,27 +696,28 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
         assert(bw < 8 || bh < 8);
         ConvolveParams conv_params = get_conv_params_no_round(
             0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
-        conv_params.use_jnt_comp_avg = 0;
+        conv_params.use_dist_wtd_comp_avg = 0;
         struct buf_2d *const dst_buf = &pd->dst;
         uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
 
         ref = 0;
-        const RefBuffer *ref_buf =
-            &cm->current_frame
-                 .frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+        const RefCntBuffer *ref_buf =
+            get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
+        const struct scale_factors *ref_scale_factors =
+            get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
 
-        pd->pre[ref].buf0 = (plane == 1) ? ref_buf->buf->buf.u_buffer
-                                         : ref_buf->buf->buf.v_buffer;
+        pd->pre[ref].buf0 =
+            (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer;
         pd->pre[ref].buf =
-            pd->pre[ref].buf0 +
-            scaled_buffer_offset(pre_x, pre_y, ref_buf->buf->buf.uv_stride,
-                                 &ref_buf->sf);
-        pd->pre[ref].width = ref_buf->buf->buf.uv_crop_width;
-        pd->pre[ref].height = ref_buf->buf->buf.uv_crop_height;
-        pd->pre[ref].stride = ref_buf->buf->buf.uv_stride;
+            pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+                                                     ref_buf->buf.uv_stride,
+                                                     ref_scale_factors);
+        pd->pre[ref].width = ref_buf->buf.uv_crop_width;
+        pd->pre[ref].height = ref_buf->buf.uv_crop_height;
+        pd->pre[ref].stride = ref_buf->buf.uv_stride;
 
         const struct scale_factors *const sf =
-            is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+            is_intrabc ? &cm->sf_identity : ref_scale_factors;
         struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
 
         const MV mv = this_mbmi->mv[ref].as_mv;
@@ -736,7 +737,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
                                &scaled_mv, &subpel_x_mv, &subpel_y_mv);
         pre = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
         src_stride = pre_buf->stride;
-        highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+        highbd = is_cur_buf_hbd(xd);
         extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv,
                          subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref],
                          &pre, &src_stride);
@@ -769,7 +770,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
     int src_stride[2];
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       const struct scale_factors *const sf =
-          is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+          is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
       struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
       const MV mv = mi->mv[ref].as_mv;
       PadBlock block;
@@ -780,9 +781,9 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
       dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf,
                              &subpel_params[ref], bw, bh, &block, mi_x, mi_y,
                              &scaled_mv, &subpel_x_mv, &subpel_y_mv);
-      pre[ref] = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+      pre[ref] = pre_buf->buf0 + (int64_t)block.y0 * pre_buf->stride + block.x0;
       src_stride[ref] = pre_buf->stride;
-      highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+      highbd = is_cur_buf_hbd(xd);
 
       WarpTypesAllowed warp_types;
       warp_types.global_warp_allowed = is_global[ref];
@@ -800,13 +801,13 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
 
     ConvolveParams conv_params = get_conv_params_no_round(
         0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
-    av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
-                               &conv_params.bck_offset,
-                               &conv_params.use_jnt_comp_avg, is_compound);
+    av1_dist_wtd_comp_weight_assign(
+        cm, mi, 0, &conv_params.fwd_offset, &conv_params.bck_offset,
+        &conv_params.use_dist_wtd_comp_avg, is_compound);
 
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       const struct scale_factors *const sf =
-          is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+          is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
       WarpTypesAllowed warp_types;
       warp_types.global_warp_allowed = is_global[ref];
       warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
@@ -855,7 +856,7 @@ static void dec_build_inter_predictors_for_planes(const AV1_COMMON *cm,
 
 static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,
                                            MACROBLOCKD *xd, int mi_row,
-                                           int mi_col, BUFFER_SET *ctx,
+                                           int mi_col, const BUFFER_SET *ctx,
                                            BLOCK_SIZE bsize) {
   dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);
 
@@ -870,7 +871,7 @@ static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,
 
 static void dec_build_inter_predictors_sbuv(const AV1_COMMON *cm,
                                             MACROBLOCKD *xd, int mi_row,
-                                            int mi_col, BUFFER_SET *ctx,
+                                            int mi_col, const BUFFER_SET *ctx,
                                             BLOCK_SIZE bsize) {
   dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
                                         MAX_MB_PLANE - 1);
@@ -1015,7 +1016,7 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
   int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     int len = sizeof(uint16_t);
     dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
     dst_buf1[1] =
@@ -1063,11 +1064,13 @@ static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
       assert(frame == INTRA_FRAME);
       assert(ref == 0);
     } else {
-      RefBuffer *ref_buf = &cm->current_frame.frame_refs[frame - LAST_FRAME];
+      const RefCntBuffer *ref_buf = get_ref_frame_buf(cm, frame);
+      const struct scale_factors *ref_scale_factors =
+          get_ref_scale_factors_const(cm, frame);
 
-      xd->block_refs[ref] = ref_buf;
-      av1_setup_pre_planes(xd, ref, &ref_buf->buf->buf, mi_row, mi_col,
-                           &ref_buf->sf, num_planes);
+      xd->block_ref_scale_factors[ref] = ref_scale_factors;
+      av1_setup_pre_planes(xd, ref, &ref_buf->buf, mi_row, mi_col,
+                           ref_scale_factors, num_planes);
     }
   }
 
@@ -2238,7 +2241,6 @@ static void setup_quantization(AV1_COMMON *const cm,
     cm->v_dc_delta_q = 0;
     cm->v_ac_delta_q = 0;
   }
-  cm->dequant_bit_depth = seq_params->bit_depth;
   cm->using_qmatrix = aom_rb_read_bit(rb);
   if (cm->using_qmatrix) {
     cm->qm_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
@@ -2374,7 +2376,7 @@ static void setup_buffer_pool(AV1_COMMON *cm) {
   if (aom_realloc_frame_buffer(
           &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+          AOM_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
           &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) {
     unlock_buffer_pool(pool);
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
@@ -2438,17 +2440,28 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
   int width, height;
   int found = 0;
   int has_valid_ref_frame = 0;
-  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
     if (aom_rb_read_bit(rb)) {
-      YV12_BUFFER_CONFIG *const buf = &cm->current_frame.frame_refs[i].buf->buf;
-      width = buf->y_crop_width;
-      height = buf->y_crop_height;
-      cm->render_width = buf->render_width;
-      cm->render_height = buf->render_height;
-      setup_superres(cm, rb, &width, &height);
-      resize_context_buffers(cm, width, height);
-      found = 1;
-      break;
+      const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i);
+      // This will never be NULL in a normal stream, as streams are required to
+      // have a shown keyframe before any inter frames, which would refresh all
+      // the reference buffers. However, it might be null if we're starting in
+      // the middle of a stream, and static analysis will error if we don't do
+      // a null check here.
+      if (ref_buf == NULL) {
+        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                           "Invalid condition: invalid reference buffer");
+      } else {
+        const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf;
+        width = buf->y_crop_width;
+        height = buf->y_crop_height;
+        cm->render_width = buf->render_width;
+        cm->render_height = buf->render_height;
+        setup_superres(cm, rb, &width, &height);
+        resize_context_buffers(cm, width, height);
+        found = 1;
+        break;
+      }
     }
   }
 
@@ -2469,20 +2482,20 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
 
   // Check to make sure at least one of frames that this frame references
   // has valid dimensions.
-  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-    RefBuffer *const ref_frame = &cm->current_frame.frame_refs[i];
+  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+    const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
     has_valid_ref_frame |=
-        valid_ref_frame_size(ref_frame->buf->buf.y_crop_width,
-                             ref_frame->buf->buf.y_crop_height, width, height);
+        valid_ref_frame_size(ref_frame->buf.y_crop_width,
+                             ref_frame->buf.y_crop_height, width, height);
   }
   if (!has_valid_ref_frame)
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Referenced frame has invalid size");
-  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-    RefBuffer *const ref_frame = &cm->current_frame.frame_refs[i];
+  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+    const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
     if (!valid_ref_frame_img_fmt(
-            ref_frame->buf->buf.bit_depth, ref_frame->buf->buf.subsampling_x,
-            ref_frame->buf->buf.subsampling_y, seq_params->bit_depth,
+            ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x,
+            ref_frame->buf.subsampling_y, seq_params->bit_depth,
             seq_params->subsampling_x, seq_params->subsampling_y))
       aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Referenced frame has incompatible color format");
@@ -2716,9 +2729,10 @@ static const uint8_t *get_ls_tile_buffers(
 
     const int tile_col_size_bytes = pbi->tile_col_size_bytes;
     const int tile_size_bytes = pbi->tile_size_bytes;
+    int tile_width, tile_height;
+    av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
     const int tile_copy_mode =
-        ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) ? 1
-                                                                           : 0;
+        ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 1 : 0;
     // Read tile column sizes for all columns (we need the last tile buffer)
     for (int c = 0; c < tile_cols; ++c) {
       const int is_last = c == tile_cols - 1;
@@ -3206,7 +3220,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
         continue;
 
       td->bit_reader = &tile_data->bit_reader;
-      av1_zero(td->dqcoeff);
+      av1_zero(td->cb_buffer_base.dqcoeff);
       av1_tile_init(&td->xd.tile, cm, row, col);
       td->xd.current_qindex = cm->base_qindex;
       setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size,
@@ -3220,7 +3234,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
         td->bit_reader->accounting = NULL;
       }
 #endif
-      av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+      av1_init_macroblockd(cm, &td->xd, NULL);
       av1_init_above_context(cm, &td->xd, row);
 
       // Initialise the tile context from the frame context
@@ -3277,7 +3291,7 @@ static void tile_worker_hook_init(AV1Decoder *const pbi,
   int tile_col = tile_data->tile_info.tile_col;
 
   td->bit_reader = &tile_data->bit_reader;
-  av1_zero(td->dqcoeff);
+  av1_zero(td->cb_buffer_base.dqcoeff);
   av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
   td->xd.current_qindex = cm->base_qindex;
   setup_bool_decoder(tile_buffer->data, thread_data->data_end,
@@ -3292,7 +3306,7 @@ static void tile_worker_hook_init(AV1Decoder *const pbi,
     td->bit_reader->accounting = NULL;
   }
 #endif
-  av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+  av1_init_macroblockd(cm, &td->xd, NULL);
   td->xd.error_info = &thread_data->error_info;
   av1_init_above_context(cm, &td->xd, tile_row);
 
@@ -3350,6 +3364,20 @@ static int tile_worker_hook(void *arg1, void *arg2) {
   return !td->xd.corrupted;
 }
 
+static INLINE int get_max_row_mt_workers_per_tile(AV1_COMMON *cm,
+                                                  TileInfo tile) {
+  // NOTE: Currently value of max workers is calculated based
+  // on the parse and decode time. As per the theoretical estimate
+  // when percentage of parse time is equal to percentage of decode
+  // time, number of workers needed to parse + decode a tile can not
+  // exceed more than 2.
+  // TODO(any): Modify this value if parsing is optimized in future.
+  int sb_rows = av1_get_sb_rows_in_tile(cm, tile);
+  int max_workers =
+      sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE;
+  return max_workers;
+}
+
 // The caller must hold pbi->row_mt_mutex_ when calling this function.
 // Returns 1 if either the next job is stored in *next_job_info or 1 is stored
 // in *end_of_frame.
@@ -3380,8 +3408,8 @@ static int get_next_job_info(AV1Decoder *const pbi,
   int min_threads_working = INT_MAX;
   int max_mis_to_decode = 0;
   int tile_row_idx, tile_col_idx;
-  int tile_row = 0;
-  int tile_col = 0;
+  int tile_row = -1;
+  int tile_col = -1;
 
   memset(next_job_info, 0, sizeof(*next_job_info));
 
@@ -3429,7 +3457,9 @@ static int get_next_job_info(AV1Decoder *const pbi,
           max_mis_to_decode = 0;
         }
         if (num_threads_working == min_threads_working &&
-            num_mis_to_decode > max_mis_to_decode) {
+            num_mis_to_decode > max_mis_to_decode &&
+            num_threads_working <
+                get_max_row_mt_workers_per_tile(cm, tile_data->tile_info)) {
           max_mis_to_decode = num_mis_to_decode;
           tile_row = tile_row_idx;
           tile_col = tile_col_idx;
@@ -3437,6 +3467,8 @@ static int get_next_job_info(AV1Decoder *const pbi,
       }
     }
   }
+  // No job found to process
+  if (tile_row == -1 || tile_col == -1) return 0;
 
   tile_data = pbi->tile_data + tile_row * cm->tile_cols + tile_col;
   tile_info = tile_data->tile_info;
@@ -3565,9 +3597,22 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
       TileDataDec *const tile_data = cur_job_info->tile_data;
       tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
                             allow_update_cdf);
-
+#if CONFIG_MULTITHREAD
+      pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+      tile_data->dec_row_mt_sync.num_threads_working++;
+#if CONFIG_MULTITHREAD
+      pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
       // decode tile
       parse_tile_row_mt(pbi, td, tile_data);
+#if CONFIG_MULTITHREAD
+      pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+      tile_data->dec_row_mt_sync.num_threads_working--;
+#if CONFIG_MULTITHREAD
+      pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
     } else {
       break;
     }
@@ -3616,7 +3661,7 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
     TileInfo tile_info = tile_data->tile_info;
 
     av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
-    av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+    av1_init_macroblockd(cm, &td->xd, NULL);
     td->xd.error_info = &thread_data->error_info;
 
     decode_tile_sb_row(pbi, td, tile_info, mi_row);
@@ -3825,7 +3870,7 @@ static void decode_mt_init(AV1Decoder *pbi) {
       thread_data->error_info.setjmp = 0;
     }
   }
-  const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+  const int use_highbd = cm->seq_params.use_highbitdepth;
   const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
   for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
     DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
@@ -3956,6 +4001,7 @@ static void dec_alloc_cb_buf(AV1Decoder *pbi) {
     av1_dec_free_cb_buf(pbi);
     CHECK_MEM_ERROR(cm, pbi->cb_buffer_base,
                     aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size));
+    memset(pbi->cb_buffer_base, 0, sizeof(*pbi->cb_buffer_base) * size);
     pbi->cb_buffer_alloc_size = size;
   }
 }
@@ -4043,7 +4089,8 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
   int tile_cols_start;
   int tile_cols_end;
   int tile_count_tg;
-  int num_workers;
+  int num_workers = 0;
+  int max_threads;
   const uint8_t *raw_data_end = NULL;
   int max_sb_rows = 0;
 
@@ -4059,7 +4106,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
     tile_cols_end = tile_cols;
   }
   tile_count_tg = end_tile - start_tile + 1;
-  num_workers = pbi->max_threads;
+  max_threads = pbi->max_threads;
 
   // No tiles to decode.
   if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
@@ -4072,7 +4119,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
   assert(tile_rows <= MAX_TILE_ROWS);
   assert(tile_cols <= MAX_TILE_COLS);
   assert(tile_count_tg > 0);
-  assert(num_workers > 0);
+  assert(max_threads > 0);
   assert(start_tile <= end_tile);
   assert(start_tile >= 0 && end_tile < n_tiles);
 
@@ -4104,8 +4151,10 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
 
       max_sb_rows = AOMMAX(max_sb_rows,
                            av1_get_sb_rows_in_tile(cm, tile_data->tile_info));
+      num_workers += get_max_row_mt_workers_per_tile(cm, tile_data->tile_info);
     }
   }
+  num_workers = AOMMIN(num_workers, max_threads);
 
   if (pbi->allocated_row_mt_sync_rows != max_sb_rows) {
     for (int i = 0; i < n_tiles; ++i) {
@@ -4190,20 +4239,38 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
 
   if (!pars->update_parameters) {
     // inherit parameters from a previous reference frame
-    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
     int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3);
-    int buf_idx = cm->ref_frame_map[film_grain_params_ref_idx];
-    if (buf_idx == INVALID_IDX) {
+    // Section 6.8.20: It is a requirement of bitstream conformance that
+    // film_grain_params_ref_idx is equal to ref_frame_idx[ j ] for some value
+    // of j in the range 0 to REFS_PER_FRAME - 1.
+    int found = 0;
+    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+      if (film_grain_params_ref_idx == cm->remapped_ref_idx[i]) {
+        found = 1;
+        break;
+      }
+    }
+    if (!found) {
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Invalid film grain reference idx %d. ref_frame_idx = "
+                         "{%d, %d, %d, %d, %d, %d, %d}",
+                         film_grain_params_ref_idx, cm->remapped_ref_idx[0],
+                         cm->remapped_ref_idx[1], cm->remapped_ref_idx[2],
+                         cm->remapped_ref_idx[3], cm->remapped_ref_idx[4],
+                         cm->remapped_ref_idx[5], cm->remapped_ref_idx[6]);
+    }
+    RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx];
+    if (buf == NULL) {
       aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Invalid Film grain reference idx");
     }
-    if (!frame_bufs[buf_idx].film_grain_params_present) {
+    if (!buf->film_grain_params_present) {
       aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Film grain reference parameters not available");
     }
     uint16_t random_seed = pars->random_seed;
-    *pars = frame_bufs[buf_idx].film_grain_params;  // inherit paramaters
-    pars->random_seed = random_seed;                // with new random seed
+    *pars = buf->film_grain_params;   // inherit paramaters
+    pars->random_seed = random_seed;  // with new random seed
     return;
   }
 
@@ -4420,13 +4487,13 @@ void av1_read_timing_info_header(AV1_COMMON *cm,
   cm->timing_info.equal_picture_interval =
       aom_rb_read_bit(rb);  // Equal picture interval bit
   if (cm->timing_info.equal_picture_interval) {
-    cm->timing_info.num_ticks_per_picture =
-        aom_rb_read_uvlc(rb) + 1;  // ticks per picture
-    if (cm->timing_info.num_ticks_per_picture == 0) {
+    const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
+    if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
       aom_internal_error(
           &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
           "num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1.");
     }
+    cm->timing_info.num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1;
   }
 }
 
@@ -4505,7 +4572,7 @@ void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
     seq_params->enable_warped_motion = 0;
     seq_params->enable_dual_filter = 0;
     seq_params->order_hint_info.enable_order_hint = 0;
-    seq_params->order_hint_info.enable_jnt_comp = 0;
+    seq_params->order_hint_info.enable_dist_wtd_comp = 0;
     seq_params->order_hint_info.enable_ref_frame_mvs = 0;
     seq_params->force_screen_content_tools = 2;  // SELECT_SCREEN_CONTENT_TOOLS
     seq_params->force_integer_mv = 2;            // SELECT_INTEGER_MV
@@ -4517,7 +4584,7 @@ void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
     seq_params->enable_dual_filter = aom_rb_read_bit(rb);
 
     seq_params->order_hint_info.enable_order_hint = aom_rb_read_bit(rb);
-    seq_params->order_hint_info.enable_jnt_comp =
+    seq_params->order_hint_info.enable_dist_wtd_comp =
         seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0;
     seq_params->order_hint_info.enable_ref_frame_mvs =
         seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0;
@@ -4663,62 +4730,71 @@ static void read_global_motion(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
 }
 
 // Release the references to the frame buffers in cm->ref_frame_map and reset
-// all elements of cm->ref_frame_map to -1.
+// all elements of cm->ref_frame_map to NULL.
 static void reset_ref_frame_map(AV1_COMMON *const cm) {
   BufferPool *const pool = cm->buffer_pool;
-  RefCntBuffer *const frame_bufs = pool->frame_bufs;
 
   for (int i = 0; i < REF_FRAMES; i++) {
-    decrease_ref_count(cm->ref_frame_map[i], frame_bufs, pool);
+    decrease_ref_count(cm->ref_frame_map[i], pool);
+    cm->ref_frame_map[i] = NULL;
   }
-  memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
 }
 
 // Generate next_ref_frame_map.
 static void generate_next_ref_frame_map(AV1Decoder *const pbi) {
   AV1_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
-  RefCntBuffer *const frame_bufs = pool->frame_bufs;
 
   lock_buffer_pool(pool);
   // cm->next_ref_frame_map holds references to frame buffers. After storing a
   // frame buffer index in cm->next_ref_frame_map, we need to increase the
   // frame buffer's ref_count.
   int ref_index = 0;
-  for (int mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+  for (int mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) {
     if (mask & 1) {
-      cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+      cm->next_ref_frame_map[ref_index] = cm->cur_frame;
     } else {
       cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
     }
-    if (cm->next_ref_frame_map[ref_index] >= 0)
-      ++frame_bufs[cm->next_ref_frame_map[ref_index]].ref_count;
+    if (cm->next_ref_frame_map[ref_index] != NULL)
+      ++cm->next_ref_frame_map[ref_index]->ref_count;
     ++ref_index;
   }
 
   for (; ref_index < REF_FRAMES; ++ref_index) {
     cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
-    if (cm->next_ref_frame_map[ref_index] >= 0)
-      ++frame_bufs[cm->next_ref_frame_map[ref_index]].ref_count;
+    if (cm->next_ref_frame_map[ref_index] != NULL)
+      ++cm->next_ref_frame_map[ref_index]->ref_count;
   }
   unlock_buffer_pool(pool);
   pbi->hold_ref_buf = 1;
 }
 
+// If the refresh_frame_flags bitmask is set, update reference frame id values
+// and mark frames as valid for reference.
+static void update_ref_frame_id(AV1_COMMON *const cm, int frame_id) {
+  assert(cm->seq_params.frame_id_numbers_present_flag);
+  int refresh_frame_flags = cm->current_frame.refresh_frame_flags;
+  for (int i = 0; i < REF_FRAMES; i++) {
+    if ((refresh_frame_flags >> i) & 1) {
+      cm->ref_frame_id[i] = frame_id;
+      cm->valid_for_referencing[i] = 1;
+    }
+  }
+}
+
 static void show_existing_frame_reset(AV1Decoder *const pbi,
                                       int existing_frame_idx) {
   AV1_COMMON *const cm = &pbi->common;
-  BufferPool *const pool = cm->buffer_pool;
-  RefCntBuffer *const frame_bufs = pool->frame_bufs;
 
   assert(cm->show_existing_frame);
 
   cm->current_frame.frame_type = KEY_FRAME;
 
-  pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+  cm->current_frame.refresh_frame_flags = (1 << REF_FRAMES) - 1;
 
   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-    cm->current_frame.frame_refs[i].buf = NULL;
+    cm->remapped_ref_idx[i] = INVALID_IDX;
   }
 
   if (pbi->need_resync) {
@@ -4726,22 +4802,10 @@ static void show_existing_frame_reset(AV1Decoder *const pbi,
     pbi->need_resync = 0;
   }
 
-  cm->cur_frame->intra_only = 1;
-
+  // Note that the displayed frame must be valid for referencing in order to
+  // have been selected.
   if (cm->seq_params.frame_id_numbers_present_flag) {
-    /* If bitmask is set, update reference frame id values and
-       mark frames as valid for reference.
-       Note that the displayed frame be valid for referencing
-       in order to have been selected.
-    */
-    int refresh_frame_flags = pbi->refresh_frame_flags;
-    int display_frame_id = cm->ref_frame_id[existing_frame_idx];
-    for (int i = 0; i < REF_FRAMES; i++) {
-      if ((refresh_frame_flags >> i) & 1) {
-        cm->ref_frame_id[i] = display_frame_id;
-        cm->valid_for_referencing[i] = 1;
-      }
-    }
+    update_ref_frame_id(cm, cm->ref_frame_id[existing_frame_idx]);
   }
 
   cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
@@ -4749,8 +4813,7 @@ static void show_existing_frame_reset(AV1Decoder *const pbi,
   generate_next_ref_frame_map(pbi);
 
   // Reload the adapted CDFs from when we originally coded this keyframe
-  *cm->fc =
-      frame_bufs[cm->next_ref_frame_map[existing_frame_idx]].frame_context;
+  *cm->fc = cm->next_ref_frame_map[existing_frame_idx]->frame_context;
 }
 
 static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
@@ -4758,16 +4821,18 @@ static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
   int i;
 
   // We have not stored any references to frame buffers in
-  // cm->next_ref_frame_map, so we can directly reset it to all -1.
-  memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+  // cm->next_ref_frame_map, so we can directly reset it to all NULL.
+  for (i = 0; i < REF_FRAMES; ++i) {
+    cm->next_ref_frame_map[i] = NULL;
+  }
 
   lock_buffer_pool(cm->buffer_pool);
   reset_ref_frame_map(cm);
   assert(cm->cur_frame->ref_count == 1);
   for (i = 0; i < FRAME_BUFFERS; ++i) {
-    // Reset all unreferenced frame buffers. We can also reset cm->new_fb_idx
-    // because we are the sole owner of cm->new_fb_idx.
-    if (frame_bufs[i].ref_count > 0 && i != cm->new_fb_idx) {
+    // Reset all unreferenced frame buffers. We can also reset cm->cur_frame
+    // because we are the sole owner of cm->cur_frame.
+    if (frame_bufs[i].ref_count > 0 && &frame_bufs[i] != cm->cur_frame) {
       continue;
     }
     frame_bufs[i].order_hint = 0;
@@ -4794,10 +4859,6 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   }
 
   cm->last_frame_type = current_frame->frame_type;
-  cm->last_intra_only = current_frame->intra_only;
-
-  // NOTE: By default all coded frames to be used as a reference
-  cm->is_reference_frame = 1;
 
   if (seq_params->reduced_still_picture_hdr) {
     cm->show_existing_frame = 0;
@@ -4812,7 +4873,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     cm->error_resilient_mode = 1;
   } else {
     cm->show_existing_frame = aom_rb_read_bit(rb);
-    cm->reset_decoder_state = 0;
+    pbi->reset_decoder_state = 0;
 
     if (cm->show_existing_frame) {
       if (pbi->sequence_header_changed) {
@@ -4822,7 +4883,11 @@ static int read_uncompressed_header(AV1Decoder *pbi,
       }
       // Show an existing frame directly.
       const int existing_frame_idx = aom_rb_read_literal(rb, 3);
-      const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
+      RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx];
+      if (frame_to_show == NULL) {
+        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                           "Buffer does not contain a decoded frame");
+      }
       if (seq_params->decoder_model_info_present_flag &&
           cm->timing_info.equal_picture_interval == 0) {
         av1_read_temporal_point_info(cm, rb);
@@ -4838,42 +4903,36 @@ static int read_uncompressed_header(AV1Decoder *pbi,
                              "Reference buffer frame ID mismatch");
       }
       lock_buffer_pool(pool);
-      if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
-        unlock_buffer_pool(pool);
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Buffer %d does not contain a decoded frame",
-                           frame_to_show);
-      }
+      assert(frame_to_show->ref_count > 0);
       // cm->cur_frame should be the buffer referenced by the return value
       // of the get_free_fb() call in av1_receive_compressed_data(), and
       // generate_next_ref_frame_map() has not been called, so ref_count
       // should still be 1.
       assert(cm->cur_frame->ref_count == 1);
-      // ref_cnt_fb() decrements ref_count directly rather than call
-      // decrease_ref_count(). If cm->cur_frame->raw_frame_buffer
-      // has already been allocated, it will not be released by ref_cnt_fb()!
+      // assign_frame_buffer_p() decrements ref_count directly rather than
+      // call decrease_ref_count(). If cm->cur_frame->raw_frame_buffer has
+      // already been allocated, it will not be released by
+      // assign_frame_buffer_p()!
       assert(!cm->cur_frame->raw_frame_buffer.data);
-      assign_frame_buffer(frame_bufs, &cm->new_fb_idx, frame_to_show);
-      cm->cur_frame = &cm->buffer_pool->frame_bufs[cm->new_fb_idx];
-      cm->reset_decoder_state =
-          frame_bufs[frame_to_show].frame_type == KEY_FRAME;
+      assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+      pbi->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME;
       unlock_buffer_pool(pool);
 
       cm->lf.filter_level[0] = 0;
       cm->lf.filter_level[1] = 0;
       cm->show_frame = 1;
 
-      if (!frame_bufs[frame_to_show].showable_frame) {
+      if (!frame_to_show->showable_frame) {
         aom_merge_corrupted_flag(&xd->corrupted, 1);
       }
-      if (cm->reset_decoder_state) frame_bufs[frame_to_show].showable_frame = 0;
+      if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0;
 
-      cm->film_grain_params = frame_bufs[frame_to_show].film_grain_params;
+      cm->film_grain_params = frame_to_show->film_grain_params;
 
-      if (cm->reset_decoder_state) {
+      if (pbi->reset_decoder_state) {
         show_existing_frame_reset(pbi, existing_frame_idx);
       } else {
-        pbi->refresh_frame_flags = 0;
+        current_frame->refresh_frame_flags = 0;
       }
 
       return 0;
@@ -4908,7 +4967,6 @@ static int read_uncompressed_header(AV1Decoder *pbi,
       cm->showable_frame = aom_rb_read_bit(rb);
     }
     cm->cur_frame->showable_frame = cm->showable_frame;
-    current_frame->intra_only = current_frame->frame_type == INTRA_ONLY_FRAME;
     cm->error_resilient_mode =
         frame_is_sframe(cm) ||
                 (current_frame->frame_type == KEY_FRAME && cm->show_frame)
@@ -4933,7 +4991,6 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     cm->cur_frame_force_integer_mv = 0;
   }
 
-  cm->frame_refs_short_signaling = 0;
   int frame_size_override_flag = 0;
   cm->allow_intrabc = 0;
   cm->primary_ref_frame = PRIMARY_REF_NONE;
@@ -5020,22 +5077,23 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     }
   }
   if (current_frame->frame_type == KEY_FRAME) {
-    if (!cm->show_frame)  // unshown keyframe (forward keyframe)
-      pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
-    else  // shown keyframe
-      pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+    if (!cm->show_frame) {  // unshown keyframe (forward keyframe)
+      current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+    } else {  // shown keyframe
+      current_frame->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+    }
 
     for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-      cm->current_frame.frame_refs[i].buf = NULL;
+      cm->remapped_ref_idx[i] = INVALID_IDX;
     }
     if (pbi->need_resync) {
       reset_ref_frame_map(cm);
       pbi->need_resync = 0;
     }
   } else {
-    if (current_frame->intra_only) {
-      pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
-      if (pbi->refresh_frame_flags == 0xFF) {
+    if (current_frame->frame_type == INTRA_ONLY_FRAME) {
+      current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+      if (current_frame->refresh_frame_flags == 0xFF) {
         aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                            "Intra only frames cannot have refresh flags 0xFF");
       }
@@ -5044,17 +5102,12 @@ static int read_uncompressed_header(AV1Decoder *pbi,
         pbi->need_resync = 0;
       }
     } else if (pbi->need_resync != 1) { /* Skip if need resync */
-      pbi->refresh_frame_flags =
+      current_frame->refresh_frame_flags =
           frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES);
-      if (!pbi->refresh_frame_flags) {
-        // NOTE: "pbi->refresh_frame_flags == 0" indicates that the coded frame
-        //       will not be used as a reference
-        cm->is_reference_frame = 0;
-      }
     }
   }
 
-  if (!frame_is_intra_only(cm) || pbi->refresh_frame_flags != 0xFF) {
+  if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xFF) {
     // Read all ref frame order hints if error_resilient_mode == 1
     if (cm->error_resilient_mode &&
         seq_params->order_hint_info.enable_order_hint) {
@@ -5062,40 +5115,39 @@ static int read_uncompressed_header(AV1Decoder *pbi,
         // Read order hint from bit stream
         unsigned int order_hint = aom_rb_read_literal(
             rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
-        // Get buffer index
-        int buf_idx = cm->ref_frame_map[ref_idx];
-        assert(buf_idx < FRAME_BUFFERS);
-        if (buf_idx == -1 || order_hint != frame_bufs[buf_idx].order_hint) {
-          if (buf_idx >= 0) {
+        // Get buffer
+        RefCntBuffer *buf = cm->ref_frame_map[ref_idx];
+        if (buf == NULL || order_hint != buf->order_hint) {
+          if (buf != NULL) {
             lock_buffer_pool(pool);
-            decrease_ref_count(buf_idx, frame_bufs, pool);
+            decrease_ref_count(buf, pool);
             unlock_buffer_pool(pool);
           }
           // If no corresponding buffer exists, allocate a new buffer with all
           // pixels set to neutral grey.
-          buf_idx = get_free_fb(cm);
+          int buf_idx = get_free_fb(cm);
           if (buf_idx == INVALID_IDX) {
             aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                                "Unable to find free frame buffer");
           }
+          buf = &frame_bufs[buf_idx];
           lock_buffer_pool(pool);
           if (aom_realloc_frame_buffer(
-                  &frame_bufs[buf_idx].buf, seq_params->max_frame_width,
+                  &buf->buf, seq_params->max_frame_width,
                   seq_params->max_frame_height, seq_params->subsampling_x,
                   seq_params->subsampling_y, seq_params->use_highbitdepth,
                   AOM_BORDER_IN_PIXELS, cm->byte_alignment,
-                  &pool->frame_bufs[buf_idx].raw_frame_buffer, pool->get_fb_cb,
-                  pool->cb_priv)) {
-            decrease_ref_count(buf_idx, frame_bufs, pool);
+                  &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) {
+            decrease_ref_count(buf, pool);
             unlock_buffer_pool(pool);
             aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
           }
           unlock_buffer_pool(pool);
-          set_planes_to_neutral_grey(seq_params, &frame_bufs[buf_idx].buf, 0);
+          set_planes_to_neutral_grey(seq_params, &buf->buf, 0);
 
-          cm->ref_frame_map[ref_idx] = buf_idx;
-          frame_bufs[buf_idx].order_hint = order_hint;
+          cm->ref_frame_map[ref_idx] = buf;
+          buf->order_hint = order_hint;
         }
       }
     }
@@ -5111,7 +5163,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   } else {
     cm->allow_ref_frame_mvs = 0;
 
-    if (current_frame->intra_only) {
+    if (current_frame->frame_type == INTRA_ONLY_FRAME) {
       cm->cur_frame->film_grain_params_present =
           seq_params->film_grain_params_present;
       setup_frame_size(cm, frame_size_override_flag, rb);
@@ -5119,57 +5171,53 @@ static int read_uncompressed_header(AV1Decoder *pbi,
         cm->allow_intrabc = aom_rb_read_bit(rb);
 
     } else if (pbi->need_resync != 1) { /* Skip if need resync */
-
+      int frame_refs_short_signaling = 0;
       // Frame refs short signaling is off when error resilient mode is on.
       if (seq_params->order_hint_info.enable_order_hint)
-        cm->frame_refs_short_signaling = aom_rb_read_bit(rb);
+        frame_refs_short_signaling = aom_rb_read_bit(rb);
 
-      if (cm->frame_refs_short_signaling) {
+      if (frame_refs_short_signaling) {
         // == LAST_FRAME ==
         const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
-        const int lst_idx = cm->ref_frame_map[lst_ref];
+        const RefCntBuffer *const lst_buf = cm->ref_frame_map[lst_ref];
 
         // == GOLDEN_FRAME ==
         const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
-        const int gld_idx = cm->ref_frame_map[gld_ref];
+        const RefCntBuffer *const gld_buf = cm->ref_frame_map[gld_ref];
 
         // Most of the time, streams start with a keyframe. In that case,
         // ref_frame_map will have been filled in at that point and will not
-        // contain any -1's. However, streams are explicitly allowed to start
+        // contain any NULLs. However, streams are explicitly allowed to start
         // with an intra-only frame, so long as they don't then signal a
         // reference to a slot that hasn't been set yet. That's what we are
         // checking here.
-        if (lst_idx == -1)
+        if (lst_buf == NULL)
           aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                              "Inter frame requests nonexistent reference");
-        if (gld_idx == -1)
+        if (gld_buf == NULL)
           aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                              "Inter frame requests nonexistent reference");
 
-        av1_set_frame_refs(cm, lst_ref, gld_ref);
+        av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref);
       }
 
       for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
         int ref = 0;
-        if (!cm->frame_refs_short_signaling) {
+        if (!frame_refs_short_signaling) {
           ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
-          const int idx = cm->ref_frame_map[ref];
 
           // Most of the time, streams start with a keyframe. In that case,
           // ref_frame_map will have been filled in at that point and will not
-          // contain any -1's. However, streams are explicitly allowed to start
+          // contain any NULLs. However, streams are explicitly allowed to start
           // with an intra-only frame, so long as they don't then signal a
           // reference to a slot that hasn't been set yet. That's what we are
           // checking here.
-          if (idx == -1)
+          if (cm->ref_frame_map[ref] == NULL)
             aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                                "Inter frame requests nonexistent reference");
-
-          RefBuffer *const ref_frame = &cm->current_frame.frame_refs[i];
-          ref_frame->buf = &frame_bufs[idx];
-          ref_frame->map_idx = ref;
+          cm->remapped_ref_idx[i] = ref;
         } else {
-          ref = cm->current_frame.frame_refs[i].map_idx;
+          ref = cm->remapped_ref_idx[i];
         }
 
         cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
@@ -5206,26 +5254,29 @@ static int read_uncompressed_header(AV1Decoder *pbi,
       cm->switchable_motion_mode = aom_rb_read_bit(rb);
     }
 
-    cm->prev_frame = get_prev_frame(cm);
+    cm->prev_frame = get_primary_ref_frame_buf(cm);
     if (cm->primary_ref_frame != PRIMARY_REF_NONE &&
-        cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) {
+        get_primary_ref_frame_buf(cm) == NULL) {
       aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Reference frame containing this frame's initial "
                          "frame context is unavailable.");
     }
 
-    if (!current_frame->intra_only && pbi->need_resync != 1) {
+    if (!(current_frame->frame_type == INTRA_ONLY_FRAME) &&
+        pbi->need_resync != 1) {
       if (frame_might_allow_ref_frame_mvs(cm))
         cm->allow_ref_frame_mvs = aom_rb_read_bit(rb);
       else
         cm->allow_ref_frame_mvs = 0;
 
-      for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-        RefBuffer *const ref_buf = &cm->current_frame.frame_refs[i];
+      for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+        const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i);
+        struct scale_factors *const ref_scale_factors =
+            get_ref_scale_factors(cm, i);
         av1_setup_scale_factors_for_frame(
-            &ref_buf->sf, ref_buf->buf->buf.y_crop_width,
-            ref_buf->buf->buf.y_crop_height, cm->width, cm->height);
-        if ((!av1_is_valid_scale(&ref_buf->sf)))
+            ref_scale_factors, ref_buf->buf.y_crop_width,
+            ref_buf->buf.y_crop_height, cm->width, cm->height);
+        if ((!av1_is_valid_scale(ref_scale_factors)))
           aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                              "Reference frame has invalid dimensions");
       }
@@ -5236,20 +5287,10 @@ static int read_uncompressed_header(AV1Decoder *pbi,
 
   av1_setup_frame_sign_bias(cm);
 
-  cm->cur_frame->intra_only =
-      current_frame->frame_type == KEY_FRAME || current_frame->intra_only;
   cm->cur_frame->frame_type = current_frame->frame_type;
 
   if (seq_params->frame_id_numbers_present_flag) {
-    /* If bitmask is set, update reference frame id values and
-       mark frames as valid for reference */
-    int refresh_frame_flags = pbi->refresh_frame_flags;
-    for (int i = 0; i < REF_FRAMES; i++) {
-      if ((refresh_frame_flags >> i) & 1) {
-        cm->ref_frame_id[i] = cm->current_frame_id;
-        cm->valid_for_referencing[i] = 1;
-      }
-    }
+    update_ref_frame_id(cm, cm->current_frame_id);
   }
 
   const int might_bwd_adapt =
@@ -5297,6 +5338,11 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   }
 
   read_tile_info(pbi, rb);
+  if (!is_min_tile_width_satisfied(cm)) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Minimum tile width requirement not satisfied");
+  }
+
   setup_quantization(cm, rb);
   xd->bd = (int)seq_params->bit_depth;
 
@@ -5486,7 +5532,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
   if (cm->show_existing_frame) {
     // showing a frame directly
     *p_data_end = data + uncomp_hdr_size;
-    if (cm->reset_decoder_state) {
+    if (pbi->reset_decoder_state) {
       // Use the default frame context values.
       *cm->fc = *cm->default_frame_context;
       if (!cm->fc->initialized)
@@ -5498,8 +5544,6 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
 
   cm->setup_mi(cm);
 
-  cm->current_frame_seg_map = cm->cur_frame->seg_map;
-
   av1_setup_motion_field(cm);
 
   av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
@@ -5508,8 +5552,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
     // use the default frame context values
     *cm->fc = *cm->default_frame_context;
   } else {
-    *cm->fc =
-        cm->current_frame.frame_refs[cm->primary_ref_frame].buf->frame_context;
+    *cm->fc = get_primary_ref_frame_buf(cm)->frame_context;
   }
   if (!cm->fc->initialized)
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
@@ -5528,7 +5571,7 @@ static void setup_frame_info(AV1Decoder *pbi) {
       cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
     av1_alloc_restoration_buffers(cm);
   }
-  const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+  const int use_highbd = cm->seq_params.use_highbitdepth;
   const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
   if (pbi->td.mc_buf_size != buf_size) {
     av1_free_mc_tmp_buf(&pbi->td);
diff --git a/libaom/av1/decoder/decodemv.c b/libaom/av1/decoder/decodemv.c
index 7a94717..2791f3a 100644
--- a/libaom/av1/decoder/decodemv.c
+++ b/libaom/av1/decoder/decodemv.c
@@ -299,7 +299,7 @@ static void set_segment_id(AV1_COMMON *cm, int mi_offset, int x_mis, int y_mis,
 
   for (int y = 0; y < y_mis; y++)
     for (int x = 0; x < x_mis; x++)
-      cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
+      cm->cur_frame->seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
 }
 
 static int read_intra_segment_id(AV1_COMMON *const cm,
@@ -355,7 +355,7 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
   if (!seg->enabled) return 0;  // Default for disabled segmentation
 
   if (!seg->update_map) {
-    copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+    copy_segment_id(cm, cm->last_frame_seg_map, cm->cur_frame->seg_map,
                     mi_offset, x_mis, y_mis);
     return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
   }
@@ -364,7 +364,6 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
   if (preskip) {
     if (!seg->segid_preskip) return 0;
   } else {
-    if (seg->segid_preskip) return mbmi->segment_id;
     if (mbmi->skip) {
       if (seg->temporal_update) {
         mbmi->seg_id_predicted = 0;
@@ -679,11 +678,10 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
 
     int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
     int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES];
-    int_mv global_mvs[REF_FRAMES];
 
     av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count,
-                     xd->ref_mv_stack, ref_mvs, global_mvs, mi_row, mi_col,
-                     inter_mode_ctx);
+                     xd->ref_mv_stack, ref_mvs, /*global_mvs=*/NULL, mi_row,
+                     mi_col, inter_mode_ctx);
 
     int_mv nearestmv, nearmv;
 
@@ -700,7 +698,8 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
                                      mi_col, bsize, r);
     if (!valid_dv) {
       // Intra bc motion vectors are not valid - signal corrupt frame
-      aom_merge_corrupted_flag(&xd->corrupted, 1);
+      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid intrabc dv");
     }
   }
 }
@@ -1271,9 +1270,9 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
   const int is_compound = has_second_ref(mbmi);
 
   MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
-  int_mv global_mvs[REF_FRAMES];
   av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack,
-                   ref_mvs, global_mvs, mi_row, mi_col, inter_mode_ctx);
+                   ref_mvs, /*global_mvs=*/NULL, mi_row, mi_col,
+                   inter_mode_ctx);
 
   int mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame);
   mbmi->ref_mv_idx = 0;
@@ -1388,9 +1387,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
 
   for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
     const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
-    RefBuffer *ref_buf = &cm->current_frame.frame_refs[frame - LAST_FRAME];
-
-    xd->block_refs[ref] = ref_buf;
+    xd->block_ref_scale_factors[ref] = get_ref_scale_factors_const(cm, frame);
   }
 
   mbmi->motion_mode = SIMPLE_TRANSLATION;
@@ -1419,13 +1416,16 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
     }
 
     if (mbmi->comp_group_idx == 0) {
-      if (cm->seq_params.order_hint_info.enable_jnt_comp) {
+      if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) {
         const int comp_index_ctx = get_comp_index_context(cm, xd);
         mbmi->compound_idx = aom_read_symbol(
             r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR);
+        mbmi->interinter_comp.type =
+            mbmi->compound_idx ? COMPOUND_AVERAGE : COMPOUND_DISTWTD;
       } else {
         // Distance-weighted compound is disabled, so always use average
         mbmi->compound_idx = 1;
+        mbmi->interinter_comp.type = COMPOUND_AVERAGE;
       }
     } else {
       assert(cm->current_frame.reference_mode != SINGLE_REFERENCE &&
@@ -1436,8 +1436,9 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
       // compound_diffwtd, wedge
       if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
         mbmi->interinter_comp.type =
-            1 + aom_read_symbol(r, ec_ctx->compound_type_cdf[bsize],
-                                COMPOUND_TYPES - 1, ACCT_STR);
+            COMPOUND_WEDGE + aom_read_symbol(r,
+                                             ec_ctx->compound_type_cdf[bsize],
+                                             MASKED_COMPOUND_TYPES, ACCT_STR);
       else
         mbmi->interinter_comp.type = COMPOUND_DIFFWTD;
 
@@ -1502,7 +1503,8 @@ static void read_inter_frame_mode_info(AV1Decoder *const pbi,
   else
     mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
 
-  mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 0, r);
+  if (!cm->seg.segid_preskip)
+    mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 0, r);
 
   read_cdef(cm, r, xd, mi_col, mi_row);
 
diff --git a/libaom/av1/decoder/decoder.c b/libaom/av1/decoder/decoder.c
index 773305d..bff4b7a 100644
--- a/libaom/av1/decoder/decoder.c
+++ b/libaom/av1/decoder/decoder.c
@@ -100,15 +100,16 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
   aom_once(initialize_dec);
 
   // Initialize the references to not point to any frame buffers.
-  memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
-  memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+  for (int i = 0; i < REF_FRAMES; i++) {
+    cm->ref_frame_map[i] = NULL;
+    cm->next_ref_frame_map[i] = NULL;
+  }
 
   cm->current_frame.frame_number = 0;
   pbi->decoding_first_frame = 1;
   pbi->common.buffer_pool = pool;
 
   cm->seq_params.bit_depth = AOM_BITS_8;
-  cm->dequant_bit_depth = AOM_BITS_8;
 
   cm->alloc_mi = av1_dec_alloc_mi;
   cm->free_mi = dec_free_mi;
@@ -321,26 +322,26 @@ aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
 static void release_frame_buffers(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
-  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 
+  cm->cur_frame->buf.corrupted = 1;
   lock_buffer_pool(pool);
   // Release all the reference buffers in cm->next_ref_frame_map if the worker
   // thread is holding them.
   if (pbi->hold_ref_buf) {
-    int ref_index;
-    for (ref_index = 0; ref_index < REF_FRAMES; ++ref_index) {
-      const int new_idx = cm->next_ref_frame_map[ref_index];
-      decrease_ref_count(new_idx, frame_bufs, pool);
+    for (int ref_index = 0; ref_index < REF_FRAMES; ++ref_index) {
+      decrease_ref_count(cm->next_ref_frame_map[ref_index], pool);
+      cm->next_ref_frame_map[ref_index] = NULL;
     }
     pbi->hold_ref_buf = 0;
   }
   // Release current frame.
-  decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+  decrease_ref_count(cm->cur_frame, pool);
   unlock_buffer_pool(pool);
+  cm->cur_frame = NULL;
 }
 
 // If any buffer updating is signaled it should be done here.
-// Consumes a reference to cm->new_fb_idx.
+// Consumes a reference to cm->cur_frame.
 //
 // This functions returns void. It reports failure by setting
 // cm->error.error_code.
@@ -348,7 +349,6 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
   int ref_index = 0, mask;
   AV1_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
-  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 
   if (frame_decoded) {
     lock_buffer_pool(pool);
@@ -358,58 +358,55 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
     if (!pbi->camera_frame_header_ready) {
       // If we are not holding reference buffers in cm->next_ref_frame_map,
       // assert that the following two for loops are no-ops.
-      assert(IMPLIES(!pbi->hold_ref_buf, pbi->refresh_frame_flags == 0));
       assert(IMPLIES(!pbi->hold_ref_buf,
-                     cm->show_existing_frame && !cm->reset_decoder_state));
+                     cm->current_frame.refresh_frame_flags == 0));
+      assert(IMPLIES(!pbi->hold_ref_buf,
+                     cm->show_existing_frame && !pbi->reset_decoder_state));
 
       // The following two for loops need to release the reference stored in
       // cm->ref_frame_map[ref_index] before transferring the reference stored
       // in cm->next_ref_frame_map[ref_index] to cm->ref_frame_map[ref_index].
-      for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
-        const int old_idx = cm->ref_frame_map[ref_index];
-        decrease_ref_count(old_idx, frame_bufs, pool);
+      for (mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) {
+        decrease_ref_count(cm->ref_frame_map[ref_index], pool);
         cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+        cm->next_ref_frame_map[ref_index] = NULL;
         ++ref_index;
       }
 
       const int check_on_show_existing_frame =
-          !cm->show_existing_frame || cm->reset_decoder_state;
+          !cm->show_existing_frame || pbi->reset_decoder_state;
       for (; ref_index < REF_FRAMES && check_on_show_existing_frame;
            ++ref_index) {
-        const int old_idx = cm->ref_frame_map[ref_index];
-        decrease_ref_count(old_idx, frame_bufs, pool);
+        decrease_ref_count(cm->ref_frame_map[ref_index], pool);
         cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+        cm->next_ref_frame_map[ref_index] = NULL;
       }
     }
 
     if (cm->show_existing_frame || cm->show_frame) {
-      YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf;
       if (pbi->output_all_layers) {
         // Append this frame to the output queue
         if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) {
           // We can't store the new frame anywhere, so drop it and return an
           // error
-          decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
-          cm->cur_frame = NULL;
+          cm->cur_frame->buf.corrupted = 1;
+          decrease_ref_count(cm->cur_frame, pool);
           cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
         } else {
-          pbi->output_frames[pbi->num_output_frames] = cur_frame;
-          pbi->output_frame_index[pbi->num_output_frames] = cm->new_fb_idx;
+          pbi->output_frames[pbi->num_output_frames] = cm->cur_frame;
           pbi->num_output_frames++;
         }
       } else {
         // Replace any existing output frame
         assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1);
         if (pbi->num_output_frames > 0) {
-          decrease_ref_count(pbi->output_frame_index[0], frame_bufs, pool);
+          decrease_ref_count(pbi->output_frames[0], pool);
         }
-        pbi->output_frames[0] = cur_frame;
-        pbi->output_frame_index[0] = cm->new_fb_idx;
+        pbi->output_frames[0] = cm->cur_frame;
         pbi->num_output_frames = 1;
       }
     } else {
-      decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
-      cm->cur_frame = NULL;
+      decrease_ref_count(cm->cur_frame, pool);
     }
 
     unlock_buffer_pool(pool);
@@ -420,17 +417,17 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
     assert(IMPLIES(!pbi->camera_frame_header_ready, !pbi->hold_ref_buf));
     // Nothing was decoded, so just drop this frame buffer
     lock_buffer_pool(pool);
-    decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
-    cm->cur_frame = NULL;
+    decrease_ref_count(cm->cur_frame, pool);
     unlock_buffer_pool(pool);
   }
+  cm->cur_frame = NULL;
 
   if (!pbi->camera_frame_header_ready) {
     pbi->hold_ref_buf = 0;
 
     // Invalidate these references until the next frame starts.
     for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
-      cm->current_frame.frame_refs[ref_index].buf = NULL;
+      cm->remapped_ref_idx[ref_index] = INVALID_IDX;
     }
   }
 }
@@ -438,7 +435,6 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
 int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
                                 const uint8_t **psource) {
   AV1_COMMON *volatile const cm = &pbi->common;
-  BufferPool *volatile const pool = cm->buffer_pool;
   const uint8_t *source = *psource;
   cm->error.error_code = AOM_CODEC_OK;
   cm->error.has_detail = 0;
@@ -452,24 +448,15 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
     // TODO(jkoleszar): Error concealment is undefined and non-normative
     // at this point, but if it becomes so, [0] may not always be the correct
     // thing to do here.
-    if (cm->current_frame.frame_refs[0].buf != NULL) {
-      cm->current_frame.frame_refs[0].buf->buf.corrupted = 1;
-    }
+    RefCntBuffer *ref_buf = get_ref_frame_buf(cm, LAST_FRAME);
+    if (ref_buf != NULL) ref_buf->buf.corrupted = 1;
   }
 
-  // Find a free buffer for the new frame, releasing the reference previously
-  // held.
-
-  // Find a free frame buffer. Return error if can not find any.
-  cm->new_fb_idx = get_free_fb(cm);
-  if (cm->new_fb_idx == INVALID_IDX) {
+  if (assign_cur_frame_new_fb(cm) == NULL) {
     cm->error.error_code = AOM_CODEC_MEM_ERROR;
     return 1;
   }
 
-  // Assign a MV array to the frame buffer.
-  cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
-
   if (!pbi->camera_frame_header_ready) pbi->hold_ref_buf = 0;
 
   // The jmp_buf is valid only for the duration of the function that calls
@@ -514,7 +501,7 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
   cm->txb_count = 0;
 #endif
 
-  // Note: At this point, this function holds a reference to cm->new_fb_idx
+  // Note: At this point, this function holds a reference to cm->cur_frame
   // in the buffer pool. This reference is consumed by swap_frame_buffers().
   swap_frame_buffers(pbi, frame_decoded);
 
@@ -541,10 +528,6 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
   }
 
   // Update progress in frame parallel decode.
-  cm->last_width = cm->width;
-  cm->last_height = cm->height;
-  cm->last_tile_cols = cm->tile_cols;
-  cm->last_tile_rows = cm->tile_rows;
   cm->error.setjmp = 0;
 
   return 0;
@@ -553,11 +536,9 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
 // Get the frame at a particular index in the output queue
 int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
                       aom_film_grain_t **grain_params) {
-  RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs;
-
   if (index >= pbi->num_output_frames) return -1;
-  *sd = pbi->output_frames[index];
-  *grain_params = &frame_bufs[pbi->output_frame_index[index]].film_grain_params;
+  *sd = &pbi->output_frames[index]->buf;
+  *grain_params = &pbi->output_frames[index]->film_grain_params;
   aom_clear_system_state();
   return 0;
 }
@@ -567,6 +548,6 @@ int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
 int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) {
   if (pbi->num_output_frames == 0) return -1;
 
-  *frame = *pbi->output_frames[pbi->num_output_frames - 1];
+  *frame = pbi->output_frames[pbi->num_output_frames - 1]->buf;
   return 0;
 }
diff --git a/libaom/av1/decoder/decoder.h b/libaom/av1/decoder/decoder.h
index 6ca28e7..685c931 100644
--- a/libaom/av1/decoder/decoder.h
+++ b/libaom/av1/decoder/decoder.h
@@ -48,11 +48,9 @@ typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
                                                    MACROBLOCKD *const xd);
 
 typedef struct ThreadData {
-  aom_reader *bit_reader;
   DECLARE_ALIGNED(32, MACROBLOCKD, xd);
-  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
   CB_BUFFER cb_buffer_base;
+  aom_reader *bit_reader;
   uint8_t *mc_buf[2];
   int32_t mc_buf_size;
   int mc_buf_use_highbd;  // Boolean: whether the byte pointers stored in
@@ -163,8 +161,6 @@ typedef struct AV1Decoder {
 
   DECLARE_ALIGNED(32, AV1_COMMON, common);
 
-  int refresh_frame_flags;
-
   AVxWorker lf_worker;
   AV1LfSync lf_row_sync;
   AV1LrSync lr_row_sync;
@@ -190,8 +186,7 @@ typedef struct AV1Decoder {
   // Note: The saved buffers are released at the start of the next time the
   // application calls aom_codec_decode().
   int output_all_layers;
-  YV12_BUFFER_CONFIG *output_frames[MAX_NUM_SPATIAL_LAYERS];
-  int output_frame_index[MAX_NUM_SPATIAL_LAYERS];  // Buffer pool indices
+  RefCntBuffer *output_frames[MAX_NUM_SPATIAL_LAYERS];
   size_t num_output_frames;  // How many frames are queued up so far?
 
   // In order to properly support random-access decoding, we need
@@ -205,6 +200,7 @@ typedef struct AV1Decoder {
   int need_resync;   // wait for key/intra-only frame.
   int hold_ref_buf;  // Boolean: whether we are holding reference buffers in
                      // common.next_ref_frame_map.
+  int reset_decoder_state;
 
   int tile_size_bytes;
   int tile_col_size_bytes;
@@ -283,23 +279,22 @@ void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync);
 
 void av1_dec_free_cb_buf(AV1Decoder *pbi);
 
-static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+static INLINE void decrease_ref_count(RefCntBuffer *const buf,
                                       BufferPool *const pool) {
-  if (idx >= 0) {
-    --frame_bufs[idx].ref_count;
+  if (buf != NULL) {
+    --buf->ref_count;
     // Reference counts should never become negative. If this assertion fails,
     // there is a bug in our reference count management.
-    assert(frame_bufs[idx].ref_count >= 0);
+    assert(buf->ref_count >= 0);
     // A worker may only get a free framebuffer index when calling get_free_fb.
     // But the raw frame buffer is not set up until we finish decoding header.
     // So if any error happens during decoding header, frame_bufs[idx] will not
     // have a valid raw frame buffer.
-    if (frame_bufs[idx].ref_count == 0 &&
-        frame_bufs[idx].raw_frame_buffer.data) {
-      pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
-      frame_bufs[idx].raw_frame_buffer.data = NULL;
-      frame_bufs[idx].raw_frame_buffer.size = 0;
-      frame_bufs[idx].raw_frame_buffer.priv = NULL;
+    if (buf->ref_count == 0 && buf->raw_frame_buffer.data) {
+      pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
+      buf->raw_frame_buffer.data = NULL;
+      buf->raw_frame_buffer.size = 0;
+      buf->raw_frame_buffer.priv = NULL;
     }
   }
 }
diff --git a/libaom/av1/decoder/decodetxb.c b/libaom/av1/decoder/decodetxb.c
index f3ef2d5..223e32e 100644
--- a/libaom/av1/decoder/decodetxb.c
+++ b/libaom/av1/decoder/decodetxb.c
@@ -136,6 +136,15 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
   uint16_t *const max_scan_line = &(eob_data->max_scan_line);
   *max_scan_line = 0;
   *eob = 0;
+
+#if CONFIG_INSPECTION
+  if (plane == 0) {
+    const int txk_type_idx =
+        av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+    mbmi->tx_skip[txk_type_idx] = all_zero;
+  }
+#endif
+
   if (all_zero) {
     *max_scan_line = 0;
     if (plane == 0) {
@@ -146,9 +155,6 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
     return 0;
   }
 
-  memset(levels_buf, 0,
-         sizeof(*levels_buf) *
-             ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END));
   if (plane == AOM_PLANE_Y) {
     // only y plane's tx_type is transmitted
     av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r);
@@ -214,23 +220,30 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
       break;
   }
 
-  if (k_eob_offset_bits[eob_pt] > 0) {
+  const int eob_offset_bits = k_eob_offset_bits[eob_pt];
+  if (eob_offset_bits > 0) {
     const int eob_ctx = eob_pt - 3;
     int bit = aom_read_symbol(
         r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR);
     if (bit) {
-      eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1));
+      eob_extra += (1 << (eob_offset_bits - 1));
     }
 
-    for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) {
+    for (int i = 1; i < eob_offset_bits; i++) {
       bit = aom_read_bit(r, ACCT_STR);
       if (bit) {
-        eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1 - i));
+        eob_extra += (1 << (eob_offset_bits - 1 - i));
       }
     }
   }
   *eob = rec_eob_pos(eob_pt, eob_extra);
 
+  if (*eob > 1) {
+    memset(levels_buf, 0,
+           sizeof(*levels_buf) *
+               ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END));
+  }
+
   {
     // Read the non-zero coefficient with scan index eob-1
     // TODO(angiebird): Put this into a function
@@ -242,12 +255,10 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
         ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx];
     int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1;
     if (level > NUM_BASE_LEVELS) {
-      const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+      const int br_ctx = get_br_ctx_eob(pos, bwl, tx_class);
+      cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
       for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
-        const int k = aom_read_symbol(
-            r,
-            ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx],
-            BR_CDF_SIZE, ACCT_STR);
+        const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
         level += k;
         if (k < BR_CDF_SIZE - 1) break;
       }
@@ -269,13 +280,6 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
     }
   }
 
-  int16_t num_zero_coeffs = 0;
-  for (int c = 0; c < *eob; ++c) {
-    const int pos = scan[c];
-    num_zero_coeffs = AOMMAX(num_zero_coeffs, pos);
-  }
-  memset(tcoeffs, 0, (num_zero_coeffs + 1) * sizeof(tcoeffs[0]));
-
   for (int c = 0; c < *eob; ++c) {
     const int pos = scan[c];
     uint8_t sign;
diff --git a/libaom/av1/decoder/inspection.c b/libaom/av1/decoder/inspection.c
index 17a9f98..eeed1d3 100644
--- a/libaom/av1/decoder/inspection.c
+++ b/libaom/av1/decoder/inspection.c
@@ -33,7 +33,7 @@ void ifd_clear(insp_frame_data *fd) {
 
 /* TODO(negge) This function may be called by more than one thread when using
                a multi-threaded decoder and this may cause a data race. */
-int ifd_inspect(insp_frame_data *fd, void *decoder) {
+int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform) {
   struct AV1Decoder *pbi = (struct AV1Decoder *)decoder;
   AV1_COMMON *const cm = &pbi->common;
 
@@ -82,6 +82,9 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) {
       mi->ref_frame[1] = mbmi->ref_frame[1];
       // Prediction Mode
       mi->mode = mbmi->mode;
+      mi->intrabc = (int16_t)mbmi->use_intrabc;
+      mi->palette = (int16_t)mbmi->palette_mode_info.palette_size[0];
+      mi->uv_palette = (int16_t)mbmi->palette_mode_info.palette_size[1];
       // Prediction Mode for Chromatic planes
       if (mi->mode < INTRA_MODES) {
         mi->uv_mode = mbmi->uv_mode;
@@ -111,13 +114,19 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) {
       else
         mi->tx_size = mbmi->tx_size;
 
+      if (skip_not_transform && mi->skip) mi->tx_size = -1;
+
       mi->tx_type =
           (mi->skip ? 0 : mbmi->txk_type[av1_get_txk_type_index(bsize, r, c)]);
+      if (skip_not_transform &&
+          (mi->skip || mbmi->tx_skip[av1_get_txk_type_index(bsize, r, c)]))
+        mi->tx_type = -1;
 
       mi->cdef_level = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] /
                        CDEF_SEC_STRENGTHS;
       mi->cdef_strength = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] %
                           CDEF_SEC_STRENGTHS;
+
       mi->cdef_strength += mi->cdef_strength == 3;
       if (mbmi->uv_mode == UV_CFL_PRED) {
         mi->cfl_alpha_idx = mbmi->cfl_alpha_idx;
diff --git a/libaom/av1/decoder/inspection.h b/libaom/av1/decoder/inspection.h
index 0c6f3ad..b963f6a 100644
--- a/libaom/av1/decoder/inspection.h
+++ b/libaom/av1/decoder/inspection.h
@@ -52,6 +52,9 @@ struct insp_mi_data {
   int16_t current_qindex;
   int16_t compound_type;
   int16_t motion_mode;
+  int16_t intrabc;
+  int16_t palette;
+  int16_t uv_palette;
 };
 
 typedef struct insp_frame_data insp_frame_data;
@@ -80,7 +83,7 @@ struct insp_frame_data {
 
 void ifd_init(insp_frame_data *fd, int frame_width, int frame_height);
 void ifd_clear(insp_frame_data *fd);
-int ifd_inspect(insp_frame_data *fd, void *decoder);
+int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/decoder/obu.c b/libaom/av1/decoder/obu.c
index d892dc4..aaea572 100644
--- a/libaom/av1/decoder/obu.c
+++ b/libaom/av1/decoder/obu.c
@@ -26,7 +26,7 @@
 #include "av1/decoder/obu.h"
 
 // Picture prediction structures (0-12 are predefined) in scalability metadata.
-typedef enum {
+enum {
   SCALABILITY_L1T2 = 0,
   SCALABILITY_L1T3 = 1,
   SCALABILITY_L2T1 = 2,
@@ -42,7 +42,7 @@ typedef enum {
   SCALABILITY_S2T2h = 12,
   SCALABILITY_S2T3h = 13,
   SCALABILITY_SS = 14
-} SCALABILITY_STRUCTURES;
+} UENUM1BYTE(SCALABILITY_STRUCTURES);
 
 aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
     int operating_point_idc, unsigned int *number_spatial_layers,
@@ -98,12 +98,10 @@ static int byte_alignment(AV1_COMMON *const cm,
 static uint32_t read_temporal_delimiter_obu() { return 0; }
 
 // Returns a boolean that indicates success.
-static int read_bitstream_level(BitstreamLevel *bl,
+static int read_bitstream_level(AV1_LEVEL *seq_level_idx,
                                 struct aom_read_bit_buffer *rb) {
-  const uint8_t seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);
-  if (!is_valid_seq_level_idx(seq_level_idx)) return 0;
-  bl->major = (seq_level_idx >> LEVEL_MINOR_BITS) + LEVEL_MAJOR_MIN;
-  bl->minor = seq_level_idx & ((1 << LEVEL_MINOR_BITS) - 1);
+  *seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);
+  if (!is_valid_seq_level_idx(*seq_level_idx)) return 0;
   return 1;
 }
 
@@ -151,7 +149,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
     seq_params->display_model_info_present_flag = 0;
     seq_params->operating_points_cnt_minus_1 = 0;
     seq_params->operating_point_idc[0] = 0;
-    if (!read_bitstream_level(&seq_params->level[0], rb)) {
+    if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) {
       cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
       return 0;
     }
@@ -175,13 +173,13 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
     for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
       seq_params->operating_point_idc[i] =
           aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
-      if (!read_bitstream_level(&seq_params->level[i], rb)) {
+      if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) {
         cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
         return 0;
       }
       // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7
       // is equivalent to level 3.3.
-      if (seq_params->level[i].major > 3)
+      if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0)
         seq_params->tier[i] = aom_rb_read_bit(rb);
       else
         seq_params->tier[i] = 0;
@@ -195,10 +193,9 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
       if (cm->timing_info_present &&
           (cm->timing_info.equal_picture_interval ||
            cm->op_params[i].decoder_model_param_present_flag)) {
-        cm->op_params[i].bitrate = max_level_bitrate(
-            seq_params->profile,
-            major_minor_to_seq_level_idx(seq_params->level[i]),
-            seq_params->tier[i]);
+        cm->op_params[i].bitrate =
+            max_level_bitrate(seq_params->profile, seq_params->seq_level_idx[i],
+                              seq_params->tier[i]);
         // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
         // the check
         if (cm->op_params[i].bitrate == 0)
@@ -364,8 +361,10 @@ static void alloc_tile_list_buffer(AV1Decoder *pbi) {
   // image format 4:2:0, the output frame of U plane and V plane is 1/4 of the
   // output frame.
   AV1_COMMON *const cm = &pbi->common;
-  const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
-  const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
+  int tile_width, tile_height;
+  av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+  const int tile_width_in_pixels = tile_width * MI_SIZE;
+  const int tile_height_in_pixels = tile_height * MI_SIZE;
   const int output_frame_width =
       (pbi->output_frame_width_in_tiles_minus_1 + 1) * tile_width_in_pixels;
   const int output_frame_height =
@@ -415,8 +414,10 @@ static void yv12_tile_copy(const YV12_BUFFER_CONFIG *src, int hstart1,
 static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi,
                                                   int tile_idx) {
   AV1_COMMON *const cm = &pbi->common;
-  const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
-  const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
+  int tile_width, tile_height;
+  av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+  const int tile_width_in_pixels = tile_width * MI_SIZE;
+  const int tile_height_in_pixels = tile_height * MI_SIZE;
   const int ssy = cm->seq_params.subsampling_y;
   const int ssx = cm->seq_params.subsampling_x;
   const int num_planes = av1_num_planes(cm);
diff --git a/libaom/av1/encoder/aq_cyclicrefresh.c b/libaom/av1/encoder/aq_cyclicrefresh.c
index 8d96b23..bfb2a90 100644
--- a/libaom/av1/encoder/aq_cyclicrefresh.c
+++ b/libaom/av1/encoder/aq_cyclicrefresh.c
@@ -31,9 +31,9 @@ struct CYCLIC_REFRESH {
   // excess of the cycle time, i.e., in the case of all zero motion, block
   // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
   int time_for_refresh;
-  // Target number of (8x8) blocks that are set for delta-q.
+  // Target number of (4x4) blocks that are set for delta-q.
   int target_num_seg_blocks;
-  // Actual number of (8x8) blocks that were applied delta-q.
+  // Actual number of (4x4) blocks that were applied delta-q.
   int actual_num_seg1_blocks;
   int actual_num_seg2_blocks;
   // RD mult. parameters for segment 1.
@@ -55,6 +55,8 @@ struct CYCLIC_REFRESH {
   int rate_boost_fac;
   double low_content_avg;
   int qindex_delta[3];
+  double weight_segment;
+  int apply_cyclic_refresh;
 };
 
 CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
@@ -87,27 +89,6 @@ void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
   }
 }
 
-// Check if we should turn off cyclic refresh based on bitrate condition.
-static int apply_cyclic_refresh_bitrate(const AV1_COMMON *cm,
-                                        const RATE_CONTROL *rc) {
-  // Turn off cyclic refresh if bits available per frame is not sufficiently
-  // larger than bit cost of segmentation. Segment map bit cost should scale
-  // with number of seg blocks, so compare available bits to number of blocks.
-  // Average bits available per frame = avg_frame_bandwidth
-  // Number of (8x8) blocks in frame = mi_rows * mi_cols;
-  const float factor = 0.25;
-  const int number_blocks = cm->mi_rows * cm->mi_cols;
-  // The condition below corresponds to turning off at target bitrates:
-  // (at 30fps), ~12kbps for CIF, 36kbps for VGA, 100kps for HD/720p.
-  // Also turn off at very small frame sizes, to avoid too large fraction of
-  // superblocks to be refreshed per frame. Threshold below is less than QCIF.
-  if (rc->avg_frame_bandwidth < factor * number_blocks ||
-      number_blocks / 64 < 5)
-    return 0;
-  else
-    return 1;
-}
-
 // Check if this coding block, of size bsize, should be considered for refresh
 // (lower-qp coding). Decision can be based on various factors, such as
 // size of the coding block (i.e., below min_block size rejected), coding
@@ -158,11 +139,11 @@ int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
   const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   int estimated_bits;
   int mbs = cm->MBs;
-  int num8x8bl = mbs << 2;
+  int num4x4bl = mbs << 4;
   // Weight for non-base segments: use actual number of blocks refreshed in
-  // previous/just encoded frame. Note number of blocks here is in 8x8 units.
-  double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
-  double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
+  // previous/just encoded frame. Note number of blocks here is in 4x4 units.
+  double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl;
+  double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl;
   // Take segment weighted average for estimated bits.
   estimated_bits =
       (int)((1.0 - weight_segment1 - weight_segment2) *
@@ -190,14 +171,14 @@ int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
   const AV1_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   int bits_per_mb;
-  int num8x8bl = cm->MBs << 2;
+  int num4x4bl = cm->MBs << 4;
   // Weight for segment prior to encoding: take the average of the target
   // number for the frame to be encoded and the actual from the previous frame.
   double weight_segment =
       (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks +
                 cr->actual_num_seg2_blocks) >>
                1) /
-      num8x8bl;
+      num4x4bl;
   // Compute delta-q corresponding to qindex i.
   int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
   // Take segment weighted average for bits per mb.
@@ -264,21 +245,6 @@ void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi,
       int map_offset = block_index + y * cm->mi_cols + x;
       cr->map[map_offset] = new_map_value;
       cpi->segmentation_map[map_offset] = mbmi->segment_id;
-      // Inter skip blocks were clearly not coded at the current qindex, so
-      // don't update the map for them. For cases where motion is non-zero or
-      // the reference frame isn't the previous frame, the previous value in
-      // the map for this spatial location is not entirely correct.
-      if ((!is_inter_block(mbmi) || !skip) &&
-          mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
-        cr->last_coded_q_map[map_offset] = clamp(
-            cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
-      } else if (is_inter_block(mbmi) && skip &&
-                 mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
-        cr->last_coded_q_map[map_offset] =
-            AOMMIN(clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id],
-                         0, MAXQ),
-                   cr->last_coded_q_map[map_offset]);
-      }
     }
 }
 
@@ -315,73 +281,6 @@ void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
     rc->baseline_gf_interval = 40;
 }
 
-// Update some encoding stats (from the just encoded frame). If this frame's
-// background has high motion, refresh the golden frame. Otherwise, if the
-// golden reference is to be updated check if we should NOT update the golden
-// ref.
-void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  int mi_row, mi_col;
-  double fraction_low = 0.0;
-  int low_content_frame = 0;
-
-  MB_MODE_INFO **mi;
-  RATE_CONTROL *const rc = &cpi->rc;
-  const int rows = cm->mi_rows, cols = cm->mi_cols;
-  int cnt1 = 0, cnt2 = 0;
-  int force_gf_refresh = 0;
-
-  for (mi_row = 0; mi_row < rows; mi_row++) {
-    mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
-
-    for (mi_col = 0; mi_col < cols; mi_col++) {
-      int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0
-                            ? mi[0]->mv[0].as_mv.row
-                            : -1 * mi[0]->mv[0].as_mv.row;
-      int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0
-                            ? mi[0]->mv[0].as_mv.col
-                            : -1 * mi[0]->mv[0].as_mv.col;
-
-      // Calculate the motion of the background.
-      if (abs_mvr <= 16 && abs_mvc <= 16) {
-        cnt1++;
-        if (abs_mvr == 0 && abs_mvc == 0) cnt2++;
-      }
-      mi++;
-
-      // Accumulate low_content_frame.
-      if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++;
-    }
-  }
-
-  // For video conference clips, if the background has high motion in current
-  // frame because of the camera movement, set this frame as the golden frame.
-  // Use 70% and 5% as the thresholds for golden frame refreshing.
-  if (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1) {
-    av1_cyclic_refresh_set_golden_update(cpi);
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-
-    if (rc->frames_till_gf_update_due > rc->frames_to_key)
-      rc->frames_till_gf_update_due = rc->frames_to_key;
-    cpi->refresh_golden_frame = 1;
-    force_gf_refresh = 1;
-  }
-
-  fraction_low = (double)low_content_frame / (rows * cols);
-  // Update average.
-  cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
-  if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
-    // Don't update golden reference if the amount of low_content for the
-    // current encoded frame is small, or if the recursive average of the
-    // low_content over the update interval window falls below threshold.
-    if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
-      cpi->refresh_golden_frame = 0;
-    // Reset for next internal.
-    cr->low_content_avg = fraction_low;
-  }
-}
-
 // Update the segmentation map, and related quantities: cyclic refresh map,
 // refresh sb_index, and target number of blocks to be refreshed.
 // The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
@@ -458,26 +357,70 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
 
 // Set cyclic refresh parameters.
 void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
+  // TODO(marpan): Parameters need to be tuned.
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int num4x4bl = cm->MBs << 4;
+  int target_refresh = 0;
+  double weight_segment_target = 0;
+  double weight_segment = 0;
+  int qp_thresh = AOMMIN(20, rc->best_quality << 1);
+  cr->apply_cyclic_refresh = 1;
+  if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf) ||
+      rc->avg_frame_qindex[INTER_FRAME] < qp_thresh) {
+    cr->apply_cyclic_refresh = 0;
+    return;
+  }
   cr->percent_refresh = 10;
-  cr->max_qdelta_perc = 50;
+  cr->max_qdelta_perc = 60;
   cr->time_for_refresh = 0;
+  cr->motion_thresh = 32;
+  cr->rate_boost_fac = 15;
   // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
   // periods of the refresh cycle, after a key frame.
-  if (rc->frames_since_key < 4 * cr->percent_refresh)
+  // Account for larger interval on base layer for temporal layers.
+  if (cr->percent_refresh > 0 &&
+      rc->frames_since_key < 400 / cr->percent_refresh) {
     cr->rate_ratio_qdelta = 3.0;
-  else
+  } else {
     cr->rate_ratio_qdelta = 2.0;
-  // Adjust some parameters for low resolutions at low bitrates.
-  if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) {
-    cr->motion_thresh = 4;
+  }
+  // Adjust some parameters for low resolutions.
+  if (cm->width <= 352 && cm->height <= 288) {
+    if (rc->avg_frame_bandwidth < 3000) {
+      cr->motion_thresh = 16;
+      cr->rate_boost_fac = 13;
+    } else {
+      cr->max_qdelta_perc = 70;
+      cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.5);
+    }
+  }
+  if (cpi->oxcf.rc_mode == AOM_VBR) {
+    // To be adjusted for VBR mode, e.g., based on gf period and boost.
+    // For now use smaller qp-delta (than CBR), no second boosted seg, and
+    // turn-off (no refresh) on golden refresh (since it's already boosted).
+    cr->percent_refresh = 10;
+    cr->rate_ratio_qdelta = 1.5;
     cr->rate_boost_fac = 10;
-  } else {
-    cr->motion_thresh = 32;
-    cr->rate_boost_fac = 17;
+    if (cpi->refresh_golden_frame == 1) {
+      cr->percent_refresh = 0;
+      cr->rate_ratio_qdelta = 1.0;
+    }
   }
+  // Weight for segment prior to encoding: take the average of the target
+  // number for the frame to be encoded and the actual from the previous frame.
+  // Use the target if its less. To be used for setting the base qp for the
+  // frame in vp9_rc_regulate_q.
+  target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+  weight_segment_target = (double)(target_refresh) / num4x4bl;
+  weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks +
+                             cr->actual_num_seg2_blocks) >>
+                            1) /
+                   num4x4bl;
+  if (weight_segment_target < 7 * weight_segment / 8)
+    weight_segment = weight_segment_target;
+  cr->weight_segment = weight_segment;
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
@@ -486,7 +429,6 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   struct segmentation *const seg = &cm->seg;
-  const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
   int resolution_change =
       cm->prev_frame && (cm->width != cm->prev_frame->width ||
                          cm->height != cm->prev_frame->height);
@@ -498,8 +440,7 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
     return;
   }
   if (cm->current_frame.frame_number == 0) cr->low_content_avg = 0.0;
-  // Don't apply refresh on key frame or enhancement layer frames.
-  if (!apply_cyclic_refresh || cm->current_frame.frame_type == KEY_FRAME) {
+  if (!cr->apply_cyclic_refresh) {
     // Set segmentation map to 0 and disable.
     unsigned char *const seg_map = cpi->segmentation_map;
     memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
diff --git a/libaom/av1/encoder/aq_cyclicrefresh.h b/libaom/av1/encoder/aq_cyclicrefresh.h
index b457819..ddabae6 100644
--- a/libaom/av1/encoder/aq_cyclicrefresh.h
+++ b/libaom/av1/encoder/aq_cyclicrefresh.h
@@ -54,19 +54,12 @@ void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
                                        int mi_col, BLOCK_SIZE bsize,
                                        int64_t rate, int64_t dist, int skip);
 
-// Update the segmentation map, and related quantities: cyclic refresh map,
-// refresh sb_index, and target number of blocks to be refreshed.
-void av1_cyclic_refresh_update__map(struct AV1_COMP *const cpi);
-
 // Update the actual number of blocks that were applied the segment delta q.
 void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi);
 
 // Set golden frame update interval, for 1 pass CBR mode.
 void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
 
-// Check if we should not update golden reference, based on past refresh stats.
-void av1_cyclic_refresh_check_golden_update(struct AV1_COMP *const cpi);
-
 // Set/update global/frame level refresh parameters.
 void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
 
diff --git a/libaom/av1/encoder/aq_variance.c b/libaom/av1/encoder/aq_variance.c
index cfd7610..d572948 100644
--- a/libaom/av1/encoder/aq_variance.c
+++ b/libaom/av1/encoder/aq_variance.c
@@ -121,7 +121,7 @@ int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
 
   for (i = 0; i < bh; i += 4) {
     for (j = 0; j < bw; j += 4) {
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (is_cur_buf_hbd(xd)) {
         var +=
             log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
@@ -153,7 +153,7 @@ static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
   uint8_t *buf = x->plane[0].src.buf;
   const int bw = MI_SIZE * mi_size_wide[bs];
   const int bh = MI_SIZE * mi_size_high[bs];
-  int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int hbd = is_cur_buf_hbd(xd);
 
   int var = 0;
   for (int r = 0; r < bh; r += 8)
diff --git a/libaom/av1/encoder/av1_multi_thread.c b/libaom/av1/encoder/av1_multi_thread.c
index a0c556e..1260c7a 100644
--- a/libaom/av1/encoder/av1_multi_thread.c
+++ b/libaom/av1/encoder/av1_multi_thread.c
@@ -35,6 +35,14 @@ void av1_row_mt_mem_alloc(AV1_COMP *cpi, int max_sb_rows) {
           &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
                           tile_col];
       av1_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_sb_rows);
+      if (cpi->oxcf.cdf_update_mode)
+        CHECK_MEM_ERROR(
+            cm, this_tile->row_ctx,
+            (FRAME_CONTEXT *)aom_memalign(
+                16,
+                AOMMAX(1, (av1_get_sb_cols_in_tile(cm, this_tile->tile_info) -
+                           1)) *
+                    sizeof(*this_tile->row_ctx)));
     }
   }
 }
@@ -53,6 +61,7 @@ void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
           &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
                           tile_col];
       av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+      if (cpi->oxcf.cdf_update_mode) aom_free(this_tile->row_ctx);
     }
   }
   multi_thread_ctxt->allocated_sb_rows = 0;
diff --git a/libaom/av1/encoder/av1_quantize.c b/libaom/av1/encoder/av1_quantize.c
index 21ab4db..ff1342c 100644
--- a/libaom/av1/encoder/av1_quantize.c
+++ b/libaom/av1/encoder/av1_quantize.c
@@ -41,47 +41,37 @@ static void quantize_fp_helper_c(
     const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
     const qm_val_t *iqm_ptr, int log_scale) {
   int i, eob = -1;
+  const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+                            ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
   (void)zbin_ptr;
   (void)quant_shift_ptr;
+  (void)iscan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (qm_ptr == NULL && iqm_ptr == NULL) {
-    const int rounding0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
-    {  // rc == 0
-      const int coeff = coeff_ptr[0];
-      const int coeff_sign = (coeff >> 31);
-      int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      if ((abs_coeff << (1 + log_scale)) >= (int32_t)(dequant_ptr[0])) {
-        abs_coeff = clamp64(abs_coeff + rounding0, INT16_MIN, INT16_MAX);
-        const int tmp32 = (int)((abs_coeff * quant_ptr[0]) >> (16 - log_scale));
-        if (tmp32) {
-          qcoeff_ptr[0] = (tmp32 ^ coeff_sign) - coeff_sign;
-          const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[0]) >> log_scale;
-          dqcoeff_ptr[0] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
-          eob = 0;
-        }
-      }
-    }
-    const int rounding1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
-    const int32_t thresh1 = (int32_t)(dequant_ptr[1]);
-    for (i = 1; i < n_coeffs; i++) {
-      const int coeff = coeff_ptr[i];
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]);
+      const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
       int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      if ((abs_coeff << (1 + log_scale)) >= thresh1) {
-        abs_coeff = clamp64(abs_coeff + rounding1, INT16_MIN, INT16_MAX);
-        const int tmp32 = (int)((abs_coeff * quant_ptr[1]) >> (16 - log_scale));
+      int tmp32 = 0;
+      if ((abs_coeff << (1 + log_scale)) >= thresh) {
+        abs_coeff =
+            clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX);
+        tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
         if (tmp32) {
-          qcoeff_ptr[i] = (tmp32 ^ coeff_sign) - coeff_sign;
-          const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[1]) >> log_scale;
-          dqcoeff_ptr[i] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
-          eob = AOMMAX(iscan[i], eob);
+          qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+          const tran_low_t abs_dqcoeff =
+              (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
+          dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
         }
       }
+      if (tmp32) eob = i;
     }
   } else {
     // Quantization pass: All coefficients with index >= zero_flag are
@@ -99,7 +89,7 @@ static void quantize_fp_helper_c(
       int tmp32 = 0;
       if (abs_coeff * wt >=
           (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
-        abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+        abs_coeff += rounding[rc != 0];
         abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX);
         tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
                       (16 - log_scale + AOM_QM_BITS));
@@ -275,32 +265,65 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
-  if (qm_ptr != NULL && iqm_ptr != NULL) {
-    quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                        p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                        dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                        sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+  if (qparam->use_quant_b_adapt) {
+    // TODO(sarahparker) These quantize_b optimizations need SIMD
+    // implementations
+    if (qm_ptr != NULL && iqm_ptr != NULL) {
+      quantize_b_adaptive_helper_c(
+          coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+          p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+          sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+    } else {
+      switch (qparam->log_scale) {
+        case 0:
+          aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX,
+                                  p->round_QTX, p->quant_QTX,
+                                  p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+                                  p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+          break;
+        case 1:
+          aom_quantize_b_32x32_adaptive(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        case 2:
+          aom_quantize_b_64x64_adaptive_c(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        default: assert(0);
+      }
+    }
   } else {
-    switch (qparam->log_scale) {
-      case 0:
-        aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                       p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                       dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                       sc->iscan);
-        break;
-      case 1:
-        aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                             p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                             dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                             sc->iscan);
-        break;
-      case 2:
-        aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                             p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                             dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                             sc->iscan);
-        break;
-      default: assert(0);
+    if (qm_ptr != NULL && iqm_ptr != NULL) {
+      quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                          p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                          dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                          sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+    } else {
+      switch (qparam->log_scale) {
+        case 0:
+          aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                         p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                         dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                         sc->iscan);
+          break;
+        case 1:
+          aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                               sc->iscan);
+          break;
+        case 2:
+          aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                               sc->iscan);
+          break;
+        default: assert(0);
+      }
     }
   }
 }
@@ -391,41 +414,81 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
                                   const QUANT_PARAM *qparam) {
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
-  if (qm_ptr != NULL && iqm_ptr != NULL) {
-    highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                               sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+  if (qparam->use_quant_b_adapt) {
+    if (qm_ptr != NULL && iqm_ptr != NULL) {
+      highbd_quantize_b_adaptive_helper_c(
+          coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+          p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+          sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+    } else {
+      switch (qparam->log_scale) {
+        case 0:
+          if (LIKELY(n_coeffs >= 8)) {
+            aom_highbd_quantize_b_adaptive_c(
+                coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+                p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+                eob_ptr, sc->scan, sc->iscan);
+          } else {
+            // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
+            // quantization
+            aom_highbd_quantize_b_adaptive_c(
+                coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+                p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+                eob_ptr, sc->scan, sc->iscan);
+          }
+          break;
+        case 1:
+          aom_highbd_quantize_b_32x32_adaptive_c(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        case 2:
+          aom_highbd_quantize_b_64x64_adaptive_c(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        default: assert(0);
+      }
+    }
   } else {
-    switch (qparam->log_scale) {
-      case 0:
-        if (LIKELY(n_coeffs >= 8)) {
-          aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                                p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                                dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                                sc->iscan);
-        } else {
-          // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
-          // quantization
-          aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX,
+    if (qm_ptr != NULL && iqm_ptr != NULL) {
+      highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                                 p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                                 dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                                 sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+    } else {
+      switch (qparam->log_scale) {
+        case 0:
+          if (LIKELY(n_coeffs >= 8)) {
+            aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX,
                                   p->round_QTX, p->quant_QTX,
                                   p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
                                   p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
-        }
-        break;
-      case 1:
-        aom_highbd_quantize_b_32x32(
-            coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
-            p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
-            eob_ptr, sc->scan, sc->iscan);
-        break;
-      case 2:
-        aom_highbd_quantize_b_64x64(
-            coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
-            p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
-            eob_ptr, sc->scan, sc->iscan);
-        break;
-      default: assert(0);
+          } else {
+            // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
+            // quantization
+            aom_highbd_quantize_b_c(
+                coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+                p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+                eob_ptr, sc->scan, sc->iscan);
+          }
+          break;
+        case 1:
+          aom_highbd_quantize_b_32x32(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        case 2:
+          aom_highbd_quantize_b_64x64(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        default: assert(0);
+      }
     }
   }
 }
diff --git a/libaom/av1/encoder/av1_quantize.h b/libaom/av1/encoder/av1_quantize.h
index fb53881..6419265 100644
--- a/libaom/av1/encoder/av1_quantize.h
+++ b/libaom/av1/encoder/av1_quantize.h
@@ -22,11 +22,15 @@
 extern "C" {
 #endif
 
+#define EOB_FACTOR 325
+#define SKIP_EOB_FACTOR_ADJUST 200
+
 typedef struct QUANT_PARAM {
   int log_scale;
   TX_SIZE tx_size;
   const qm_val_t *qmatrix;
   const qm_val_t *iqmatrix;
+  int use_quant_b_adapt;
 } QUANT_PARAM;
 
 typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
diff --git a/libaom/av1/encoder/bitstream.c b/libaom/av1/encoder/bitstream.c
index df79b79..cbac2b2 100644
--- a/libaom/av1/encoder/bitstream.c
+++ b/libaom/av1/encoder/bitstream.c
@@ -145,7 +145,7 @@ static void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,
 static void write_tx_size_vartx(MACROBLOCKD *xd, const MB_MODE_INFO *mbmi,
                                 TX_SIZE tx_size, int depth, int blk_row,
                                 int blk_col, aom_writer *w) {
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
   const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
   const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
 
@@ -369,10 +369,18 @@ static void pack_txb_tokens(aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x,
                                                          blk_col)];
 
   if (tx_size == plane_tx_size || plane) {
-    tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
-    const uint16_t eob = x->mbmi_ext->eobs[plane][block];
-    TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
-                        x->mbmi_ext->dc_sign_ctx[plane][block] };
+    const int txb_offset =
+        x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+    tran_low_t *tcoeff_txb =
+        x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset;
+    uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset;
+    uint8_t *txb_skip_ctx_txb =
+        x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset;
+    int *dc_sign_ctx_txb =
+        x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset;
+    tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block);
+    const uint16_t eob = eob_txb[block];
+    TXB_CTX txb_ctx = { txb_skip_ctx_txb[block], dc_sign_ctx_txb[block] };
     av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff,
                          eob, &txb_ctx);
 #if CONFIG_RD_DEBUG
@@ -460,7 +468,7 @@ static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi,
     // changing from lossless to lossy.
     assert(is_inter_block(mbmi) || !cpi->has_lossless_segment);
 
-    set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row,
+    set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type, mi_row,
                            mi_col, pred);
     set_spatial_segment_id(cm, cpi->segmentation_map, mbmi->sb_type, mi_row,
                            mi_col, pred);
@@ -473,7 +481,7 @@ static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi,
       av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1);
   aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
   aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS);
-  set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row,
+  set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type, mi_row,
                          mi_col, mbmi->segment_id);
 }
 
@@ -627,7 +635,7 @@ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,
           av1_extract_interp_filter(mbmi->interp_filters, dir);
       aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
                        SWITCHABLE_FILTERS);
-      ++cpi->interp_filter_selected[0][filter];
+      ++cm->cur_frame->interp_filter_selected[filter];
       if (cm->seq_params.enable_dual_filter == 0) return;
     }
   }
@@ -867,14 +875,7 @@ static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx,
 
 static void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, aom_writer *w,
                        int skip, int mi_col, int mi_row) {
-  if (cm->coded_lossless || cm->allow_intrabc) {
-    // Initialize to indicate no CDEF for safety.
-    cm->cdef_info.cdef_bits = 0;
-    cm->cdef_info.cdef_strengths[0] = 0;
-    cm->cdef_info.nb_cdef_strengths = 1;
-    cm->cdef_info.cdef_uv_strengths[0] = 0;
-    return;
-  }
+  if (cm->coded_lossless || cm->allow_intrabc) return;
 
   const int m = ~((1 << (6 - MI_SIZE_LOG2)) - 1);
   const MB_MODE_INFO *mbmi =
@@ -903,7 +904,7 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
                                    int mi_row, int mi_col, int skip,
                                    int preskip) {
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   AV1_COMMON *const cm = &cpi->common;
 
   if (seg->update_map) {
@@ -913,7 +914,7 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
       if (seg->segid_preskip) return;
       if (skip) {
         write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 1);
-        if (seg->temporal_update) ((MB_MODE_INFO *)mbmi)->seg_id_predicted = 0;
+        if (seg->temporal_update) mbmi->seg_id_predicted = 0;
         return;
       }
     }
@@ -925,7 +926,7 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
         write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
       }
       if (pred_flag) {
-        set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type,
+        set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type,
                                mi_row, mi_col, mbmi->segment_id);
       }
     } else {
@@ -1134,7 +1135,7 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
     if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w);
 
     // First write idx to indicate current compound inter prediction mode group
-    // Group A (0): jnt_comp, compound_average
+    // Group A (0): dist_wtd_comp, compound_average
     // Group B (1): interintra, compound_diffwtd, wedge
     if (has_second_ref(mbmi)) {
       const int masked_compound_used = is_any_masked_compound_used(bsize) &&
@@ -1152,7 +1153,7 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
         if (mbmi->compound_idx)
           assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE);
 
-        if (cm->seq_params.order_hint_info.enable_jnt_comp) {
+        if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) {
           const int comp_index_ctx = get_comp_index_context(cm, xd);
           aom_write_symbol(w, mbmi->compound_idx,
                            ec_ctx->compound_index_cdf[comp_index_ctx], 2);
@@ -1169,9 +1170,9 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
                mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
 
         if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
-          aom_write_symbol(w, mbmi->interinter_comp.type - 1,
+          aom_write_symbol(w, mbmi->interinter_comp.type - COMPOUND_WEDGE,
                            ec_ctx->compound_type_cdf[bsize],
-                           COMPOUND_TYPES - 1);
+                           MASKED_COMPOUND_TYPES);
 
         if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
           assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
@@ -1185,7 +1186,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
         }
       }
     }
-
     write_mb_interp_filter(cpi, xd, w);
   }
 }
@@ -1237,13 +1237,14 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
 }
 
 #if CONFIG_RD_DEBUG
-static void dump_mode_info(MODE_INFO *mi) {
+static void dump_mode_info(MB_MODE_INFO *mi) {
   printf("\nmi->mi_row == %d\n", mi->mi_row);
   printf("&& mi->mi_col == %d\n", mi->mi_col);
   printf("&& mi->sb_type == %d\n", mi->sb_type);
   printf("&& mi->tx_size == %d\n", mi->tx_size);
   printf("&& mi->mode == %d\n", mi->mode);
 }
+
 static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
                                    int plane) {
   if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) {
@@ -1274,30 +1275,28 @@ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
 #if ENC_MISMATCH_DEBUG
 static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-  xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
-  const MB_MODE_INFO *const *mbmi = xd->mi[0];
+  const MB_MODE_INFO *const *mbmi =
+      *(cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col));
+  const MB_MODE_INFO_EXT *const *mbmi_ext =
+      cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
   if (is_inter_block(mbmi)) {
 #define FRAME_TO_CHECK 11
     if (cm->current_frame.frame_number == FRAME_TO_CHECK &&
         cm->show_frame == 1) {
       const BLOCK_SIZE bsize = mbmi->sb_type;
 
-      int_mv mv[2];
-      int is_comp_ref = has_second_ref(mbmi);
-      int ref;
+      int_mv mv[2] = { 0 };
+      const int is_comp_ref = has_second_ref(mbmi);
 
-      for (ref = 0; ref < 1 + is_comp_ref; ++ref)
+      for (int ref = 0; ref < 1 + is_comp_ref; ++ref)
         mv[ref].as_mv = mbmi->mv[ref].as_mv;
 
       if (!is_comp_ref) {
         mv[1].as_int = 0;
       }
 
-      MACROBLOCK *const x = &cpi->td.mb;
-      const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
       const int16_t mode_ctx =
-          is_comp_ref ? mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]
+          is_comp_ref ? 0
                       : av1_mode_context_analyzer(mbmi_ext->mode_context,
                                                   mbmi->ref_frame);
 
@@ -1479,14 +1478,16 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
                                   row, col, &block[plane], plane);
           }
         }
+      }
 #if CONFIG_RD_DEBUG
+      for (plane = 0; plane < num_planes && is_inter_block(mbmi); ++plane) {
         if (mbmi->sb_type >= BLOCK_8X8 &&
             rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
-          dump_mode_info(m);
+          dump_mode_info(mbmi);
           assert(0);
         }
-#endif  // CONFIG_RD_DEBUG
       }
+#endif  // CONFIG_RD_DEBUG
     }
   }
 }
@@ -1875,8 +1876,8 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
   assert(!cm->all_lossless);
 
   const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
-  WienerInfo *wiener_info = xd->wiener_info + plane;
-  SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane;
+  WienerInfo *ref_wiener_info = &xd->wiener_info[plane];
+  SgrprojInfo *ref_sgrproj_info = &xd->sgrproj_info[plane];
   RestorationType unit_rtype = rui->restoration_type;
 
   if (frame_rtype == RESTORE_SWITCHABLE) {
@@ -1887,10 +1888,10 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
 #endif
     switch (unit_rtype) {
       case RESTORE_WIENER:
-        write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w);
+        write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
         break;
       case RESTORE_SGRPROJ:
-        write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w);
+        write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
         break;
       default: assert(unit_rtype == RESTORE_NONE); break;
     }
@@ -1901,7 +1902,7 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
     ++counts->wiener_restore[unit_rtype != RESTORE_NONE];
 #endif
     if (unit_rtype != RESTORE_NONE) {
-      write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w);
+      write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
     }
   } else if (frame_rtype == RESTORE_SGRPROJ) {
     aom_write_symbol(w, unit_rtype != RESTORE_NONE,
@@ -1910,7 +1911,7 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
     ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE];
 #endif
     if (unit_rtype != RESTORE_NONE) {
-      write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w);
+      write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
     }
   }
 }
@@ -1941,13 +1942,9 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
     aom_wb_write_bit(wb, lf->mode_ref_delta_update);
 
     if (lf->mode_ref_delta_update) {
-      const int prime_idx = cm->primary_ref_frame;
-      const RefCntBuffer *const buf =
-          prime_idx == PRIMARY_REF_NONE
-              ? NULL
-              : cm->current_frame.frame_refs[prime_idx].buf;
+      const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
       int8_t last_ref_deltas[REF_FRAMES];
-      if (prime_idx == PRIMARY_REF_NONE || buf == NULL) {
+      if (buf == NULL) {
         av1_set_default_ref_deltas(last_ref_deltas);
       } else {
         memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES);
@@ -1960,7 +1957,7 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
       }
 
       int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
-      if (prime_idx == PRIMARY_REF_NONE || buf == NULL) {
+      if (buf == NULL) {
         av1_set_default_mode_deltas(last_mode_deltas);
       } else {
         memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
@@ -2076,15 +2073,6 @@ static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
   }
 }
 
-static void write_tx_mode(AV1_COMMON *cm, TX_MODE *mode,
-                          struct aom_write_bit_buffer *wb) {
-  if (cm->coded_lossless) {
-    *mode = ONLY_4X4;
-    return;
-  }
-  aom_wb_write_bit(wb, *mode == TX_MODE_SELECT);
-}
-
 static void write_frame_interp_filter(InterpFilter filter,
                                       struct aom_write_bit_buffer *wb) {
   aom_wb_write_bit(wb, filter == SWITCHABLE);
@@ -2092,29 +2080,6 @@ static void write_frame_interp_filter(InterpFilter filter,
     aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
 }
 
-static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) {
-  if (cm->interp_filter == SWITCHABLE) {
-    // Check to see if only one of the filters is actually used
-    int count[SWITCHABLE_FILTERS];
-    int i, j, c = 0;
-    for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
-      count[i] = 0;
-      for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
-        count[i] += counts->switchable_interp[j][i];
-      c += (count[i] > 0);
-    }
-    if (c == 1) {
-      // Only one filter is used. So set the filter at frame level
-      for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
-        if (count[i]) {
-          if (i == EIGHTTAP_REGULAR) cm->interp_filter = i;
-          break;
-        }
-      }
-    }
-  }
-}
-
 // Same function as write_uniform but writing to uncompresses header wb
 static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) {
   const int l = get_unsigned_bits(n);
@@ -2212,63 +2177,12 @@ static void write_ext_tile_info(const AV1_COMMON *const cm,
   }
 }
 
-static int get_refresh_mask(AV1_COMP *cpi) {
-  if ((cpi->common.current_frame.frame_type == KEY_FRAME &&
-       cpi->common.show_frame) ||
-      frame_is_sframe(&cpi->common))
-    return 0xFF;
-
-  int refresh_mask = 0;
-
-  // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be
-  // notified to get LAST3_FRAME refreshed and then the virtual indexes for all
-  // the 3 LAST reference frames will be updated accordingly, i.e.:
-  // (1) The original virtual index for LAST3_FRAME will become the new virtual
-  //     index for LAST_FRAME; and
-  // (2) The original virtual indexes for LAST_FRAME and LAST2_FRAME will be
-  //     shifted and become the new virtual indexes for LAST2_FRAME and
-  //     LAST3_FRAME.
-  refresh_mask |=
-      (cpi->refresh_last_frame << get_ref_frame_map_idx(cpi, LAST3_FRAME));
-
-#if USE_SYMM_MULTI_LAYER
-  const int bwd_ref_frame =
-      (cpi->new_bwdref_update_rule == 1) ? EXTREF_FRAME : BWDREF_FRAME;
-#else
-  const int bwd_ref_frame = BWDREF_FRAME;
-#endif
-  refresh_mask |=
-      (cpi->refresh_bwd_ref_frame << get_ref_frame_map_idx(cpi, bwd_ref_frame));
-
-  refresh_mask |= (cpi->refresh_alt2_ref_frame
-                   << get_ref_frame_map_idx(cpi, ALTREF2_FRAME));
-
-  if (av1_preserve_existing_gf(cpi)) {
-    // We have decided to preserve the previously existing golden frame as our
-    // new ARF frame. However, in the short term we leave it in the GF slot and,
-    // if we're updating the GF with the current decoded frame, we save it
-    // instead to the ARF slot.
-    // Later, in the function av1_encoder.c:av1_update_reference_frames() we
-    // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it
-    // there so that it can be done outside of the recode loop.
-    // Note: This is highly specific to the use of ARF as a forward reference,
-    // and this needs to be generalized as other uses are implemented
-    // (like RTC/temporal scalability).
-
-    if (cpi->preserve_arf_as_gld) {
-      return refresh_mask;
-    } else {
-      return refresh_mask | (cpi->refresh_golden_frame
-                             << get_ref_frame_map_idx(cpi, ALTREF_FRAME));
-    }
-  } else {
-    const int arf_idx = get_ref_frame_map_idx(cpi, ALTREF_FRAME);
-    return refresh_mask |
-           (cpi->refresh_golden_frame
-            << get_ref_frame_map_idx(cpi, GOLDEN_FRAME)) |
-           (cpi->refresh_alt_ref_frame << arf_idx);
-  }
-}
+// Stores the location and size of a tile's data in the bitstream.  Used for
+// later identifying identical tiles
+typedef struct TileBufferEnc {
+  uint8_t *data;
+  size_t size;
+} TileBufferEnc;
 
 static INLINE int find_identical_tile(
     const int tile_row, const int tile_col,
@@ -2289,18 +2203,18 @@ static INLINE int find_identical_tile(
     int col_offset = candidate_offset[0].col;
     int row = tile_row - row_offset;
     int col = tile_col - col_offset;
-    uint8_t tile_hdr;
     const uint8_t *tile_data;
     TileBufferEnc *candidate;
 
     if (row < 0 || col < 0) continue;
 
-    tile_hdr = *(tile_buffers[row][col].data);
+    const uint32_t tile_hdr = mem_get_le32(tile_buffers[row][col].data);
 
-    // Read out tcm bit
-    if ((tile_hdr >> 7) == 1) {
-      // The candidate is a copy tile itself
-      row_offset += tile_hdr & 0x7f;
+    // Read out tile-copy-mode bit:
+    if ((tile_hdr >> 31) == 1) {
+      // The candidate is a copy tile itself: the offset is stored in bits
+      // 30 through 24 inclusive.
+      row_offset += (tile_hdr >> 24) & 0x7f;
       row = tile_row - row_offset;
     }
 
@@ -2370,14 +2284,13 @@ static void write_frame_size(const AV1_COMMON *cm, int frame_size_override,
   write_render_size(cm, wb);
 }
 
-static void write_frame_size_with_refs(AV1_COMP *cpi,
+static void write_frame_size_with_refs(const AV1_COMMON *const cm,
                                        struct aom_write_bit_buffer *wb) {
-  AV1_COMMON *const cm = &cpi->common;
   int found = 0;
 
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
+    const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame);
 
     if (cfg != NULL) {
       found = cm->superres_upscaled_width == cfg->y_crop_width &&
@@ -2539,34 +2452,27 @@ static void write_tu_pts_info(AV1_COMMON *const cm,
       cm->buffer_model.frame_presentation_time_length);
 }
 
-static void write_film_grain_params(AV1_COMP *cpi,
+static void write_film_grain_params(const AV1_COMP *const cpi,
                                     struct aom_write_bit_buffer *wb) {
-  AV1_COMMON *const cm = &cpi->common;
-  aom_film_grain_t *pars = &cm->film_grain_params;
-
-  cm->cur_frame->film_grain_params = *pars;
+  const AV1_COMMON *const cm = &cpi->common;
+  const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params;
 
   aom_wb_write_bit(wb, pars->apply_grain);
   if (!pars->apply_grain) return;
 
   aom_wb_write_literal(wb, pars->random_seed, 16);
 
-  pars->random_seed += 3381;  // Changing random seed for film grain
-  if (!pars->random_seed)     // Random seed should not be zero
-    pars->random_seed += 7391;
   if (cm->current_frame.frame_type == INTER_FRAME)
     aom_wb_write_bit(wb, pars->update_parameters);
-  else
-    pars->update_parameters = 1;
+
   if (!pars->update_parameters) {
-    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
-    int ref_frame, ref_idx, buf_idx;
+    int ref_frame, ref_idx;
     for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) {
-      ref_idx = get_ref_frame_map_idx(cpi, ref_frame);
+      ref_idx = get_ref_frame_map_idx(cm, ref_frame);
       assert(ref_idx != INVALID_IDX);
-      buf_idx = cm->ref_frame_map[ref_idx];
-      if (frame_bufs[buf_idx].film_grain_params_present &&
-          memcmp(pars, &frame_bufs[buf_idx].film_grain_params, sizeof(*pars))) {
+      const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx];
+      if (buf->film_grain_params_present &&
+          av1_check_grain_params_equiv(pars, &buf->film_grain_params)) {
         break;
       }
     }
@@ -2582,16 +2488,16 @@ static void write_film_grain_params(AV1_COMP *cpi,
     aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8);
   }
 
-  if (!cm->seq_params.monochrome)
+  if (!cm->seq_params.monochrome) {
     aom_wb_write_bit(wb, pars->chroma_scaling_from_luma);
-  else
-    pars->chroma_scaling_from_luma = 0;  // for monochrome override to 0
+  } else {
+    assert(!pars->chroma_scaling_from_luma);
+  }
 
   if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
       ((cm->seq_params.subsampling_x == 1) &&
        (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) {
-    pars->num_cb_points = 0;
-    pars->num_cr_points = 0;
+    assert(pars->num_cb_points == 0 && pars->num_cr_points == 0);
   } else {
     aom_wb_write_literal(wb, pars->num_cb_points, 4);  // max 10
     for (int i = 0; i < pars->num_cb_points; i++) {
@@ -2651,7 +2557,7 @@ static void write_film_grain_params(AV1_COMP *cpi,
   aom_wb_write_bit(wb, pars->clip_to_restricted_range);
 }
 
-static void write_sb_size(SequenceHeader *seq_params,
+static void write_sb_size(const SequenceHeader *const seq_params,
                           struct aom_write_bit_buffer *wb) {
   (void)seq_params;
   (void)wb;
@@ -2662,41 +2568,16 @@ static void write_sb_size(SequenceHeader *seq_params,
   aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0);
 }
 
-static void write_sequence_header(AV1_COMP *cpi,
+static void write_sequence_header(const SequenceHeader *const seq_params,
                                   struct aom_write_bit_buffer *wb) {
-  AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *seq_params = &cm->seq_params;
-
-  int max_frame_width = cpi->oxcf.forced_max_frame_width
-                            ? cpi->oxcf.forced_max_frame_width
-                            : cpi->oxcf.width;
-  int max_frame_height = cpi->oxcf.forced_max_frame_height
-                             ? cpi->oxcf.forced_max_frame_height
-                             : cpi->oxcf.height;
-  // max((int)ceil(log2(max_frame_width)), 1)
-  const int num_bits_width =
-      (max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1;
-  // max((int)ceil(log2(max_frame_height)), 1)
-  const int num_bits_height =
-      (max_frame_height > 1) ? get_msb(max_frame_height - 1) + 1 : 1;
-  assert(num_bits_width <= 16);
-  assert(num_bits_height <= 16);
-
-  seq_params->num_bits_width = num_bits_width;
-  seq_params->num_bits_height = num_bits_height;
-  seq_params->max_frame_width = max_frame_width;
-  seq_params->max_frame_height = max_frame_height;
-
-  aom_wb_write_literal(wb, num_bits_width - 1, 4);
-  aom_wb_write_literal(wb, num_bits_height - 1, 4);
-  aom_wb_write_literal(wb, max_frame_width - 1, num_bits_width);
-  aom_wb_write_literal(wb, max_frame_height - 1, num_bits_height);
-
-  /* Placeholder for actually writing to the bitstream */
-  if (!seq_params->reduced_still_picture_hdr) {
-    seq_params->frame_id_length = FRAME_ID_LENGTH;
-    seq_params->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
+  aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4);
+  aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4);
+  aom_wb_write_literal(wb, seq_params->max_frame_width - 1,
+                       seq_params->num_bits_width);
+  aom_wb_write_literal(wb, seq_params->max_frame_height - 1,
+                       seq_params->num_bits_height);
 
+  if (!seq_params->reduced_still_picture_hdr) {
     aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag);
     if (seq_params->frame_id_numbers_present_flag) {
       // We must always have delta_frame_id_length < frame_id_length,
@@ -2724,7 +2605,7 @@ static void write_sequence_header(AV1_COMP *cpi,
     aom_wb_write_bit(wb, seq_params->order_hint_info.enable_order_hint);
 
     if (seq_params->order_hint_info.enable_order_hint) {
-      aom_wb_write_bit(wb, seq_params->order_hint_info.enable_jnt_comp);
+      aom_wb_write_bit(wb, seq_params->order_hint_info.enable_dist_wtd_comp);
       aom_wb_write_bit(wb, seq_params->order_hint_info.enable_ref_frame_mvs);
     }
     if (seq_params->force_screen_content_tools == 2) {
@@ -2821,7 +2702,7 @@ static void write_global_motion(AV1_COMP *cpi,
     // does not work currently and causes mismatches when resize is on.
     // Fix it before turning the optimization back on.
     /*
-    YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_buffer(cpi, frame);
+    YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cpi, frame);
     if (cpi->source->y_crop_width == ref_buf->y_crop_width &&
         cpi->source->y_crop_height == ref_buf->y_crop_height) {
       write_global_motion_params(&cm->global_motion[frame],
@@ -2842,78 +2723,72 @@ static void write_global_motion(AV1_COMP *cpi,
   }
 }
 
-static void check_frame_refs_short_signaling(AV1_COMP *const cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  if (!cm->frame_refs_short_signaling) return;
-
+static int check_frame_refs_short_signaling(AV1_COMMON *const cm) {
   // Check whether all references are distinct frames.
-  int buf_markers[FRAME_BUFFERS] = { 0 };
+  const RefCntBuffer *seen_bufs[FRAME_BUFFERS] = { NULL };
+  int num_refs = 0;
   for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
-    if (buf_idx != INVALID_IDX) {
-      assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS);
-      buf_markers[buf_idx] = 1;
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    if (buf != NULL) {
+      int seen = 0;
+      for (int i = 0; i < num_refs; i++) {
+        if (seen_bufs[i] == buf) {
+          seen = 1;
+          break;
+        }
+      }
+      if (!seen) seen_bufs[num_refs++] = buf;
     }
   }
 
-  int num_refs = 0;
-  for (int buf_idx = 0; buf_idx < FRAME_BUFFERS; ++buf_idx) {
-    num_refs += buf_markers[buf_idx];
-  }
-
   // We only turn on frame_refs_short_signaling when all references are
   // distinct.
   if (num_refs < INTER_REFS_PER_FRAME) {
     // It indicates that there exist more than one reference frame pointing to
     // the same reference buffer, i.e. two or more references are duplicate.
-    cm->frame_refs_short_signaling = 0;
-    return;
+    return 0;
   }
 
   // Check whether the encoder side ref frame choices are aligned with that to
   // be derived at the decoder side.
-  RefBuffer frame_refs_copy[INTER_REFS_PER_FRAME];
+  int remapped_ref_idx_decoder[REF_FRAMES];
 
-  // Backup the frame refs info
-  memcpy(frame_refs_copy, cm->current_frame.frame_refs,
-         INTER_REFS_PER_FRAME * sizeof(RefBuffer));
-
-  const int lst_map_idx = get_ref_frame_map_idx(cpi, LAST_FRAME);
-  const int gld_map_idx = get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+  const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+  const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
 
   // Set up the frame refs mapping indexes according to the
   // frame_refs_short_signaling policy.
-  av1_set_frame_refs(cm, lst_map_idx, gld_map_idx);
+  av1_set_frame_refs(cm, remapped_ref_idx_decoder, lst_map_idx, gld_map_idx);
 
   // We only turn on frame_refs_short_signaling when the encoder side decision
   // on ref frames is identical to that at the decoder side.
+  int frame_refs_short_signaling = 1;
   for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) {
     // Compare the buffer index between two reference frames indexed
     // respectively by the encoder and the decoder side decisions.
-    if (cm->current_frame.frame_refs[ref_idx].buf !=
-        frame_refs_copy[ref_idx].buf) {
-      cm->frame_refs_short_signaling = 0;
+    RefCntBuffer *ref_frame_buf_new = NULL;
+    if (remapped_ref_idx_decoder[ref_idx] != INVALID_IDX) {
+      ref_frame_buf_new = cm->ref_frame_map[remapped_ref_idx_decoder[ref_idx]];
+    }
+    if (get_ref_frame_buf(cm, LAST_FRAME + ref_idx) != ref_frame_buf_new) {
+      frame_refs_short_signaling = 0;
       break;
     }
   }
 
 #if 0   // For debug
   printf("\nFrame=%d: \n", cm->current_frame.frame_number);
-  printf("***frame_refs_short_signaling=%d\n", cm->frame_refs_short_signaling);
+  printf("***frame_refs_short_signaling=%d\n", frame_refs_short_signaling);
   for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    printf("enc_ref(map_idx=%d, buf_idx=%d)=%d, vs. "
+    printf("enc_ref(map_idx=%d)=%d, vs. "
         "dec_ref(map_idx=%d)=%d\n",
-        get_ref_frame_map_idx(cpi, ref_frame),
-        get_ref_frame_buf_idx(cpi, ref_frame), ref_frame,
-        cm->current_frame.frame_refs[ref_frame - LAST_FRAME].map_idx,
+        get_ref_frame_map_idx(cm, ref_frame), ref_frame,
+        cm->remapped_ref_idx[ref_frame - LAST_FRAME],
         ref_frame);
   }
 #endif  // 0
 
-  // Restore the frame refs info if frame_refs_short_signaling is off.
-  if (!cm->frame_refs_short_signaling)
-    memcpy(cm->current_frame.frame_refs, frame_refs_copy,
-           INTER_REFS_PER_FRAME * sizeof(RefBuffer));
+  return frame_refs_short_signaling;
 }
 
 // New function based on HLS R18
@@ -2925,10 +2800,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   CurrentFrame *const current_frame = &cm->current_frame;
 
-  // NOTE: By default all coded frames to be used as a reference
-  cm->is_reference_frame = 1;
-  current_frame->frame_type =
-      current_frame->intra_only ? INTRA_ONLY_FRAME : current_frame->frame_type;
+  current_frame->frame_refs_short_signaling = 0;
 
   if (seq_params->still_picture) {
     assert(cm->show_existing_frame == 0);
@@ -2937,17 +2809,6 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
   }
   if (!seq_params->reduced_still_picture_hdr) {
     if (encode_show_existing_frame(cm)) {
-      RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
-      const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
-
-      if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Buffer %d does not contain a reconstructed frame",
-                           frame_to_show);
-      }
-      assign_frame_buffer(frame_bufs, &cm->new_fb_idx, frame_to_show);
-      cm->cur_frame = &frame_bufs[cm->new_fb_idx];
-
       aom_wb_write_bit(wb, 1);  // show_existing_frame
       aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
 
@@ -2960,14 +2821,6 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
         int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
         aom_wb_write_literal(wb, display_frame_id, frame_id_len);
       }
-
-      if (cm->reset_decoder_state &&
-          frame_bufs[frame_to_show].frame_type != KEY_FRAME) {
-        aom_internal_error(
-            &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-            "show_existing_frame to reset state on KEY_FRAME only");
-      }
-
       return;
     } else {
       aom_wb_write_bit(wb, 0);  // show_existing_frame
@@ -3008,29 +2861,28 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
     assert(cm->cur_frame_force_integer_mv == 0);
   }
 
-  cm->invalid_delta_frame_id_minus_1 = 0;
   int frame_size_override_flag = 0;
-  cm->frame_refs_short_signaling = 0;
 
   if (seq_params->reduced_still_picture_hdr) {
-    assert(cm->width == seq_params->max_frame_width &&
-           cm->height == seq_params->max_frame_height);
+    assert(cm->superres_upscaled_width == seq_params->max_frame_width &&
+           cm->superres_upscaled_height == seq_params->max_frame_height);
   } else {
     if (seq_params->frame_id_numbers_present_flag) {
       int frame_id_len = seq_params->frame_id_length;
       aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
     }
 
-    if (cm->width > seq_params->max_frame_width ||
-        cm->height > seq_params->max_frame_height) {
+    if (cm->superres_upscaled_width > seq_params->max_frame_width ||
+        cm->superres_upscaled_height > seq_params->max_frame_height) {
       aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Frame dimensions are larger than the maximum values");
     }
 
     frame_size_override_flag =
-        frame_is_sframe(cm) ? 1
-                            : (cm->width != seq_params->max_frame_width ||
-                               cm->height != seq_params->max_frame_height);
+        frame_is_sframe(cm)
+            ? 1
+            : (cm->superres_upscaled_width != seq_params->max_frame_width ||
+               cm->superres_upscaled_height != seq_params->max_frame_height);
     if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag);
 
     if (seq_params->order_hint_info.enable_order_hint)
@@ -3069,70 +2921,21 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
       }
     }
   }
-  cpi->refresh_frame_mask = get_refresh_mask(cpi);
-  if (current_frame->frame_type == KEY_FRAME) {
-    if (!cm->show_frame) {  // unshown keyframe (forward keyframe)
-      aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
-    } else {
-      assert(cpi->refresh_frame_mask == 0xFF);
-    }
-  } else {
-    if (current_frame->frame_type == INTRA_ONLY_FRAME) {
-      assert(cpi->refresh_frame_mask != 0xFF);
-      int updated_fb = -1;
-      for (int i = 0; i < REF_FRAMES; i++) {
-        // If more than one frame is refreshed, it doesn't matter which one
-        // we pick, so pick the first.
-        if (cpi->refresh_frame_mask & (1 << i)) {
-          updated_fb = i;
-          break;
-        }
-      }
-      assert(updated_fb >= 0);
-      cm->fb_of_context_type[cm->frame_context_idx] = updated_fb;
-      aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
-    } else if (current_frame->frame_type == INTER_FRAME ||
-               frame_is_sframe(cm)) {
-      if (current_frame->frame_type == INTER_FRAME) {
-        aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
-      } else {
-        assert(frame_is_sframe(cm) && cpi->refresh_frame_mask == 0xFF);
-      }
-      int updated_fb = -1;
-      for (int i = 0; i < REF_FRAMES; i++) {
-        // If more than one frame is refreshed, it doesn't matter which one
-        // we pick, so pick the first.
-        if (cpi->refresh_frame_mask & (1 << i)) {
-          updated_fb = i;
-          break;
-        }
-      }
-      // large scale tile sometimes won't refresh any fbs
-      if (updated_fb >= 0) {
-        cm->fb_of_context_type[cm->frame_context_idx] = updated_fb;
-      }
 
-      if (!cpi->refresh_frame_mask) {
-        // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
-        //       will not be used as a reference
-        cm->is_reference_frame = 0;
-      }
-    }
-  }
+  // Shown keyframes and switch-frames automatically refreshes all reference
+  // frames.  For all other frame types, we need to write refresh_frame_flags.
+  if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) ||
+      current_frame->frame_type == INTER_FRAME ||
+      current_frame->frame_type == INTRA_ONLY_FRAME)
+    aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES);
 
-  if (!frame_is_intra_only(cm) || cpi->refresh_frame_mask != 0xFF) {
+  if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) {
     // Write all ref frame order hints if error_resilient_mode == 1
     if (cm->error_resilient_mode &&
         seq_params->order_hint_info.enable_order_hint) {
-      RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
       for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
-        // Get buffer index
-        const int buf_idx = cm->ref_frame_map[ref_idx];
-        assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS);
-
-        // Write order hint to bit stream
         aom_wb_write_literal(
-            wb, frame_bufs[buf_idx].order_hint,
+            wb, cm->ref_frame_map[ref_idx]->order_hint,
             seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
       }
     }
@@ -3143,8 +2946,6 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
     assert(!av1_superres_scaled(cm) || !cm->allow_intrabc);
     if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
       aom_wb_write_bit(wb, cm->allow_intrabc);
-    // all eight fbs are refreshed, pick one that will live long enough
-    cm->fb_of_context_type[REGULAR_FRAME] = 0;
   } else {
     if (current_frame->frame_type == INTRA_ONLY_FRAME) {
       write_frame_size(cm, frame_size_override_flag, wb);
@@ -3159,36 +2960,37 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
       //       automatically.
 #define FRAME_REFS_SHORT_SIGNALING 0
 #if FRAME_REFS_SHORT_SIGNALING
-      cm->frame_refs_short_signaling =
+      current_frame->frame_refs_short_signaling =
           seq_params->order_hint_info.enable_order_hint;
 #endif  // FRAME_REFS_SHORT_SIGNALING
 
-      if (cm->frame_refs_short_signaling) {
+      if (current_frame->frame_refs_short_signaling) {
         // NOTE(zoeliu@google.com):
         //   An example solution for encoder-side implementation on frame refs
         //   short signaling, which is only turned on when the encoder side
         //   decision on ref frames is identical to that at the decoder side.
-        check_frame_refs_short_signaling(cpi);
+        current_frame->frame_refs_short_signaling =
+            check_frame_refs_short_signaling(cm);
       }
 
       if (seq_params->order_hint_info.enable_order_hint)
-        aom_wb_write_bit(wb, cm->frame_refs_short_signaling);
+        aom_wb_write_bit(wb, current_frame->frame_refs_short_signaling);
 
-      if (cm->frame_refs_short_signaling) {
-        const int lst_ref = get_ref_frame_map_idx(cpi, LAST_FRAME);
+      if (current_frame->frame_refs_short_signaling) {
+        const int lst_ref = get_ref_frame_map_idx(cm, LAST_FRAME);
         aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2);
 
-        const int gld_ref = get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+        const int gld_ref = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
         aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2);
       }
 
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-        assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
-        if (!cm->frame_refs_short_signaling)
-          aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+        assert(get_ref_frame_map_idx(cm, ref_frame) != INVALID_IDX);
+        if (!current_frame->frame_refs_short_signaling)
+          aom_wb_write_literal(wb, get_ref_frame_map_idx(cm, ref_frame),
                                REF_FRAMES_LOG2);
         if (seq_params->frame_id_numbers_present_flag) {
-          int i = get_ref_frame_map_idx(cpi, ref_frame);
+          int i = get_ref_frame_map_idx(cm, ref_frame);
           int frame_id_len = seq_params->frame_id_length;
           int diff_len = seq_params->delta_frame_id_length;
           int delta_frame_id_minus_1 =
@@ -3197,24 +2999,22 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
                (1 << frame_id_len)) -
               1;
           if (delta_frame_id_minus_1 < 0 ||
-              delta_frame_id_minus_1 >= (1 << diff_len))
-            cm->invalid_delta_frame_id_minus_1 = 1;
+              delta_frame_id_minus_1 >= (1 << diff_len)) {
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+                               "Invalid delta_frame_id_minus_1");
+          }
           aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len);
         }
       }
 
       if (!cm->error_resilient_mode && frame_size_override_flag) {
-        write_frame_size_with_refs(cpi, wb);
+        write_frame_size_with_refs(cm, wb);
       } else {
         write_frame_size(cm, frame_size_override_flag, wb);
       }
 
-      if (cm->cur_frame_force_integer_mv) {
-        cm->allow_high_precision_mv = 0;
-      } else {
+      if (!cm->cur_frame_force_integer_mv)
         aom_wb_write_bit(wb, cm->allow_high_precision_mv);
-      }
-      fix_interp_filter(cm, cpi->td.counts);
       write_frame_interp_filter(cm->interp_filter, wb);
       aom_wb_write_bit(wb, cm->switchable_motion_mode);
       if (frame_might_allow_ref_frame_mvs(cm)) {
@@ -3228,7 +3028,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
   const int might_bwd_adapt =
       !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
   if (cm->large_scale_tile)
-    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+    assert(cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
 
   if (might_bwd_adapt) {
     aom_wb_write_bit(
@@ -3268,9 +3068,13 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
     encode_restoration_mode(cm, wb);
   }
 
-  write_tx_mode(cm, &cm->tx_mode, wb);
+  // Write TX mode
+  if (cm->coded_lossless)
+    assert(cm->tx_mode == ONLY_4X4);
+  else
+    aom_wb_write_bit(wb, cm->tx_mode == TX_MODE_SELECT);
 
-  if (cpi->allow_comp_inter_inter) {
+  if (!frame_is_intra_only(cm)) {
     const int use_hybrid_pred =
         current_frame->reference_mode == REFERENCE_MODE_SELECT;
 
@@ -3290,19 +3094,9 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
   if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
 
   if (seq_params->film_grain_params_present &&
-      (cm->show_frame || cm->showable_frame)) {
-    int flip_back_update_parameters_flag = 0;
-    if (current_frame->frame_type != INTER_FRAME &&
-        cm->film_grain_params.update_parameters == 0) {
-      cm->film_grain_params.update_parameters = 1;
-      flip_back_update_parameters_flag = 1;
-    }
+      (cm->show_frame || cm->showable_frame))
     write_film_grain_params(cpi, wb);
 
-    if (flip_back_update_parameters_flag)
-      cm->film_grain_params.update_parameters = 0;
-  }
-
   if (cm->large_scale_tile) write_ext_tile_info(cm, saved_wb, wb);
 }
 
@@ -3440,8 +3234,12 @@ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
   return wpos;
 }
 
-uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
-                          uint8_t *const dst) {
+uint32_t av1_write_obu_header(AV1_COMP *const cpi, OBU_TYPE obu_type,
+                              int obu_extension, uint8_t *const dst) {
+  if (cpi->keep_level_stats &&
+      (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER))
+    ++cpi->frame_header_count;
+
   struct aom_write_bit_buffer wb = { dst, 0 };
   uint32_t size = 0;
 
@@ -3493,9 +3291,8 @@ static void add_trailing_bits(struct aom_write_bit_buffer *wb) {
   }
 }
 
-static void write_bitstream_level(BitstreamLevel bl,
+static void write_bitstream_level(AV1_LEVEL seq_level_idx,
                                   struct aom_write_bit_buffer *wb) {
-  uint8_t seq_level_idx = major_minor_to_seq_level_idx(bl);
   assert(is_valid_seq_level_idx(seq_level_idx));
   aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS);
 }
@@ -3518,7 +3315,7 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
     assert(cm->timing_info_present == 0);
     assert(cm->seq_params.decoder_model_info_present_flag == 0);
     assert(cm->seq_params.display_model_info_present_flag == 0);
-    write_bitstream_level(cm->seq_params.level[0], &wb);
+    write_bitstream_level(cm->seq_params.seq_level_idx[0], &wb);
   } else {
     aom_wb_write_bit(&wb, cm->timing_info_present);  // timing info present flag
 
@@ -3537,8 +3334,8 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
     for (i = 0; i < cm->seq_params.operating_points_cnt_minus_1 + 1; i++) {
       aom_wb_write_literal(&wb, cm->seq_params.operating_point_idc[i],
                            OP_POINTS_IDC_BITS);
-      write_bitstream_level(cm->seq_params.level[i], &wb);
-      if (cm->seq_params.level[i].major > 3)
+      write_bitstream_level(cm->seq_params.seq_level_idx[i], &wb);
+      if (cm->seq_params.seq_level_idx[i] >= SEQ_LEVEL_4_0)
         aom_wb_write_bit(&wb, cm->seq_params.tier[i]);
       if (cm->seq_params.decoder_model_info_present_flag) {
         aom_wb_write_bit(&wb,
@@ -3557,7 +3354,7 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
       }
     }
   }
-  write_sequence_header(cpi, &wb);
+  write_sequence_header(&cm->seq_params, &wb);
 
   write_color_config(&cm->seq_params, &wb);
 
@@ -3607,11 +3404,13 @@ typedef struct {
 static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
                                        struct aom_write_bit_buffer *saved_wb,
                                        uint8_t obu_extension_header,
-                                       const FrameHeaderInfo *fh_info) {
+                                       const FrameHeaderInfo *fh_info,
+                                       int *const largest_tile_id) {
   AV1_COMMON *const cm = &cpi->common;
   aom_writer mode_bc;
   int tile_row, tile_col;
-  TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
+  // Store the location and size of each tile's data in the bitstream:
+  TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
   uint32_t total_size = 0;
   const int tile_cols = cm->tile_cols;
   const int tile_rows = cm->tile_rows;
@@ -3632,13 +3431,13 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
   const int have_tiles = tile_cols * tile_rows > 1;
   int first_tg = 1;
 
-  cm->largest_tile_id = 0;
+  *largest_tile_id = 0;
 
   if (cm->large_scale_tile) {
     // For large_scale_tile case, we always have only one tile group, so it can
     // be written as an OBU_FRAME.
     const OBU_TYPE obu_type = OBU_FRAME;
-    const uint32_t tg_hdr_size = write_obu_header(obu_type, 0, data);
+    const uint32_t tg_hdr_size = av1_write_obu_header(cpi, obu_type, 0, data);
     data += tg_hdr_size;
 
     const uint32_t frame_header_size =
@@ -3685,8 +3484,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
         // Is CONFIG_EXT_TILE = 1, every tile in the row has a header,
         // even for the last one, unless no tiling is used at all.
         total_size += data_offset;
-        // Initialise tile context from the frame context
-        this_tile->tctx = *cm->fc;
         cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
         mode_bc.allow_update_cdf = !cm->large_scale_tile;
         mode_bc.allow_update_cdf =
@@ -3700,7 +3497,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
         // Record the maximum tile size we see, so we can compact headers later.
         if (tile_size > max_tile_size) {
           max_tile_size = tile_size;
-          cm->largest_tile_id = tile_cols * tile_row + tile_col;
+          *largest_tile_id = tile_cols * tile_row + tile_col;
         }
 
         if (have_tiles) {
@@ -3718,6 +3515,9 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
             const int identical_tile_offset =
                 find_identical_tile(tile_row, tile_col, tile_buffers);
 
+            // Indicate a copy-tile by setting the most significant bit.
+            // The row-offset to copy from is stored in the highest byte.
+            // remux_tiles will move these around later
             if (identical_tile_offset > 0) {
               tile_size = 0;
               tile_header = identical_tile_offset | 0x80;
@@ -3792,7 +3592,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
         const OBU_TYPE obu_type =
             (num_tg_hdrs == 1) ? OBU_FRAME : OBU_TILE_GROUP;
         curr_tg_data_size =
-            write_obu_header(obu_type, obu_extension_header, data);
+            av1_write_obu_header(cpi, obu_type, obu_extension_header, data);
         obu_header_size = curr_tg_data_size;
 
         if (num_tg_hdrs == 1) {
@@ -3823,8 +3623,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
       // The last tile of the tile group does not have a header.
       if (!is_last_tile_in_tg) total_size += 4;
 
-      // Initialise tile context from the frame context
-      this_tile->tctx = *cm->fc;
       cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
       mode_bc.allow_update_cdf = 1;
       mode_bc.allow_update_cdf =
@@ -3841,7 +3639,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
       curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 0 : 4));
       buf->size = tile_size;
       if (tile_size > max_tile_size) {
-        cm->largest_tile_id = tile_cols * tile_row + tile_col;
+        *largest_tile_id = tile_cols * tile_row + tile_col;
         max_tile_size = tile_size;
       }
 
@@ -3876,12 +3674,13 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
           // Force context update tile to be the first tile in error
           // resiliant mode as the duplicate frame headers will have
           // context_update_tile_id set to 0
-          cm->largest_tile_id = 0;
+          *largest_tile_id = 0;
 
           // Rewrite the OBU header to change the OBU type to Redundant Frame
           // Header.
-          write_obu_header(OBU_REDUNDANT_FRAME_HEADER, obu_extension_header,
-                           &data[fh_info->obu_header_byte_offset]);
+          av1_write_obu_header(cpi, OBU_REDUNDANT_FRAME_HEADER,
+                               obu_extension_header,
+                               &data[fh_info->obu_header_byte_offset]);
 
           data += fh_info->total_length;
 
@@ -3899,7 +3698,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
     // Fill in context_update_tile_id indicating the tile to use for the
     // cdf update. The encoder currently sets it to the largest tile
     // (but is up to the encoder)
-    aom_wb_overwrite_literal(saved_wb, cm->largest_tile_id,
+    aom_wb_overwrite_literal(saved_wb, *largest_tile_id,
                              cm->log2_tile_cols + cm->log2_tile_rows);
     // If more than one tile group. tile_size_bytes takes the default value 4
     // and does not need to be set. For a single tile group it is set in the
@@ -3945,7 +3744,8 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
   return total_size;
 }
 
-int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+                       int *const largest_tile_id) {
   uint8_t *data = dst;
   uint32_t data_size;
   AV1_COMMON *const cm = &cpi->common;
@@ -3959,11 +3759,13 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
   bitstream_queue_reset_write();
 #endif
 
+  cpi->frame_header_count = 0;
+
   // The TD is now written outside the frame encode loop
 
   // write sequence header obu if KEY_FRAME, preceded by 4-byte size
   if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) {
-    obu_header_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data);
+    obu_header_size = av1_write_obu_header(cpi, OBU_SEQUENCE_HEADER, 0, data);
 
     obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size);
     const size_t length_field_size =
@@ -3983,7 +3785,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
     // Write Frame Header OBU.
     fh_info.frame_header = data;
     obu_header_size =
-        write_obu_header(OBU_FRAME_HEADER, obu_extension_header, data);
+        av1_write_obu_header(cpi, OBU_FRAME_HEADER, obu_extension_header, data);
     obu_payload_size =
         write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1);
 
@@ -4009,8 +3811,8 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
   } else {
     //  Each tile group obu will be preceded by 4-byte size of the tile group
     //  obu
-    data_size = write_tiles_in_tg_obus(cpi, data, &saved_wb,
-                                       obu_extension_header, &fh_info);
+    data_size = write_tiles_in_tg_obus(
+        cpi, data, &saved_wb, obu_extension_header, &fh_info, largest_tile_id);
   }
   data += data_size;
   *size = data - dst;
diff --git a/libaom/av1/encoder/bitstream.h b/libaom/av1/encoder/bitstream.h
index 465ccae..b05d0d5 100644
--- a/libaom/av1/encoder/bitstream.h
+++ b/libaom/av1/encoder/bitstream.h
@@ -27,18 +27,14 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst);
 
 // Writes the OBU header byte, and the OBU header extension byte when
 // 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
-uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
-                          uint8_t *const dst);
+uint32_t av1_write_obu_header(AV1_COMP *const cpi, OBU_TYPE obu_type,
+                              int obu_extension, uint8_t *const dst);
 
 int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size,
                         uint8_t *dest);
 
-int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
-
-static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) {
-  // Do not swap gf and arf indices for internal overlay frames
-  return cpi->rc.is_src_frame_alt_ref && !cpi->rc.is_src_frame_ext_arf;
-}
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+                       int *const largest_tile_id);
 
 void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
                        int blk_row, int blk_col, int plane, TX_SIZE tx_size,
diff --git a/libaom/av1/encoder/block.h b/libaom/av1/encoder/block.h
index 1b04519..96b0991 100644
--- a/libaom/av1/encoder/block.h
+++ b/libaom/av1/encoder/block.h
@@ -54,10 +54,10 @@ typedef struct macroblock_plane {
 typedef struct {
   int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
   int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
-  int base_cost[SIG_COEF_CONTEXTS][4];
+  int base_cost[SIG_COEF_CONTEXTS][8];
   int eob_extra_cost[EOB_COEF_CONTEXTS][2];
   int dc_sign_cost[DC_SIGN_CONTEXTS][2];
-  int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1];
+  int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1];
 } LV_MAP_COEFF_COST;
 
 typedef struct {
@@ -74,16 +74,13 @@ typedef struct {
 } CB_COEFF_BUFFER;
 
 typedef struct {
-  int16_t mode_context[MODE_CTX_REF_FRAMES];
   // TODO(angiebird): Reduce the buffer size according to sb_type
-  tran_low_t *tcoeff[MAX_MB_PLANE];
-  uint16_t *eobs[MAX_MB_PLANE];
-  uint8_t *txb_skip_ctx[MAX_MB_PLANE];
-  int *dc_sign_ctx[MAX_MB_PLANE];
-  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+  CB_COEFF_BUFFER *cb_coef_buff;
   CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
   int_mv global_mvs[REF_FRAMES];
-  int16_t compound_mode_context[MODE_CTX_REF_FRAMES];
+  int cb_offset;
+  int16_t mode_context[MODE_CTX_REF_FRAMES];
+  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
 } MB_MODE_INFO_EXT;
 
 typedef struct {
@@ -156,7 +153,7 @@ typedef struct {
 
 // Region size for mode decision sampling in the first pass of partition
 // search(two_pass_partition_search speed feature), in units of mi size(4).
-// Used by the mode_pruning_based_on_two_pass_partition_search speed feature.
+// Used by the mode pruning in two_pass_partition_search feature.
 #define FIRST_PARTITION_PASS_SAMPLE_REGION 8
 #define FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2 3
 #define FIRST_PARTITION_PASS_STATS_TABLES                     \
@@ -177,6 +174,8 @@ typedef struct {
   uint8_t ref0_counts[REF_FRAMES];  // Counters for ref_frame[0].
   uint8_t ref1_counts[REF_FRAMES];  // Counters for ref_frame[1].
   int sample_counts;                // Number of samples collected.
+  uint8_t interintra_motion_mode_count[REF_FRAMES];  // Counter for interintra
+                                                     // motion mode
 } FIRST_PARTITION_PASS_STATS;
 
 #define MAX_INTERP_FILTER_STATS 64
@@ -185,11 +184,26 @@ typedef struct {
   int_mv mv[2];
   int8_t ref_frames[2];
   COMPOUND_TYPE comp_type;
+  int64_t rd;
+  int skip_txfm_sb;
+  int64_t skip_sse_sb;
+  unsigned int pred_sse;
 } INTERPOLATION_FILTER_STATS;
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+#define MAX_COMP_RD_STATS 64
+typedef struct {
+  int32_t rate[COMPOUND_TYPES];
+  int64_t dist[COMPOUND_TYPES];
+  int64_t comp_model_rd[COMPOUND_TYPES];
+  int_mv mv[2];
+  MV_REFERENCE_FRAME ref_frames[2];
+  PREDICTION_MODE mode;
+  InterpFilters filter;
+  int ref_mv_idx;
+  int is_global[2];
+} COMP_RD_STATS;
+
 struct inter_modes_info;
-#endif
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
   struct macroblock_plane plane[MAX_MB_PLANE];
@@ -251,6 +265,9 @@ struct macroblock {
   int *ex_search_count_ptr;
 
   unsigned int txb_split_count;
+#if CONFIG_SPEED_STATS
+  unsigned int tx_search_count;
+#endif  // CONFIG_SPEED_STATS
 
   // These are set to their default values at the beginning, and then adjusted
   // further in the encoding process.
@@ -259,6 +276,7 @@ struct macroblock {
 
   unsigned int max_mv_context[REF_FRAMES];
   unsigned int source_variance;
+  unsigned int simple_motion_pred_sse;
   unsigned int pred_sse[REF_FRAMES];
   int pred_mv_sad[REF_FRAMES];
 
@@ -277,7 +295,7 @@ struct macroblock {
   CONV_BUF_TYPE *tmp_conv_dst;
   uint8_t *tmp_obmc_bufs[2];
 
-  FRAME_CONTEXT *backup_tile_ctx;
+  FRAME_CONTEXT *row_ctx;
   // This context will be used to update color_map_cdf pointer which would be
   // used during pack bitstream. For single thread and tile-multithreading case
   // this ponter will be same as xd->tile_ctx, but for the case of row-mt:
@@ -285,9 +303,7 @@ struct macroblock {
   // to the accurate tile context.
   FRAME_CONTEXT *tile_pb_ctx;
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
   struct inter_modes_info *inter_modes_info;
-#endif
 
   // buffer for hash value calculation of a block
   // used only in av1_get_block_hash_value()
@@ -340,7 +356,7 @@ struct macroblock {
   // BWDREF_FRAME) in bidir-comp mode.
   int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2];
   int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
-  int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1];
+  int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
   int wedge_idx_cost[BLOCK_SIZES_ALL][16];
   int interintra_cost[BLOCK_SIZE_GROUPS][2];
   int wedge_interintra_cost[BLOCK_SIZES_ALL][2];
@@ -385,6 +401,11 @@ struct macroblock {
   // Store the fractional best motion vector during sub/Qpel-pixel motion search
   int_mv fractional_best_mv[3];
 
+  // Ref frames that are selected by square partition blocks within a super-
+  // block, in MI resolution. They can be used to prune ref frames for
+  // rectangular blocks.
+  int picked_ref_frames_mask[32 * 32];
+
   // use default transform and skip transform type search for intra modes
   int use_default_intra_tx_type;
   // use default transform and skip transform type search for inter modes
@@ -405,6 +426,13 @@ struct macroblock {
   // detection). For reference, 556 is the value returned for a solid
   // vertical black/white edge.
   uint16_t edge_strength;
+  // The strongest edge strength seen along the x/y axis.
+  uint16_t edge_strength_x;
+  uint16_t edge_strength_y;
+
+  // [Saved stat index]
+  COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS];
+  int comp_rd_stats_idx;
 };
 
 static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
diff --git a/libaom/av1/encoder/context_tree.h b/libaom/av1/encoder/context_tree.h
index cde3f2b..205ac8a 100644
--- a/libaom/av1/encoder/context_tree.h
+++ b/libaom/av1/encoder/context_tree.h
@@ -23,7 +23,7 @@ struct AV1_COMP;
 struct AV1Common;
 struct ThreadData;
 
-typedef enum {
+enum {
   // Search all the partition types in this plane.
   SEARCH_FULL_PLANE = 0,
   // Only search none_partition coding block.
@@ -32,12 +32,14 @@ typedef enum {
   SEARCH_SAME_PLANE = 2,
   // Skip search partition on this plane. Go split directly.
   SPLIT_PLANE = 3,
-} CB_TREE_SEARCH;
+} UENUM1BYTE(CB_TREE_SEARCH);
 
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
   MB_MODE_INFO mic;
   MB_MODE_INFO_EXT mbmi_ext;
+  int64_t dist;
+  int64_t rdcost;
   uint8_t *color_index_map[2];
   uint8_t *blk_skip;
 
@@ -56,51 +58,32 @@ typedef struct {
   int hybrid_pred_diff;
   int comp_pred_diff;
   int single_pred_diff;
-  // Skip certain ref frames during RD search of rectangular partitions.
-  int skip_ref_frame_mask;
 
   // TODO(jingning) Use RD_COST struct here instead. This involves a boarder
   // scope of refactoring.
   int rate;
-  int64_t dist;
-  int64_t rdcost;
+
   int rd_mode_is_ready;  // Flag to indicate whether rd pick mode decision has
                          // been made.
 
-#if CONFIG_ONE_PASS_SVM
-  // Features for one pass svm early term
-  int seg_feat;
-#endif
-
   // motion vector cache for adaptive motion search control in partition
   // search loop
   MV pred_mv[REF_FRAMES];
   InterpFilter pred_interp_filter;
   PARTITION_TYPE partition;
-
-  // Reference and prediction mode cache for ref/mode speedup
-  // TODO(zoeliu@gmail.com): The values of ref_selected and mode_selected will
-  // be explored for further encoder speedup, to differentiate this approach for
-  // setting skip_ref_frame_mask from others. For instance, it is possible that
-  // the underlying square block(s) share the same SIMPLE_TRANSLATION motion
-  // mode as well as the mode of GLOBALMV, more ref/mode combos could be
-  // skipped.
-  MV_REFERENCE_FRAME ref_selected[2];
-  int mode_selected;
 } PICK_MODE_CONTEXT;
 
 typedef struct {
+  int64_t rdcost;
+  int64_t sub_block_rdcost[4];
   int valid;
   int split;
-  int skip;
-  int64_t rdcost;
   int sub_block_split[4];
   int sub_block_skip[4];
-  int64_t sub_block_rdcost[4];
+  int skip;
 } PC_TREE_STATS;
 
 typedef struct PC_TREE {
-  int index;
   PARTITION_TYPE partitioning;
   BLOCK_SIZE block_size;
   PICK_MODE_CONTEXT none;
@@ -112,9 +95,11 @@ typedef struct PC_TREE {
   PICK_MODE_CONTEXT verticalb[3];
   PICK_MODE_CONTEXT horizontal4[4];
   PICK_MODE_CONTEXT vertical4[4];
-  CB_TREE_SEARCH cb_search_range;
   struct PC_TREE *split[4];
   PC_TREE_STATS pc_tree_stats;
+  CB_TREE_SEARCH cb_search_range;
+  int index;
+  MV mv_ref_fulls[REF_FRAMES];
 } PC_TREE;
 
 void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td);
diff --git a/libaom/av1/encoder/cost.h b/libaom/av1/encoder/cost.h
index af5b098..be0241a 100644
--- a/libaom/av1/encoder/cost.h
+++ b/libaom/av1/encoder/cost.h
@@ -30,6 +30,10 @@ extern const uint16_t av1_prob_cost[128];
 
 // Calculate the cost of a symbol with probability p15 / 2^15
 static INLINE int av1_cost_symbol(aom_cdf_prob p15) {
+  // p15 can be out of range [1, CDF_PROB_TOP - 1]. Clamping it, so that the
+  // following cost calculation works correctly. Otherwise, if p15 =
+  // CDF_PROB_TOP, shift would be -1, and "p15 << shift" would be wrong.
+  p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1);
   assert(0 < p15 && p15 < CDF_PROB_TOP);
   const int shift = CDF_PROB_BITS - 1 - get_msb(p15);
   const int prob = get_prob(p15 << shift, CDF_PROB_TOP);
diff --git a/libaom/av1/encoder/encode_strategy.c b/libaom/av1/encoder/encode_strategy.c
new file mode 100644
index 0000000..e9d6ee7
--- /dev/null
+++ b/libaom/av1/encoder/encode_strategy.c
@@ -0,0 +1,1173 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/onyxc_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/tpl_model.h"
+
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  EncodeFrameParams *const frame_params,
+                                  const FRAME_UPDATE_TYPE type,
+                                  int force_refresh_all) {
+  // NOTE(weitinglin): Should we define another function to take care of
+  // cpi->rc.is_$Source_Type to make this function as it is in the comment?
+
+  cpi->rc.is_src_frame_alt_ref = 0;
+  cpi->rc.is_src_frame_internal_arf = 0;
+
+  switch (type) {
+    case KF_UPDATE:
+      frame_params->refresh_last_frame = 1;
+      frame_params->refresh_golden_frame = 1;
+      frame_params->refresh_bwd_ref_frame = 1;
+      frame_params->refresh_alt2_ref_frame = 1;
+      frame_params->refresh_alt_ref_frame = 1;
+      break;
+
+    case LF_UPDATE:
+      frame_params->refresh_last_frame = 1;
+      frame_params->refresh_golden_frame = 0;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt2_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+      break;
+
+    case GF_UPDATE:
+      // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is
+      //               needed.
+      frame_params->refresh_last_frame = 1;
+      frame_params->refresh_golden_frame = 1;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt2_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+      break;
+
+    case OVERLAY_UPDATE:
+      frame_params->refresh_last_frame = 0;
+      frame_params->refresh_golden_frame = 1;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt2_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+
+      cpi->rc.is_src_frame_alt_ref = 1;
+      break;
+
+    case ARF_UPDATE:
+      frame_params->refresh_last_frame = 0;
+      frame_params->refresh_golden_frame = 0;
+      // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt2_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 1;
+      break;
+
+    case INTNL_OVERLAY_UPDATE:
+      frame_params->refresh_last_frame = 1;
+      frame_params->refresh_golden_frame = 0;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt2_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+
+      cpi->rc.is_src_frame_alt_ref = 1;
+      cpi->rc.is_src_frame_internal_arf = 1;
+      break;
+
+    case INTNL_ARF_UPDATE:
+      frame_params->refresh_last_frame = 0;
+      frame_params->refresh_golden_frame = 0;
+      if (cpi->oxcf.pass == 2) {
+        frame_params->refresh_bwd_ref_frame = 1;
+        frame_params->refresh_alt2_ref_frame = 0;
+      } else {
+        frame_params->refresh_bwd_ref_frame = 0;
+        frame_params->refresh_alt2_ref_frame = 1;
+      }
+      frame_params->refresh_alt_ref_frame = 0;
+      break;
+
+    default: assert(0); break;
+  }
+
+  if (cpi->ext_refresh_frame_flags_pending &&
+      (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2)) {
+    frame_params->refresh_last_frame = cpi->ext_refresh_last_frame;
+    frame_params->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+    frame_params->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+    frame_params->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame;
+    frame_params->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame;
+  }
+
+  if (force_refresh_all) {
+    frame_params->refresh_last_frame = 1;
+    frame_params->refresh_golden_frame = 1;
+    frame_params->refresh_bwd_ref_frame = 1;
+    frame_params->refresh_alt2_ref_frame = 1;
+    frame_params->refresh_alt_ref_frame = 1;
+  }
+}
+
+static void set_additional_frame_flags(const AV1_COMMON *const cm,
+                                       unsigned int *const frame_flags) {
+  if (frame_is_intra_only(cm)) *frame_flags |= FRAMEFLAGS_INTRAONLY;
+  if (frame_is_sframe(cm)) *frame_flags |= FRAMEFLAGS_SWITCH;
+  if (cm->error_resilient_mode) *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
+}
+
+static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+  if (cpi->common.show_frame) {
+    if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+        cpi->common.current_frame.frame_type == KEY_FRAME) {
+      // If this is a show_existing_frame with a source other than altref,
+      // or if it is not a displayed forward keyframe, the keyframe update
+      // counters were incremented when it was originally encoded.
+      cpi->rc.frames_since_key++;
+      cpi->rc.frames_to_key--;
+    }
+  }
+}
+
+static INLINE int is_frame_droppable(const AV1_COMP *const cpi) {
+  return !(cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
+           cpi->refresh_bwd_ref_frame || cpi->refresh_golden_frame ||
+           cpi->refresh_last_frame);
+}
+
+static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is drop.
+  // We should fix the cpi->common.show_frame flag
+  // instead of checking the other condition to update the counter properly.
+  if (cpi->common.show_frame || is_frame_droppable(cpi)) {
+    // Decrement count down till next gf
+    if (cpi->rc.frames_till_gf_update_due > 0)
+      cpi->rc.frames_till_gf_update_due--;
+  }
+}
+
+static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) {
+  // Increment the gf group index ready for the next frame. If this is
+  // a show_existing_frame with a source other than altref, or if it is not
+  // a displayed forward keyframe, the index was incremented when it was
+  // originally encoded.
+  if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+      cpi->common.current_frame.frame_type == KEY_FRAME) {
+    ++cpi->twopass.gf_group.index;
+  }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+  update_keyframe_counters(cpi);
+  update_frames_till_gf_update(cpi);
+  if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi);
+}
+
+static void check_show_existing_frame(AV1_COMP *const cpi,
+                                      EncodeFrameParams *const frame_params) {
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  AV1_COMMON *const cm = &cpi->common;
+  const FRAME_UPDATE_TYPE frame_update_type =
+      gf_group->update_type[gf_group->index];
+  const int which_arf = (gf_group->arf_update_idx[gf_group->index] > 0);
+
+  if (cm->show_existing_frame == 1) {
+    frame_params->show_existing_frame = 0;
+  } else if (cpi->is_arf_filter_off[which_arf] &&
+             (frame_update_type == OVERLAY_UPDATE ||
+              frame_update_type == INTNL_OVERLAY_UPDATE)) {
+    // Other parameters related to OVERLAY_UPDATE will be taken care of
+    // in av1_get_second_pass_params(cpi)
+    frame_params->show_existing_frame = 1;
+    frame_params->existing_fb_idx_to_show =
+        (frame_update_type == OVERLAY_UPDATE)
+            ? get_ref_frame_map_idx(cm, ALTREF_FRAME)
+            : get_ref_frame_map_idx(cm, BWDREF_FRAME);
+  }
+}
+
+static void set_ext_overrides(AV1_COMP *const cpi,
+                              EncodeFrameParams *const frame_params) {
+  // Overrides the defaults with the externally supplied values with
+  // av1_update_reference() and av1_update_entropy() calls
+  // Note: The overrides are valid only for the next frame passed
+  // to av1_encode_lowlevel()
+
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (cpi->ext_use_s_frame) {
+    frame_params->frame_type = S_FRAME;
+  }
+
+  if (cpi->ext_refresh_frame_context_pending) {
+    cm->refresh_frame_context = cpi->ext_refresh_frame_context;
+    cpi->ext_refresh_frame_context_pending = 0;
+  }
+  cm->allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs;
+
+  frame_params->error_resilient_mode = cpi->ext_use_error_resilient;
+  // A keyframe is already error resilient and keyframes with
+  // error_resilient_mode interferes with the use of show_existing_frame
+  // when forward reference keyframes are enabled.
+  frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME;
+  // For bitstream conformance, s-frames must be error-resilient
+  frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME;
+}
+
+static int get_ref_frame_flags(const AV1_COMP *const cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  const RefCntBuffer *last_buf = get_ref_frame_buf(cm, LAST_FRAME);
+  const RefCntBuffer *last2_buf = get_ref_frame_buf(cm, LAST2_FRAME);
+  const RefCntBuffer *last3_buf = get_ref_frame_buf(cm, LAST3_FRAME);
+  const RefCntBuffer *golden_buf = get_ref_frame_buf(cm, GOLDEN_FRAME);
+  const RefCntBuffer *bwd_buf = get_ref_frame_buf(cm, BWDREF_FRAME);
+  const RefCntBuffer *alt2_buf = get_ref_frame_buf(cm, ALTREF2_FRAME);
+  const RefCntBuffer *alt_buf = get_ref_frame_buf(cm, ALTREF_FRAME);
+
+  // No.1 Priority: LAST_FRAME
+  const int last2_is_last = (last2_buf == last_buf);
+  const int last3_is_last = (last3_buf == last_buf);
+  const int gld_is_last = (golden_buf == last_buf);
+  const int bwd_is_last = (bwd_buf == last_buf);
+  const int alt2_is_last = (alt2_buf == last_buf);
+  const int alt_is_last = (alt_buf == last_buf);
+
+  // No.2 Priority: ALTREF_FRAME
+  const int last2_is_alt = (last2_buf == alt_buf);
+  const int last3_is_alt = (last3_buf == alt_buf);
+  const int gld_is_alt = (golden_buf == alt_buf);
+  const int bwd_is_alt = (bwd_buf == alt_buf);
+  const int alt2_is_alt = (alt2_buf == alt_buf);
+
+  // No.3 Priority: LAST2_FRAME
+  const int last3_is_last2 = (last3_buf == last2_buf);
+  const int gld_is_last2 = (golden_buf == last2_buf);
+  const int bwd_is_last2 = (bwd_buf == last2_buf);
+  const int alt2_is_last2 = (alt2_buf == last2_buf);
+
+  // No.4 Priority: LAST3_FRAME
+  const int gld_is_last3 = (golden_buf == last3_buf);
+  const int bwd_is_last3 = (bwd_buf == last3_buf);
+  const int alt2_is_last3 = (alt2_buf == last3_buf);
+
+  // No.5 Priority: GOLDEN_FRAME
+  const int bwd_is_gld = (bwd_buf == golden_buf);
+  const int alt2_is_gld = (alt2_buf == golden_buf);
+
+  // No.6 Priority: BWDREF_FRAME
+  const int alt2_is_bwd = (alt2_buf == bwd_buf);
+
+  // No.7 Priority: ALTREF2_FRAME
+
+  // cpi->ext_ref_frame_flags allows certain reference types to be disabled
+  // by the external interface.  These are set by av1_apply_encoding_flags().
+  // Start with what the external interface allows, then suppress any reference
+  // types which we have found to be duplicates.
+
+  int flags = cpi->ext_ref_frame_flags;
+
+  if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG;
+
+  if (alt_is_last) flags &= ~AOM_ALT_FLAG;
+
+  if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG;
+
+  if (last3_is_last || last3_is_alt || last3_is_last2) flags &= ~AOM_LAST3_FLAG;
+
+  if (gld_is_last || gld_is_alt || gld_is_last2 || gld_is_last3)
+    flags &= ~AOM_GOLD_FLAG;
+
+  if ((bwd_is_last || bwd_is_alt || bwd_is_last2 || bwd_is_last3 || bwd_is_gld))
+    flags &= ~AOM_BWD_FLAG;
+
+  if ((alt2_is_last || alt2_is_alt || alt2_is_last2 || alt2_is_last3 ||
+       alt2_is_gld || alt2_is_bwd))
+    flags &= ~AOM_ALT2_FLAG;
+
+  return flags;
+}
+
+static int get_current_frame_ref_type(
+    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  // We choose the reference "type" of this frame from the flags which indicate
+  // which reference frames will be refreshed by it.  More than one of these
+  // flags may be set, so the order here implies an order of precedence.
+  // This is just used to choose the primary_ref_frame (as the most recent
+  // reference buffer of the same reference-type as the current frame)
+
+  const int intra_only = frame_params->frame_type == KEY_FRAME ||
+                         frame_params->frame_type == INTRA_ONLY_FRAME;
+  if (intra_only || frame_params->error_resilient_mode ||
+      cpi->ext_use_primary_ref_none)
+    return REGULAR_FRAME;
+  else if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE)
+    return INTERNAL_ARF_FRAME;
+  else if (frame_params->refresh_alt_ref_frame)
+    return ARF_FRAME;
+  else if (cpi->rc.is_src_frame_alt_ref)
+    return OVERLAY_FRAME;
+  else if (frame_params->refresh_golden_frame)
+    return GLD_FRAME;
+  else if (frame_params->refresh_bwd_ref_frame)
+    return BRF_FRAME;
+  else
+    return REGULAR_FRAME;
+}
+
+static int choose_primary_ref_frame(
+    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  const int intra_only = frame_params->frame_type == KEY_FRAME ||
+                         frame_params->frame_type == INTRA_ONLY_FRAME;
+  if (intra_only || frame_params->error_resilient_mode ||
+      cpi->ext_use_primary_ref_none) {
+    return PRIMARY_REF_NONE;
+  }
+
+  // Find the most recent reference frame with the same reference type as the
+  // current frame
+  const FRAME_CONTEXT_INDEX current_ref_type =
+      get_current_frame_ref_type(cpi, frame_params);
+  int wanted_fb = cpi->fb_of_context_type[current_ref_type];
+
+  int primary_ref_frame = PRIMARY_REF_NONE;
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) {
+      primary_ref_frame = ref_frame - LAST_FRAME;
+    }
+  }
+  return primary_ref_frame;
+}
+
+static void update_fb_of_context_type(
+    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+    int *const fb_of_context_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+      cpi->ext_use_primary_ref_none) {
+    for (int i = 0; i < REF_FRAMES; i++) {
+      fb_of_context_type[i] = -1;
+    }
+    fb_of_context_type[REGULAR_FRAME] =
+        cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
+                       : get_ref_frame_map_idx(cm, ALTREF_FRAME);
+  }
+
+  if (!encode_show_existing_frame(cm)) {
+    // Refresh fb_of_context_type[]: see encoder.h for explanation
+    if (cm->current_frame.frame_type == KEY_FRAME) {
+      // All ref frames are refreshed, pick one that will live long enough
+      fb_of_context_type[REGULAR_FRAME] = 0;
+    } else {
+      // If more than one frame is refreshed, it doesn't matter which one we
+      // pick so pick the first.  LST sometimes doesn't refresh any: this is ok
+      const int current_frame_ref_type =
+          get_current_frame_ref_type(cpi, frame_params);
+      for (int i = 0; i < REF_FRAMES; i++) {
+        if (cm->current_frame.refresh_frame_flags & (1 << i)) {
+          fb_of_context_type[current_frame_ref_type] = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+static int get_order_offset(const GF_GROUP *const gf_group,
+                            const EncodeFrameParams *const frame_params) {
+  // shown frame by definition has order offset 0
+  // show_existing_frame ignores order_offset and simply takes the order_hint
+  // from the reference frame being shown.
+  if (frame_params->show_frame || frame_params->show_existing_frame) return 0;
+
+  const int arf_offset =
+      AOMMIN((MAX_GF_INTERVAL - 1), gf_group->arf_src_offset[gf_group->index]);
+  return AOMMIN((MAX_GF_INTERVAL - 1), arf_offset);
+}
+
+static void adjust_frame_rate(AV1_COMP *cpi,
+                              const struct lookahead_entry *source) {
+  int64_t this_duration;
+  int step = 0;
+
+  // Clear down mmx registers
+  aom_clear_system_state();
+
+  if (source->ts_start == cpi->first_time_stamp_ever) {
+    this_duration = source->ts_end - source->ts_start;
+    step = 1;
+  } else {
+    int64_t last_duration =
+        cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+
+    this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
+
+    // do a step update if the duration changes by 10%
+    if (last_duration)
+      step = (int)((this_duration - last_duration) * 10 / last_duration);
+  }
+
+  if (this_duration) {
+    if (step) {
+      av1_new_framerate(cpi, 10000000.0 / this_duration);
+    } else {
+      // Average this frame's rate into the last second's average
+      // frame rate. If we haven't seen 1 second yet, then average
+      // over the whole interval seen.
+      const double interval = AOMMIN(
+          (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
+      double avg_duration = 10000000.0 / cpi->framerate;
+      avg_duration *= (interval - avg_duration + this_duration);
+      avg_duration /= interval;
+
+      av1_new_framerate(cpi, 10000000.0 / avg_duration);
+    }
+  }
+  cpi->last_time_stamp_seen = source->ts_start;
+  cpi->last_end_time_stamp_seen = source->ts_end;
+}
+
+// If this is an alt-ref, returns the offset of the source frame used
+// as the arf midpoint. Otherwise, returns 0.
+static int get_arf_src_index(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int arf_src_index = 0;
+  if (cpi->oxcf.pass == 2) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+      assert(is_altref_enabled(cpi));
+      arf_src_index = gf_group->arf_src_offset[gf_group->index];
+    }
+  } else if (rc->source_alt_ref_pending) {
+    arf_src_index = rc->frames_till_gf_update_due;
+  }
+  return arf_src_index;
+}
+
+// If this is an internal alt-ref, returns the offset of the source frame used
+// as the internal arf midpoint. Otherwise, returns 0.
+static int get_internal_arf_src_index(AV1_COMP *cpi) {
+  int internal_arf_src_index = 0;
+  if (cpi->oxcf.pass == 2) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+      assert(is_altref_enabled(cpi) && cpi->internal_altref_allowed);
+      internal_arf_src_index = gf_group->arf_src_offset[gf_group->index];
+    }
+  }
+  return internal_arf_src_index;
+}
+
+// Called if this frame is an ARF or ARF2. Also handles forward-keyframes
+// For an ARF set arf2=0, for ARF2 set arf2=1
+// temporal_filtered is set to 1 if we temporally filter the ARF frame, so that
+// the correct post-filter buffer can be used.
+static struct lookahead_entry *setup_arf_or_arf2(
+    AV1_COMP *const cpi, const int arf_src_index, const int arf2,
+    int *temporal_filtered, EncodeFrameParams *const frame_params) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  assert(arf_src_index <= rc->frames_to_key);
+  *temporal_filtered = 0;
+
+  struct lookahead_entry *source =
+      av1_lookahead_peek(cpi->lookahead, arf_src_index);
+
+  if (source != NULL) {
+    cm->showable_frame = 1;
+    cpi->alt_ref_source = source;
+
+    // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
+    if (!arf2 && arf_src_index == rc->frames_to_key) {
+      // Skip temporal filtering and mark as intra_only if we have a fwd_kf
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      int which_arf = gf_group->arf_update_idx[gf_group->index];
+      cpi->is_arf_filter_off[which_arf] = 1;
+      cpi->no_show_kf = 1;
+    } else {
+      if (oxcf->arnr_max_frames > 0) {
+        // Produce the filtered ARF frame.
+        av1_temporal_filter(cpi, arf_src_index);
+        aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
+        *temporal_filtered = 1;
+      }
+    }
+    frame_params->show_frame = 0;
+  }
+  rc->source_alt_ref_pending = 0;
+  return source;
+}
+
+// Determine whether there is a forced keyframe pending in the lookahead buffer
+static int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+                                      const int up_to_index) {
+  for (int i = 0; i <= up_to_index; i++) {
+    const struct lookahead_entry *e = av1_lookahead_peek(lookahead, i);
+    if (e == NULL) {
+      // We have reached the end of the lookahead buffer and not early-returned
+      // so there isn't a forced key-frame pending.
+      return 0;
+    } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+      return 1;
+    } else {
+      continue;
+    }
+  }
+  return 0;  // Never reached
+}
+
+// Check if we should encode an ARF or internal ARF.  If not, try a LAST
+// Do some setup associated with the chosen source
+// temporal_filtered, flush, and frame_update_type are outputs.
+// Return the frame source, or NULL if we couldn't find one
+struct lookahead_entry *choose_frame_source(
+    AV1_COMP *const cpi, int *const temporal_filtered, int *const flush,
+    struct lookahead_entry **last_source, FRAME_UPDATE_TYPE *frame_update_type,
+    EncodeFrameParams *const frame_params) {
+  AV1_COMMON *const cm = &cpi->common;
+  struct lookahead_entry *source = NULL;
+  *temporal_filtered = 0;
+
+  // Should we encode an alt-ref frame.
+  int arf_src_index = get_arf_src_index(cpi);
+  if (arf_src_index &&
+      is_forced_keyframe_pending(cpi->lookahead, arf_src_index)) {
+    arf_src_index = 0;
+    *flush = 1;
+  }
+
+  if (arf_src_index) {
+    source = setup_arf_or_arf2(cpi, arf_src_index, 0, temporal_filtered,
+                               frame_params);
+    *frame_update_type = ARF_UPDATE;
+  }
+
+  // Should we encode an internal Alt-ref frame (mutually exclusive to ARF)
+  arf_src_index = get_internal_arf_src_index(cpi);
+  if (arf_src_index &&
+      is_forced_keyframe_pending(cpi->lookahead, arf_src_index)) {
+    arf_src_index = 0;
+    *flush = 1;
+  }
+
+  if (arf_src_index) {
+    source = setup_arf_or_arf2(cpi, arf_src_index, 1, temporal_filtered,
+                               frame_params);
+    *frame_update_type = INTNL_ARF_UPDATE;
+  }
+
+  if (!source) {
+    // Get last frame source.
+    if (cm->current_frame.frame_number > 0) {
+      *last_source = av1_lookahead_peek(cpi->lookahead, -1);
+    }
+    // Read in the source frame.
+    source = av1_lookahead_pop(cpi->lookahead, *flush);
+    if (source == NULL) return NULL;
+    *frame_update_type = LF_UPDATE;  // Default update type
+    frame_params->show_frame = 1;
+
+    // Check to see if the frame should be encoded as an arf overlay.
+    if (cpi->alt_ref_source == source) {
+      *frame_update_type = OVERLAY_UPDATE;
+      cpi->alt_ref_source = NULL;
+    }
+  }
+  return source;
+}
+
+// Don't allow a show_existing_frame to coincide with an error resilient or
+// S-Frame. An exception can be made in the case of a keyframe, since it does
+// not depend on any previous frames.
+static int allow_show_existing(const AV1_COMP *const cpi,
+                               unsigned int frame_flags) {
+  if (cpi->common.current_frame.frame_number == 0) return 0;
+
+  const struct lookahead_entry *lookahead_src =
+      av1_lookahead_peek(cpi->lookahead, 0);
+  if (lookahead_src == NULL) return 1;
+
+  const int is_error_resilient =
+      cpi->oxcf.error_resilient_mode ||
+      (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
+  const int is_s_frame =
+      cpi->oxcf.s_frame_mode || (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
+  const int is_key_frame =
+      (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY);
+  return !(is_error_resilient || is_s_frame) || is_key_frame;
+}
+
+// Update frame_flags to tell the encoder's caller what sort of frame was
+// encoded.
+static void update_frame_flags(AV1_COMP *cpi, unsigned int *frame_flags) {
+  if (encode_show_existing_frame(&cpi->common)) {
+    *frame_flags &= ~FRAMEFLAGS_GOLDEN;
+    *frame_flags &= ~FRAMEFLAGS_BWDREF;
+    *frame_flags &= ~FRAMEFLAGS_ALTREF;
+    *frame_flags &= ~FRAMEFLAGS_KEY;
+    return;
+  }
+
+  if (cpi->refresh_golden_frame == 1) {
+    *frame_flags |= FRAMEFLAGS_GOLDEN;
+  } else {
+    *frame_flags &= ~FRAMEFLAGS_GOLDEN;
+  }
+
+  if (cpi->refresh_alt_ref_frame == 1) {
+    *frame_flags |= FRAMEFLAGS_ALTREF;
+  } else {
+    *frame_flags &= ~FRAMEFLAGS_ALTREF;
+  }
+
+  if (cpi->refresh_bwd_ref_frame == 1) {
+    *frame_flags |= FRAMEFLAGS_BWDREF;
+  } else {
+    *frame_flags &= ~FRAMEFLAGS_BWDREF;
+  }
+
+  if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+    *frame_flags |= FRAMEFLAGS_KEY;
+  } else {
+    *frame_flags &= ~FRAMEFLAGS_KEY;
+  }
+}
+
+#define DUMP_REF_FRAME_IMAGES 0
+
+#if DUMP_REF_FRAME_IMAGES == 1
+static int dump_one_image(AV1_COMMON *cm,
+                          const YV12_BUFFER_CONFIG *const ref_buf,
+                          char *file_name) {
+  int h;
+  FILE *f_ref = NULL;
+
+  if (ref_buf == NULL) {
+    printf("Frame data buffer is NULL.\n");
+    return AOM_CODEC_MEM_ERROR;
+  }
+
+  if ((f_ref = fopen(file_name, "wb")) == NULL) {
+    printf("Unable to open file %s to write.\n", file_name);
+    return AOM_CODEC_MEM_ERROR;
+  }
+
+  // --- Y ---
+  for (h = 0; h < cm->height; ++h) {
+    fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref);
+  }
+  // --- U ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+           f_ref);
+  }
+  // --- V ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+           f_ref);
+  }
+
+  fclose(f_ref);
+
+  return AOM_CODEC_OK;
+}
+
+static void dump_ref_frame_images(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    char file_name[256] = "";
+    snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv",
+             cm->current_frame.frame_number, ref_frame);
+    dump_one_image(cm, get_ref_frame_yv12_buf(cpi, ref_frame), file_name);
+  }
+}
+#endif  // DUMP_REF_FRAME_IMAGES == 1
+
+// Assign new_ref in the new mapping to point at the reference buffer pointed at
+// by old_ref in the old_map.  The new mapping is stored in *new_map, while the
+// old map comes from cm->remapped_ref_idx[].
+static void assign_new_map(AV1_COMMON *const cm, int *new_map, int new_ref,
+                           int old_ref) {
+  new_map[new_ref - LAST_FRAME] = cm->remapped_ref_idx[old_ref - LAST_FRAME];
+}
+
+// Generate a new reference frame mapping.  This function updates
+// cm->remapped_ref_idx[] depending on the frame_update_type of this frame.
+// This determines which references (e.g. LAST_FRAME, ALTREF_FRAME) point at the
+// 8 underlying buffers and, together with get_refresh_frame_flags(), implements
+// our reference frame management strategy.
+static void update_ref_frame_map(AV1_COMP *cpi,
+                                 FRAME_UPDATE_TYPE frame_update_type) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  // If check_frame_refs_short_signaling() decided to set
+  // frame_refs_short_signaling=1 then we update remapped_ref_idx[] here.  Every
+  // reference will still map to the same RefCntBuffer (through ref_frame_map[])
+  // after this, but that does not necessarily mean that remapped_ref_idx[] is
+  // unchanged.
+  if (cm->current_frame.frame_refs_short_signaling) {
+    const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+    const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+    av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_map_idx, gld_map_idx);
+  }
+
+  // For shown keyframes and S-frames all buffers are refreshed, but we don't
+  // change any of the mapping.
+  if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
+      frame_is_sframe(cm)) {
+    return;
+  }
+
+  // Initialize the new reference map as a copy of the old one.
+  int new_map[REF_FRAMES];
+  memcpy(new_map, cm->remapped_ref_idx, sizeof(new_map));
+
+  // The reference management strategy is currently as follows.  See
+  // gop_structure.c for more details of the structure and DOI
+  // 10.1109/DCC.2018.00045 for a higher-level explanation
+  //
+  // * ALTREF_FRAME and GOLDEN_FRAME are kept separate from the other
+  //   references.  When we code an ALTREF it refreshes the ALTREF buffer.  When
+  //   we code an OVERLAY the old GOLDEN becomes the new ALTREF and the old
+  //   ALTREF (possibly refreshed by the OVERLAY) becomes the new GOLDEN.
+  // * LAST_FRAME, LAST2_FRAME, and LAST3_FRAME work like a FIFO.  When we code
+  //   a frame which does a last-frame update we pick a buffer to refresh and
+  //   then point the LAST_FRAME reference at it.  The old LAST_FRAME becomes
+  //   LAST2_FRAME and the old LAST2_FRAME becomes LAST3_FRAME.  The old
+  //   LAST3_FRAME is re-used somewhere else.
+  // * BWDREF, ALTREF2, and EXTREF act like a stack structure, so we can
+  //   "push" and "pop" internal alt-ref frames through the three references.
+  // * When we code a BRF or internal-ARF (they work the same in this
+  //   structure) we push it onto the bwdref stack.  Because we have a finite
+  //   number of buffers, we actually refresh EXTREF, the bottom of the stack,
+  //   and rotate the three references to make EXTREF the top.
+  // * When we code an INTNL_OVERLAY we refresh BWDREF, then pop it off of the
+  //   bwdref stack and push it into the last-frame FIFO.  The old LAST3
+  //   buffer gets pushed out of the last-frame FIFO and becomes the new
+  //   EXTREF, bottom of the bwdref stack.
+  // * LAST_BIPRED just acts like a LAST_FRAME.  The BWDREF will have an
+  //   INTNL_OVERLAY and so can do its own ref map update.
+  //
+  // Note that this function runs *after* a frame has been coded, so it does not
+  // affect reference assignment of the current frame, it only affects future
+  // frames.  This is why we refresh buffers using the old reference map before
+  // remapping them.
+  //
+  // show_existing_frames don't refresh any buffers or send the reference map to
+  // the decoder, but we can still update our reference map if we want to: the
+  // decoder will update its map next time we code a non-show-existing frame.
+
+  if (frame_update_type == OVERLAY_UPDATE) {
+    // We want the old golden-frame to become our new ARF so swap the
+    // references.  If cpi->preserve_arf_as_gld == 0 then we will refresh the
+    // old ARF before it becomes our new GF
+    assign_new_map(cm, new_map, ALTREF_FRAME, GOLDEN_FRAME);
+    assign_new_map(cm, new_map, GOLDEN_FRAME, ALTREF_FRAME);
+  } else if (frame_update_type == INTNL_OVERLAY_UPDATE &&
+             encode_show_existing_frame(cm)) {
+    // Note that because encode_show_existing_frame(cm) we don't refresh any
+    // buffers.
+    // Pop BWDREF (shown as current frame) from the bwdref stack and make it
+    // the new LAST_FRAME.
+    assign_new_map(cm, new_map, LAST_FRAME, BWDREF_FRAME);
+
+    // Progress the last-frame FIFO and the bwdref stack
+    assign_new_map(cm, new_map, LAST2_FRAME, LAST_FRAME);
+    assign_new_map(cm, new_map, LAST3_FRAME, LAST2_FRAME);
+    assign_new_map(cm, new_map, BWDREF_FRAME, ALTREF2_FRAME);
+    assign_new_map(cm, new_map, ALTREF2_FRAME, EXTREF_FRAME);
+    assign_new_map(cm, new_map, EXTREF_FRAME, LAST3_FRAME);
+  } else if (frame_update_type == INTNL_ARF_UPDATE &&
+             !cm->show_existing_frame) {
+    // We want to push the current frame onto the bwdref stack.  We refresh
+    // EXTREF (the old bottom of the stack) and rotate the references so it
+    // becomes BWDREF, the top of the stack.
+    assign_new_map(cm, new_map, BWDREF_FRAME, EXTREF_FRAME);
+    assign_new_map(cm, new_map, ALTREF2_FRAME, BWDREF_FRAME);
+    assign_new_map(cm, new_map, EXTREF_FRAME, ALTREF2_FRAME);
+  }
+
+  if ((frame_update_type == LF_UPDATE || frame_update_type == GF_UPDATE ||
+       frame_update_type == INTNL_OVERLAY_UPDATE) &&
+      !encode_show_existing_frame(cm) &&
+      (!cm->show_existing_frame || frame_update_type == INTNL_OVERLAY_UPDATE)) {
+    // A standard last-frame: we refresh the LAST3_FRAME buffer and then push it
+    // into the last-frame FIFO.
+    assign_new_map(cm, new_map, LAST3_FRAME, LAST2_FRAME);
+    assign_new_map(cm, new_map, LAST2_FRAME, LAST_FRAME);
+    assign_new_map(cm, new_map, LAST_FRAME, LAST3_FRAME);
+  }
+
+  memcpy(cm->remapped_ref_idx, new_map, sizeof(new_map));
+
+#if DUMP_REF_FRAME_IMAGES == 1
+  // Dump out all reference frame images.
+  dump_ref_frame_images(cpi);
+#endif  // DUMP_REF_FRAME_IMAGES
+}
+
+static int get_refresh_frame_flags(const AV1_COMP *const cpi,
+                                   const EncodeFrameParams *const frame_params,
+                                   FRAME_UPDATE_TYPE frame_update_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  // Switch frames and shown key-frames overwrite all reference slots
+  if ((frame_params->frame_type == KEY_FRAME && frame_params->show_frame) ||
+      frame_params->frame_type == S_FRAME)
+    return 0xFF;
+
+  // show_existing_frames don't actually send refresh_frame_flags so set the
+  // flags to 0 to keep things consistent.
+  if (frame_params->show_existing_frame &&
+      (!frame_params->error_resilient_mode ||
+       frame_params->frame_type == KEY_FRAME)) {
+    return 0;
+  }
+
+  int refresh_mask = 0;
+
+  if (cpi->ext_refresh_frame_flags_pending) {
+    // Unfortunately the encoder interface reflects the old refresh_*_frame
+    // flags so we have to replicate the old refresh_frame_flags logic here in
+    // order to preserve the behaviour of the flag overrides.
+    refresh_mask |= cpi->ext_refresh_last_frame
+                    << get_ref_frame_map_idx(cm, LAST3_FRAME);
+    refresh_mask |= cpi->ext_refresh_bwd_ref_frame
+                    << get_ref_frame_map_idx(cm, EXTREF_FRAME);
+    refresh_mask |= cpi->ext_refresh_alt2_ref_frame
+                    << get_ref_frame_map_idx(cm, ALTREF2_FRAME);
+    if (frame_update_type == OVERLAY_UPDATE) {
+      if (!cpi->preserve_arf_as_gld) {
+        refresh_mask |= cpi->ext_refresh_golden_frame
+                        << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+      }
+    } else {
+      refresh_mask |= cpi->ext_refresh_golden_frame
+                      << get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+      refresh_mask |= cpi->ext_refresh_alt_ref_frame
+                      << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+    }
+    return refresh_mask;
+  }
+
+  // See update_ref_frame_map() for a thorough description of the reference
+  // buffer management strategy currently in use.  This function just decides
+  // which buffers should be refreshed.
+
+  switch (frame_update_type) {
+    case KF_UPDATE:
+      // Note that a real shown key-frame or S-frame refreshes every buffer,
+      // handled in a special case above.  This case is for frames which aren't
+      // really a shown key-frame or S-frame but want to refresh all the
+      // important buffers.
+      refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
+      refresh_mask |= 1 << get_ref_frame_map_idx(cm, EXTREF_FRAME);
+      refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF2_FRAME);
+      refresh_mask |= 1 << get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+      refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+      break;
+    case LF_UPDATE:
+      // Refresh LAST3, which becomes the new LAST while LAST becomes LAST2
+      // and LAST2 becomes the new LAST3 (like a FIFO but circular)
+      refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
+      break;
+    case GF_UPDATE:
+      // In addition to refreshing the GF buffer, we refresh LAST3 and push it
+      // into the last-frame FIFO.
+      refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
+      refresh_mask |= 1 << get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+      break;
+    case OVERLAY_UPDATE:
+      if (!cpi->preserve_arf_as_gld) {
+        // The result of our OVERLAY should become the GOLDEN_FRAME but we'd
+        // like to keep the old GOLDEN as our new ALTREF.  So we refresh the
+        // ALTREF and swap around the ALTREF and GOLDEN references.
+        refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+      }
+      break;
+    case ARF_UPDATE:
+      refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+      break;
+    case INTNL_OVERLAY_UPDATE:
+      // INTNL_OVERLAY may be a show_existing_frame in which case we don't
+      // refresh anything and the BWDREF or ALTREF2 being shown becomes the new
+      // LAST_FRAME.  But, if it's not a show_existing_frame, then we update as
+      // though it's a normal LF_UPDATE: we refresh LAST3 and
+      // update_ref_frame_map() makes that the new LAST_FRAME.
+      refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
+      break;
+    case INTNL_ARF_UPDATE:
+      if (cpi->oxcf.pass == 2) {
+        // Push the new ARF2 onto the bwdref stack.  We refresh EXTREF which is
+        // at the bottom of the stack then move it to the top.
+        refresh_mask |= 1 << get_ref_frame_map_idx(cm, EXTREF_FRAME);
+      } else {
+        // ARF2 just gets stored in the ARF2 slot, no reference map change.
+        refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF2_FRAME);
+      }
+      break;
+    default: assert(0); break;
+  }
+  return refresh_mask;
+}
+
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+                        uint8_t *const dest, unsigned int *frame_flags,
+                        int64_t *const time_stamp, int64_t *const time_end,
+                        const aom_rational_t *const timebase, int flush) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  AV1_COMMON *const cm = &cpi->common;
+
+  EncodeFrameInput frame_input;
+  EncodeFrameParams frame_params;
+  EncodeFrameResults frame_results;
+  memset(&frame_input, 0, sizeof(frame_input));
+  memset(&frame_params, 0, sizeof(frame_params));
+  memset(&frame_results, 0, sizeof(frame_results));
+
+  if (oxcf->pass == 0 || oxcf->pass == 2) {
+    check_show_existing_frame(cpi, &frame_params);
+    frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags);
+  } else {
+    frame_params.show_existing_frame = 0;
+  }
+
+  int temporal_filtered = 0;
+  struct lookahead_entry *source = NULL;
+  struct lookahead_entry *last_source = NULL;
+  FRAME_UPDATE_TYPE frame_update_type;
+  if (frame_params.show_existing_frame) {
+    source = av1_lookahead_pop(cpi->lookahead, flush);
+    frame_update_type = LF_UPDATE;
+  } else {
+    source = choose_frame_source(cpi, &temporal_filtered, &flush, &last_source,
+                                 &frame_update_type, &frame_params);
+  }
+
+  // In pass 2 we get the frame_update_type from gf_group
+  if (oxcf->pass == 2) {
+    frame_update_type =
+        cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+  }
+
+  if (source == NULL) {  // If no source was found, we can't encode a frame.
+    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+      av1_end_first_pass(cpi); /* get last stats packet */
+      cpi->twopass.first_pass_done = 1;
+    }
+    return -1;
+  }
+
+  frame_input.source = temporal_filtered ? &cpi->alt_ref_buffer : &source->img;
+  frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
+  frame_input.ts_duration = source->ts_end - source->ts_start;
+
+  *time_stamp = source->ts_start;
+  *time_end = source->ts_end;
+  if (source->ts_start < cpi->first_time_stamp_ever) {
+    cpi->first_time_stamp_ever = source->ts_start;
+    cpi->last_end_time_stamp_seen = source->ts_start;
+  }
+
+  av1_apply_encoding_flags(cpi, source->flags);
+  if (!frame_params.show_existing_frame)
+    *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+  const int is_overlay = frame_params.show_existing_frame &&
+                         (frame_update_type == OVERLAY_UPDATE ||
+                          frame_update_type == INTNL_OVERLAY_UPDATE);
+  if (frame_params.show_frame || is_overlay) {
+    // Shown frames and arf-overlay frames need frame-rate considering
+    adjust_frame_rate(cpi, source);
+  }
+
+  if (frame_params.show_existing_frame) {
+    // show_existing_frame implies this frame is shown!
+    frame_params.show_frame = 1;
+  } else {
+    if (cpi->film_grain_table) {
+      cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup(
+          cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
+          &cm->film_grain_params);
+    } else {
+      cm->cur_frame->film_grain_params_present =
+          cm->seq_params.film_grain_params_present;
+    }
+    // only one operating point supported now
+    const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp);
+    if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
+    cpi->common.frame_presentation_time = (uint32_t)pts64;
+  }
+
+  if (oxcf->pass == 2 && (!frame_params.show_existing_frame || is_overlay)) {
+    // GF_GROUP needs updating for arf overlays as well as non-show-existing
+    av1_get_second_pass_params(cpi, &frame_params, *frame_flags);
+    frame_update_type =
+        cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+  }
+
+  if (frame_params.show_existing_frame &&
+      frame_params.frame_type != KEY_FRAME) {
+    // Force show-existing frames to be INTER, except forward keyframes
+    frame_params.frame_type = INTER_FRAME;
+  }
+
+  // TODO(david.turner@argondesign.com): Move all the encode strategy
+  // (largely near av1_get_compressed_data) in here
+
+  // TODO(david.turner@argondesign.com): Change all the encode strategy to
+  // modify frame_params instead of cm or cpi.
+
+  // Per-frame encode speed.  In theory this can vary, but things may have been
+  // written assuming speed-level will not change within a sequence, so this
+  // parameter should be used with caution.
+  frame_params.speed = oxcf->speed;
+
+  if (!frame_params.show_existing_frame) {
+    cm->using_qmatrix = cpi->oxcf.using_qm;
+    cm->min_qmlevel = cpi->oxcf.qm_minlevel;
+    cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
+    if (cpi->twopass.gf_group.index == 1 && cpi->oxcf.enable_tpl_model) {
+      av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, 0);
+      av1_set_frame_size(cpi, cm->width, cm->height);
+      av1_tpl_setup_stats(cpi, &frame_input);
+    }
+  }
+
+  // Work out some encoding parameters specific to the pass:
+  if (oxcf->pass == 0) {
+    if (cpi->oxcf.rc_mode == AOM_CBR) {
+      av1_rc_get_one_pass_cbr_params(cpi, &frame_update_type, &frame_params,
+                                     *frame_flags);
+    } else {
+      av1_rc_get_one_pass_vbr_params(cpi, &frame_update_type, &frame_params,
+                                     *frame_flags);
+    }
+  } else if (oxcf->pass == 1) {
+    cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&cpi->oxcf);
+    const int kf_requested = (cm->current_frame.frame_number == 0 ||
+                              (*frame_flags & FRAMEFLAGS_KEY));
+    if (kf_requested && frame_update_type != OVERLAY_UPDATE &&
+        frame_update_type != INTNL_OVERLAY_UPDATE) {
+      frame_params.frame_type = KEY_FRAME;
+    } else {
+      frame_params.frame_type = INTER_FRAME;
+    }
+  } else if (oxcf->pass == 2) {
+#if CONFIG_MISMATCH_DEBUG
+    mismatch_move_frame_idx_w();
+#endif
+#if TXCOEFF_COST_TIMER
+    cm->txcoeff_cost_timer = 0;
+    cm->txcoeff_cost_count = 0;
+#endif
+  }
+
+  if (oxcf->pass == 0 || oxcf->pass == 2) set_ext_overrides(cpi, &frame_params);
+
+  // Shown keyframes and S frames refresh all reference buffers
+  const int force_refresh_all =
+      ((frame_params.frame_type == KEY_FRAME && frame_params.show_frame) ||
+       frame_params.frame_type == S_FRAME) &&
+      !frame_params.show_existing_frame;
+
+  av1_configure_buffer_updates(cpi, &frame_params, frame_update_type,
+                               force_refresh_all);
+
+  if (oxcf->pass == 0 || oxcf->pass == 2) {
+    // Work out which reference frame slots may be used.
+    frame_params.ref_frame_flags = get_ref_frame_flags(cpi);
+
+    frame_params.primary_ref_frame =
+        choose_primary_ref_frame(cpi, &frame_params);
+    frame_params.order_offset =
+        get_order_offset(&cpi->twopass.gf_group, &frame_params);
+
+    frame_params.refresh_frame_flags =
+        get_refresh_frame_flags(cpi, &frame_params, frame_update_type);
+  }
+
+  // The way frame_params->remapped_ref_idx is setup is a placeholder.
+  // Currently, reference buffer assignment is done by update_ref_frame_map()
+  // which is called by high-level strategy AFTER encoding a frame.  It modifies
+  // cm->remapped_ref_idx.  If you want to use an alternative method to
+  // determine reference buffer assignment, just put your assignments into
+  // frame_params->remapped_ref_idx here and they will be used when encoding
+  // this frame.  If frame_params->remapped_ref_idx is setup independently of
+  // cm->remapped_ref_idx then update_ref_frame_map() will have no effect.
+  memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx,
+         REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+  if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
+      AOM_CODEC_OK) {
+    return AOM_CODEC_ERROR;
+  }
+
+  if (oxcf->pass == 0 || oxcf->pass == 2) {
+    // First pass doesn't modify reference buffer assignment or produce frame
+    // flags
+    update_frame_flags(cpi, frame_flags);
+    update_ref_frame_map(cpi, frame_update_type);
+  }
+
+  if (oxcf->pass == 2) {
+#if TXCOEFF_COST_TIMER
+    cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
+    fprintf(stderr,
+            "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
+            "in us\n",
+            cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
+            cm->cum_txcoeff_cost_timer);
+#endif
+    av1_twopass_postencode_update(cpi);
+  }
+
+  if (oxcf->pass == 0 || oxcf->pass == 2) {
+    update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type);
+    set_additional_frame_flags(cm, frame_flags);
+    update_rc_counts(cpi);
+  }
+
+  // Unpack frame_results:
+  *size = frame_results.size;
+
+  // Leave a signal for a higher level caller about if this frame is droppable
+  if (*size > 0) {
+    cpi->droppable = is_frame_droppable(cpi);
+  }
+
+  return AOM_CODEC_OK;
+}
diff --git a/libaom/av1/encoder/encode_strategy.h b/libaom/av1/encoder/encode_strategy.h
new file mode 100644
index 0000000..6830e44
--- /dev/null
+++ b/libaom/av1/encoder/encode_strategy.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+#define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "aom/aom_encoder.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
+// This function will implement high-level encode strategy, choosing frame type,
+// frame placement, etc.  It populates an EncodeFrameParams struct with the
+// results of these decisions and then calls av1_encode()
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+                        uint8_t *const dest, unsigned int *frame_flags,
+                        int64_t *const time_stamp, int64_t *const time_end,
+                        const aom_rational_t *const timebase, int flush);
+
+// Set individual buffer update flags based on frame reference type.
+// force_refresh_all is used when we have a KEY_FRAME or S_FRAME.  It forces all
+// refresh_*_frame flags to be set, because we refresh all buffers in this case.
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  EncodeFrameParams *const frame_params,
+                                  const FRAME_UPDATE_TYPE type,
+                                  int force_refresh_all);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
diff --git a/libaom/av1/encoder/encodeframe.c b/libaom/av1/encoder/encodeframe.c
index ebfc8c2..2952184 100644
--- a/libaom/av1/encoder/encodeframe.c
+++ b/libaom/av1/encoder/encodeframe.c
@@ -10,6 +10,7 @@
  */
 
 #include <limits.h>
+#include <float.h>
 #include <math.h>
 #include <stdbool.h>
 #include <stdio.h>
@@ -54,12 +55,14 @@
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/ml.h"
+#include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/partition_model_weights.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
+#include "av1/encoder/var_based_part.h"
 
 static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
                               ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
@@ -74,7 +77,7 @@ static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
 //  purposes of activity masking.
 // Eventually this should be replaced by custom no-reference routines,
 //  which will be faster.
-static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
+const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
@@ -139,15 +142,6 @@ static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
   128 * 16, 128 * 16
 };
 
-#if CONFIG_FP_MB_STATS
-static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES_ALL] = {
-  1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 1, 1, 1, 2, 2, 4
-};
-static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES_ALL] = {
-  1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 1, 1, 2, 1, 4, 2
-};
-#endif  // CONFIG_FP_MB_STATS
-
 unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
                                            const struct buf_2d *ref,
                                            BLOCK_SIZE bs) {
@@ -188,7 +182,8 @@ static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi,
                                                    BLOCK_SIZE bs) {
   unsigned int sse, var;
   uint8_t *last_y;
-  const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
+  const YV12_BUFFER_CONFIG *last =
+      get_ref_frame_yv12_buf(&cpi->common, LAST_FRAME);
 
   assert(last != NULL);
   last_y =
@@ -211,18 +206,6 @@ static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x,
     return BLOCK_8X8;
 }
 
-// Lighter version of set_offsets that only sets the mode info
-// pointers.
-static void set_mode_info_offsets(const AV1_COMP *const cpi,
-                                  MACROBLOCK *const x, MACROBLOCKD *const xd,
-                                  int mi_row, int mi_col) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int idx_str = xd->mi_stride * mi_row + mi_col;
-  xd->mi = cm->mi_grid_visible + idx_str;
-  xd->mi[0] = cm->mi + idx_str;
-  x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
-}
-
 static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
                                            const TileInfo *const tile,
                                            MACROBLOCK *const x, int mi_row,
@@ -267,25 +250,24 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
 
   // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
   xd->tile = *tile;
+
+  xd->cfl.mi_row = mi_row;
+  xd->cfl.mi_col = mi_col;
 }
 
 static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
                         MACROBLOCK *const x, int mi_row, int mi_col,
                         BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
-  const struct segmentation *const seg = &cm->seg;
 
   set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
 
+  // Setup segment ID.
   mbmi = xd->mi[0];
-  xd->cfl.mi_row = mi_row;
-  xd->cfl.mi_col = mi_col;
-
   mbmi->segment_id = 0;
-
-  // Setup segment ID.
   if (seg->enabled) {
     if (seg->enabled && !cpi->vaq_refresh) {
       const uint8_t *const map =
@@ -297,15 +279,6 @@ static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
   }
 }
 
-static void reset_intmv_filter_type(MB_MODE_INFO *mbmi) {
-  InterpFilter filters[2];
-
-  for (int dir = 0; dir < 2; ++dir) {
-    filters[dir] = av1_extract_interp_filter(mbmi->interp_filters, dir);
-  }
-  mbmi->interp_filters = av1_make_interp_filters(filters[0], filters[1]);
-}
-
 static void update_filter_type_count(uint8_t allow_update_cdf,
                                      FRAME_COUNTS *counts,
                                      const MACROBLOCKD *xd,
@@ -380,8 +353,6 @@ static void update_state(const AV1_COMP *const cpi,
   *mi_addr = *mi;
   *x->mbmi_ext = ctx->mbmi_ext;
 
-  reset_intmv_filter_type(mi_addr);
-
   memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
 
   x->skip = ctx->skip;
@@ -401,7 +372,6 @@ static void update_state(const AV1_COMP *const cpi,
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
       av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize,
                                         ctx->rate, ctx->dist, x->skip);
-      reset_tx_size(x, mi_addr, cm->tx_mode);
     }
     if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
       mi_addr->uv_mode = UV_DC_PRED;
@@ -512,24 +482,32 @@ static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
       cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q);
 }
 
-static uint16_t edge_strength(const struct buf_2d *ref, const BLOCK_SIZE bsize,
-                              const bool high_bd, const int bd) {
+static EdgeInfo edge_info(const struct buf_2d *ref, const BLOCK_SIZE bsize,
+                          const bool high_bd, const int bd) {
   const int width = block_size_wide[bsize];
   const int height = block_size_high[bsize];
   // Implementation requires width to be a multiple of 8. It also requires
   // height to be a multiple of 4, but this is always the case.
   assert(height % 4 == 0);
   if (width % 8 != 0) {
-    return 0;
+    EdgeInfo ei = { .magnitude = 0, .x = 0, .y = 0 };
+    return ei;
   }
   return av1_edge_exists(ref->buf, ref->stride, width, height, high_bd, bd);
 }
 
-static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
-                             MACROBLOCK *const x, int mi_row, int mi_col,
-                             RD_STATS *rd_cost, PARTITION_TYPE partition,
-                             BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                             int64_t best_rd) {
+static int use_pb_simple_motion_pred_sse(const AV1_COMP *const cpi) {
+  // TODO(debargha, yuec): Not in use, need to implement a speed feature
+  // utilizing this data point, and replace '0' by the corresponding speed
+  // feature flag.
+  return 0 && !frame_is_intra_only(&cpi->common);
+}
+
+static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                          MACROBLOCK *const x, int mi_row, int mi_col,
+                          RD_STATS *rd_cost, PARTITION_TYPE partition,
+                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                          int64_t best_rd, int use_nonrd_pick_mode) {
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   TileInfo *const tile_info = &tile_data->tile_info;
@@ -542,6 +520,10 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
   const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode;
   int i, orig_rdmult;
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, rd_pick_sb_modes_time);
+#endif
+
   if (best_rd < 0) {
     ctx->rdcost = INT64_MAX;
     ctx->skip = 0;
@@ -602,21 +584,32 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
     return;
   }
 
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     x->source_variance = av1_high_get_sby_perpixel_variance(
         cpi, &x->plane[0].src, bsize, xd->bd);
   } else {
     x->source_variance =
         av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
   }
+  if (use_pb_simple_motion_pred_sse(cpi)) {
+    const MV ref_mv_full = { .row = 0, .col = 0 };
+    unsigned int var = 0;
+    av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full, 0,
+                              &x->simple_motion_pred_sse, &var);
+  }
+
   // If the threshold for disabling wedge search is zero, it means the feature
   // should not be used. Use a value that will always succeed in the check.
   if (cpi->sf.disable_wedge_search_edge_thresh == 0) {
     x->edge_strength = UINT16_MAX;
+    x->edge_strength_x = UINT16_MAX;
+    x->edge_strength_y = UINT16_MAX;
   } else {
-    x->edge_strength =
-        edge_strength(&x->plane[0].src, bsize,
-                      xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd);
+    EdgeInfo ei =
+        edge_info(&x->plane[0].src, bsize, is_cur_buf_hbd(xd), xd->bd);
+    x->edge_strength = ei.magnitude;
+    x->edge_strength_x = ei.x;
+    x->edge_strength_y = ei.y;
   }
   // Save rdmult before it might be changed, so it can be restored later.
   orig_rdmult = x->rdmult;
@@ -644,22 +637,35 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
   // Find best coding mode & reconstruct the MB so it is available
   // as a predictor for MBs that follow in the SB
   if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
     av1_rd_pick_intra_mode_sb(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx,
                               best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
   } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
     if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
                                          rd_cost, bsize, ctx, best_rd);
-#if CONFIG_ONE_PASS_SVM
-      ctx->seg_feat = 1;
-#endif
     } else {
-      av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
-                                bsize, ctx, best_rd);
-#if CONFIG_ONE_PASS_SVM
-      ctx->seg_feat = 0;
-#endif
+      // TODO(kyslov): do the same for pick_intra_mode and
+      //               pick_inter_mode_sb_seg_skip
+      if (use_nonrd_pick_mode) {
+        av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+                                     bsize, ctx, best_rd);
+      } else {
+        av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+                                  bsize, ctx, best_rd);
+      }
     }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
   }
 
   // Examine the resulting rate and for AQ mode 2 make a segment choice.
@@ -680,6 +686,10 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
   ctx->rate = rd_cost->rate;
   ctx->dist = rd_cost->dist;
   ctx->rdcost = rd_cost->rdcost;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rd_pick_sb_modes_time);
+#endif
 }
 
 static void update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
@@ -1287,11 +1297,13 @@ static void update_stats(const AV1_COMMON *const cm, TileDataEnc *tile_data,
             assert(masked_compound_used);
             if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
 #if CONFIG_ENTROPY_STATS
-              ++counts->compound_type[bsize][mbmi->interinter_comp.type - 1];
+              ++counts->compound_type[bsize][mbmi->interinter_comp.type -
+                                             COMPOUND_WEDGE];
 #endif
               if (allow_update_cdf) {
                 update_cdf(fc->compound_type_cdf[bsize],
-                           mbmi->interinter_comp.type - 1, COMPOUND_TYPES - 1);
+                           mbmi->interinter_comp.type - COMPOUND_WEDGE,
+                           MASKED_COMPOUND_TYPES);
               }
             }
           }
@@ -1474,10 +1486,8 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
   encode_superblock(cpi, tile_data, td, tp, dry_run, mi_row, mi_col, bsize,
                     rate);
 
-  if (dry_run == 0)
-    x->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
-
   if (!dry_run) {
+    x->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
     if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 &&
         cpi->common.delta_q_info.delta_lf_present_flag) {
       const int frame_lf_count = av1_num_planes(&cpi->common) > 1
@@ -1624,25 +1634,6 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 }
 
-// Check to see if the given partition size is allowed for a specified number
-// of mi block rows and columns remaining in the image.
-// If not then return the largest allowed partition size
-static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
-                                      int cols_left, int *bh, int *bw) {
-  if (rows_left <= 0 || cols_left <= 0) {
-    return AOMMIN(bsize, BLOCK_8X8);
-  } else {
-    for (; bsize > 0; bsize -= 3) {
-      *bh = mi_size_high[bsize];
-      *bw = mi_size_wide[bsize];
-      if ((*bh <= rows_left) && (*bw <= cols_left)) {
-        break;
-      }
-    }
-  }
-  return bsize;
-}
-
 static void set_partial_sb_partition(const AV1_COMMON *const cm,
                                      MB_MODE_INFO *mi, int bh_in, int bw_in,
                                      int mi_rows_remaining,
@@ -1766,8 +1757,8 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
     if (partition != PARTITION_NONE && !splits_below &&
         mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
-                       PARTITION_NONE, bsize, ctx_none, INT64_MAX);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+                    PARTITION_NONE, bsize, ctx_none, INT64_MAX, 0);
 
       if (none_rdc.rate < INT_MAX) {
         none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
@@ -1779,29 +1770,16 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
       pc_tree->partitioning = partition;
     }
   }
-  for (int b = 0; b < 2; ++b) {
-    pc_tree->horizontal[b].skip_ref_frame_mask = 0;
-    pc_tree->vertical[b].skip_ref_frame_mask = 0;
-  }
-  for (int b = 0; b < 3; ++b) {
-    pc_tree->horizontala[b].skip_ref_frame_mask = 0;
-    pc_tree->horizontalb[b].skip_ref_frame_mask = 0;
-    pc_tree->verticala[b].skip_ref_frame_mask = 0;
-    pc_tree->verticalb[b].skip_ref_frame_mask = 0;
-  }
-  for (int b = 0; b < 4; ++b) {
-    pc_tree->horizontal4[b].skip_ref_frame_mask = 0;
-    pc_tree->vertical4[b].skip_ref_frame_mask = 0;
-  }
+
   switch (partition) {
     case PARTITION_NONE:
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                       PARTITION_NONE, bsize, ctx_none, INT64_MAX);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_NONE, bsize, ctx_none, INT64_MAX, 0);
       break;
     case PARTITION_HORZ:
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                       PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
-                       INT64_MAX);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0], INT64_MAX,
+                    0);
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_row + hbs < cm->mi_rows) {
         RD_STATS tmp_rdc;
@@ -1810,9 +1788,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
         update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
         encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
                           mi_col, subsize, NULL);
-        rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
-                         PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                         INT64_MAX);
+        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+                      PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+                      INT64_MAX, 0);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
           break;
@@ -1823,9 +1801,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
       }
       break;
     case PARTITION_VERT:
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                       PARTITION_VERT, subsize, &pc_tree->vertical[0],
-                       INT64_MAX);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[0], INT64_MAX,
+                    0);
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_col + hbs < cm->mi_cols) {
         RD_STATS tmp_rdc;
@@ -1834,9 +1812,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
         update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
         encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
                           mi_col, subsize, NULL);
-        rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
-                         PARTITION_VERT, subsize,
-                         &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX);
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+                      PARTITION_VERT, subsize,
+                      &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX, 0);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
           break;
@@ -1910,9 +1888,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
 
       save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
       pc_tree->split[i]->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
-                       &tmp_rdc, PARTITION_SPLIT, split_subsize,
-                       &pc_tree->split[i]->none, INT64_MAX);
+      pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+                    PARTITION_SPLIT, split_subsize, &pc_tree->split[i]->none,
+                    INT64_MAX, 0);
 
       restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
       if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
@@ -1973,67 +1951,170 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
   *dist = chosen_rdc.dist;
 }
 
-/* clang-format off */
-static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = {
-                            BLOCK_4X4,    //                     4x4
-  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,    //    4x8,    8x4,     8x8
-  BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,    //   8x16,   16x8,   16x16
-  BLOCK_8X8,   BLOCK_8X8,   BLOCK_16X16,  //  16x32,  32x16,   32x32
-  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,  //  32x64,  64x32,   64x64
-  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,  // 64x128, 128x64, 128x128
-  BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,    //   4x16,   16x4,    8x32
-  BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,  //   32x8,  16x64,   64x16
-};
+// TODO(kyslov): now this is very similar to rd_use_partition (except that
+// doesn't do extra search arounf suggested partitioning)
+//               consider passing a flag to select non-rd path (similar to
+//               encode_sb_row)
+static void nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+                                TileDataEnc *tile_data, MB_MODE_INFO **mib,
+                                TOKENEXTRA **tp, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize, int *rate, int64_t *dist,
+                                int do_recon, PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  int i;
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  RD_STATS last_part_rdc;
+  PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
 
-static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = {
-                                  BLOCK_8X8,    //                     4x4
-  BLOCK_16X16,   BLOCK_16X16,   BLOCK_16X16,    //    4x8,    8x4,     8x8
-  BLOCK_32X32,   BLOCK_32X32,   BLOCK_32X32,    //   8x16,   16x8,   16x16
-  BLOCK_64X64,   BLOCK_64X64,   BLOCK_64X64,    //  16x32,  32x16,   32x32
-  BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST,  //  32x64,  64x32,   64x64
-  BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST,  // 64x128, 128x64, 128x128
-  BLOCK_16X16,   BLOCK_16X16,   BLOCK_32X32,    //   4x16,   16x4,    8x32
-  BLOCK_32X32,   BLOCK_LARGEST, BLOCK_LARGEST,  //   32x8,  16x64,   64x16
-};
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
-// Next square block size less or equal than current block size.
-static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = {
-                              BLOCK_4X4,    //                     4x4
-  BLOCK_4X4,   BLOCK_4X4,     BLOCK_8X8,    //    4x8,    8x4,     8x8
-  BLOCK_8X8,   BLOCK_8X8,     BLOCK_16X16,  //   8x16,   16x8,   16x16
-  BLOCK_16X16, BLOCK_16X16,   BLOCK_32X32,  //  16x32,  32x16,   32x32
-  BLOCK_32X32, BLOCK_32X32,   BLOCK_64X64,  //  32x64,  64x32,   64x64
-  BLOCK_64X64, BLOCK_64X64, BLOCK_128X128,  // 64x128, 128x64, 128x128
-  BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,      //   4x16,   16x4,    8x32
-  BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,    //   32x8,  16x64,   64x16
-};
-/* clang-format on */
-
-// Look at all the mode_info entries for blocks that are part of this
-// partition and find the min and max values for sb_type.
-// At the moment this is designed to work on a superblock but could be
-// adjusted to use a size parameter.
-//
-// The min and max are assumed to have been initialized prior to calling this
-// function so repeat calls can accumulate a min and max of more than one
-// superblock.
-static void get_sb_partition_size_range(const AV1_COMMON *const cm,
-                                        MACROBLOCKD *xd, MB_MODE_INFO **mib,
-                                        BLOCK_SIZE *min_block_size,
-                                        BLOCK_SIZE *max_block_size) {
-  int i, j;
-  int index = 0;
-
-  // Check the sb_type for each block that belongs to this region.
-  for (i = 0; i < cm->seq_params.mib_size; ++i) {
-    for (j = 0; j < cm->seq_params.mib_size; ++j) {
-      MB_MODE_INFO *mi = mib[index + j];
-      BLOCK_SIZE sb_type = mi ? mi->sb_type : BLOCK_4X4;
-      *min_block_size = AOMMIN(*min_block_size, sb_type);
-      *max_block_size = AOMMAX(*max_block_size, sb_type);
-    }
-    index += xd->mi_stride;
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  av1_invalid_rd_stats(&last_part_rdc);
+
+  pc_tree->partitioning = partition;
+
+  xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
   }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_NONE, bsize, ctx_none, INT64_MAX, 1);
+      break;
+    case PARTITION_HORZ:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0], INT64_MAX,
+                    1);
+      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+          mi_row + hbs < cm->mi_rows) {
+        RD_STATS tmp_rdc;
+        const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+        av1_init_rd_stats(&tmp_rdc);
+        update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+                          mi_col, subsize, NULL);
+        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+                      PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+                      INT64_MAX, 1);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_VERT:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[0], INT64_MAX,
+                    1);
+      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+          mi_col + hbs < cm->mi_cols) {
+        RD_STATS tmp_rdc;
+        const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
+        av1_init_rd_stats(&tmp_rdc);
+        update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+                          mi_col, subsize, NULL);
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+                      PARTITION_VERT, subsize,
+                      &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX, 1);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_SPLIT:
+      last_part_rdc.rate = 0;
+      last_part_rdc.dist = 0;
+      last_part_rdc.rdcost = 0;
+      for (i = 0; i < 4; i++) {
+        int x_idx = (i & 1) * hbs;
+        int y_idx = (i >> 1) * hbs;
+        int jj = i >> 1, ii = i & 0x01;
+        RD_STATS tmp_rdc;
+        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+          continue;
+
+        av1_init_rd_stats(&tmp_rdc);
+        nonrd_use_partition(
+            cpi, td, tile_data, mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
+            mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+            &tmp_rdc.dist, i != 3, pc_tree->split[i]);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+      }
+      break;
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_HORZ_4:
+    case PARTITION_VERT_4:
+      assert(0 && "Cannot handle extended partition types");
+    default: assert(0); break;
+  }
+
+  if (last_part_rdc.rate < INT_MAX) {
+    last_part_rdc.rate += x->partition_cost[pl][partition];
+    last_part_rdc.rdcost =
+        RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
+  }
+
+  restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  // We must have chosen a partitioning and encoding or we'll fail later on.
+  // No other opportunities for success.
+  if (bsize == cm->seq_params.sb_size)
+    assert(last_part_rdc.rate < INT_MAX && last_part_rdc.dist < INT64_MAX);
+
+  if (do_recon) {
+    if (bsize == cm->seq_params.sb_size) {
+      // NOTE: To get estimate for rate due to the tokens, use:
+      // int rate_coeffs = 0;
+      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      //           bsize, pc_tree, &rate_coeffs);
+      x->cb_offset = 0;
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+
+  *rate = last_part_rdc.rate;
+  *dist = last_part_rdc.dist;
 }
 
 // Checks to see if a super block is on a horizontal image edge.
@@ -2090,234 +2171,6 @@ static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
   return is_active_v_edge;
 }
 
-// Checks to see if a super block is at the edge of the active image.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-static int active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
-  return active_h_edge(cpi, mi_row, cpi->common.seq_params.mib_size) ||
-         active_v_edge(cpi, mi_col, cpi->common.seq_params.mib_size);
-}
-
-// Performs a motion search in SIMPLE_TRANSLATION mode using
-// reference frame ref. Returns the sad of the result
-static void simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
-                                 int mi_col, BLOCK_SIZE bsize, int ref,
-                                 int num_planes, int use_subpixel) {
-  AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-
-  mbmi->ref_frame[0] = ref;
-  mbmi->ref_frame[1] = NONE_FRAME;
-  mbmi->sb_type = bsize;
-  mbmi->motion_mode = SIMPLE_TRANSLATION;
-
-  YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref);
-  const YV12_BUFFER_CONFIG *scaled_ref_frame =
-      av1_get_scaled_ref_frame(cpi, ref);
-  struct buf_2d backup_yv12;
-  // ref_mv is in units of 1/8-pel whereas ref_mv_full is in units of pel
-  MV ref_mv = { 0, 0 };
-  MV ref_mv_full = { 0, 0 };
-  const int step_param = cpi->mv_step_param;
-  const MvLimits tmp_mv_limits = x->mv_limits;
-  const SEARCH_METHODS search_methods = NSTEP;
-  const int do_mesh_search = 0;
-  const int sadpb = x->sadperbit16;
-  int cost_list[5];
-  const int ref_idx = 0;
-  int var;
-
-  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-
-  if (scaled_ref_frame) {
-    backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
-    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
-                         num_planes);
-  } else {
-    av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
-                         &cm->current_frame.frame_refs[ref - LAST_FRAME].sf,
-                         num_planes);
-  }
-
-  // This overwrites the mv_limits so we will need to restore it later.
-  av1_set_mv_search_range(&x->mv_limits, &ref_mv);
-  var = av1_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param,
-                              search_methods, do_mesh_search, sadpb,
-                              cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX,
-                              1, mi_col * MI_SIZE, mi_row * MI_SIZE, 0);
-  // Restore
-  x->mv_limits = tmp_mv_limits;
-
-  const int use_subpel_search =
-      var < INT_MAX && !cpi->common.cur_frame_force_integer_mv && use_subpixel;
-  if (use_subpel_search) {
-    int not_used = 0;
-    if (cpi->sf.use_accurate_subpel_search) {
-      const int pw = block_size_wide[bsize];
-      const int ph = block_size_high[bsize];
-      cpi->find_fractional_mv_step(
-          x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
-          x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-          cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-          x->nmv_vec_cost, x->mv_cost_stack, &not_used, &x->pred_sse[ref], NULL,
-          NULL, 0, 0, pw, ph, cpi->sf.use_accurate_subpel_search, 1);
-    } else {
-      cpi->find_fractional_mv_step(
-          x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
-          x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-          cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-          x->nmv_vec_cost, x->mv_cost_stack, &not_used, &x->pred_sse[ref], NULL,
-          NULL, 0, 0, 0, 0, 0, 1);
-    }
-  } else {
-    // Manually convert from units of pixel to 1/8-pixels if we are not doing
-    // subpel search
-    x->best_mv.as_mv.row *= 8;
-    x->best_mv.as_mv.col *= 8;
-  }
-
-  mbmi->mv[0].as_mv = x->best_mv.as_mv;
-
-  // Get a copy of the prediction output
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
-
-  aom_clear_system_state();
-
-  if (scaled_ref_frame) {
-    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
-  }
-}
-
-// Look at neighboring blocks and set a min and max partition size based on
-// what they chose.
-static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
-                                    MACROBLOCKD *const xd, int mi_row,
-                                    int mi_col, BLOCK_SIZE *min_block_size,
-                                    BLOCK_SIZE *max_block_size) {
-  AV1_COMMON *const cm = &cpi->common;
-  MB_MODE_INFO **mi = xd->mi;
-  const int left_in_image = xd->left_available && mi[-1];
-  const int above_in_image = xd->up_available && mi[-xd->mi_stride];
-  const int mi_rows_remaining = tile->mi_row_end - mi_row;
-  const int mi_cols_remaining = tile->mi_col_end - mi_col;
-  int bh, bw;
-  BLOCK_SIZE min_size = BLOCK_4X4;
-  BLOCK_SIZE max_size = BLOCK_LARGEST;
-
-  // Trap case where we do not have a prediction.
-  if (left_in_image || above_in_image ||
-      cm->current_frame.frame_type != KEY_FRAME) {
-    // Default "min to max" and "max to min"
-    min_size = BLOCK_LARGEST;
-    max_size = BLOCK_4X4;
-
-    // NOTE: each call to get_sb_partition_size_range() uses the previous
-    // passed in values for min and max as a starting point.
-    // Find the min and max partition used in previous frame at this location
-    if (cm->current_frame.frame_type != KEY_FRAME) {
-      MB_MODE_INFO **prev_mi =
-          &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
-      get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size);
-    }
-    // Find the min and max partition sizes used in the left superblock
-    if (left_in_image) {
-      MB_MODE_INFO **left_sb_mi = &mi[-cm->seq_params.mib_size];
-      get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size);
-    }
-    // Find the min and max partition sizes used in the above suprblock.
-    if (above_in_image) {
-      MB_MODE_INFO **above_sb_mi =
-          &mi[-xd->mi_stride * cm->seq_params.mib_size];
-      get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size);
-    }
-
-    // Adjust observed min and max for "relaxed" auto partition case.
-    if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
-      min_size = min_partition_size[min_size];
-      max_size = max_partition_size[max_size];
-    }
-  }
-
-  // Check border cases where max and min from neighbors may not be legal.
-  max_size = find_partition_size(max_size, mi_rows_remaining, mi_cols_remaining,
-                                 &bh, &bw);
-  min_size = AOMMIN(min_size, max_size);
-
-  // Test for blocks at the edge of the active image.
-  // This may be the actual edge of the image or where there are formatting
-  // bars.
-  if (active_edge_sb(cpi, mi_row, mi_col)) {
-    min_size = BLOCK_4X4;
-  } else {
-    min_size = AOMMIN(cpi->sf.rd_auto_partition_min_limit, min_size);
-  }
-
-  // When use_square_partition_only is true, make sure at least one square
-  // partition is allowed by selecting the next smaller square size as
-  // *min_block_size.
-  if (min_size >= cpi->sf.use_square_partition_only_threshold) {
-    min_size = AOMMIN(min_size, next_square_size[max_size]);
-  }
-
-  *min_block_size = AOMMIN(min_size, cm->seq_params.sb_size);
-  *max_block_size = AOMMIN(max_size, cm->seq_params.sb_size);
-}
-
-// TODO(jingning) refactor functions setting partition search range
-static void set_partition_range(const AV1_COMMON *const cm,
-                                const MACROBLOCKD *const xd, int mi_row,
-                                int mi_col, BLOCK_SIZE bsize,
-                                BLOCK_SIZE *const min_bs,
-                                BLOCK_SIZE *const max_bs) {
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-  int idx, idy;
-
-  const int idx_str = cm->mi_stride * mi_row + mi_col;
-  MB_MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str];
-  BLOCK_SIZE min_size = cm->seq_params.sb_size;  // default values
-  BLOCK_SIZE max_size = BLOCK_4X4;
-
-  if (prev_mi) {
-    for (idy = 0; idy < mi_height; ++idy) {
-      for (idx = 0; idx < mi_width; ++idx) {
-        const MB_MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx];
-        const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
-        min_size = AOMMIN(min_size, bs);
-        max_size = AOMMAX(max_size, bs);
-      }
-    }
-  }
-
-  if (xd->left_available) {
-    for (idy = 0; idy < mi_height; ++idy) {
-      const MB_MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1];
-      const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
-      min_size = AOMMIN(min_size, bs);
-      max_size = AOMMAX(max_size, bs);
-    }
-  }
-
-  if (xd->up_available) {
-    for (idx = 0; idx < mi_width; ++idx) {
-      const MB_MODE_INFO *const mi = xd->mi[idx - cm->mi_stride];
-      const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
-      min_size = AOMMIN(min_size, bs);
-      max_size = AOMMAX(max_size, bs);
-    }
-  }
-
-  if (min_size == max_size) {
-    min_size = min_partition_size[min_size];
-    max_size = max_partition_size[max_size];
-  }
-
-  *min_bs = AOMMIN(min_size, cm->seq_params.sb_size);
-  *max_bs = AOMMIN(max_size, cm->seq_params.sb_size);
-}
-
 static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
   memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
 }
@@ -2327,56 +2180,6 @@ static INLINE void load_pred_mv(MACROBLOCK *x,
   memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
 }
 
-#if CONFIG_FP_MB_STATS
-const int qindex_skip_threshold_lookup[BLOCK_SIZES] = {
-  0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120,
-  // TODO(debargha): What are the correct numbers here?
-  130, 130, 150
-};
-const int qindex_split_threshold_lookup[BLOCK_SIZES] = {
-  0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120,
-  // TODO(debargha): What are the correct numbers here?
-  160, 160, 240
-};
-const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = {
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6,
-  // TODO(debargha): What are the correct numbers here?
-  8, 8, 10
-};
-
-typedef enum {
-  MV_ZERO = 0,
-  MV_LEFT = 1,
-  MV_UP = 2,
-  MV_RIGHT = 3,
-  MV_DOWN = 4,
-  MV_INVALID
-} MOTION_DIRECTION;
-
-static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
-  if (fp_byte & FPMB_MOTION_ZERO_MASK) {
-    return MV_ZERO;
-  } else if (fp_byte & FPMB_MOTION_LEFT_MASK) {
-    return MV_LEFT;
-  } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) {
-    return MV_RIGHT;
-  } else if (fp_byte & FPMB_MOTION_UP_MASK) {
-    return MV_UP;
-  } else {
-    return MV_DOWN;
-  }
-}
-
-static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
-                                           MOTION_DIRECTION that_mv) {
-  if (this_mv == that_mv) {
-    return 0;
-  } else {
-    return abs(this_mv - that_mv) == 2 ? 2 : 1;
-  }
-}
-#endif
-
 // Try searching for an encoding for the given subblock. Returns zero if the
 // rdcost is already too high (to tell the caller not to bother searching for
 // encodings of further subblocks)
@@ -2398,9 +2201,9 @@ static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
                                        ? INT64_MAX
                                        : (best_rdc->rdcost - sum_rdc->rdcost);
 
-  rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
-                   RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
-                   rdcost_remaining);
+  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
+                RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
+                rdcost_remaining, 0);
 
   if (this_rdc->rate == INT_MAX) {
     sum_rdc->rdcost = INT64_MAX;
@@ -2616,8 +2419,8 @@ static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
     const int64_t best_remain_rdcost =
         best_rdc.rdcost == INT64_MAX ? INT64_MAX
                                      : (best_rdc.rdcost - partition_rd_cost);
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
-                     PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
+                  bsize, ctx_none, best_remain_rdcost, 0);
 
     pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost;
     pc_tree->pc_tree_stats.skip = ctx_none->skip;
@@ -2669,6 +2472,17 @@ static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
             do_square_split = 0;
           }
         }
+
+        if (cpi->sf.firstpass_simple_motion_search_early_term &&
+            cm->show_frame && bsize <= BLOCK_32X32 && bsize >= BLOCK_8X8 &&
+            !frame_is_intra_only(cm) && mi_row + mi_step < cm->mi_rows &&
+            mi_col + mi_step < cm->mi_cols && this_rdc.rdcost < INT64_MAX &&
+            this_rdc.rdcost >= 0 && this_rdc.rate < INT_MAX &&
+            this_rdc.rate >= 0 && do_square_split) {
+          av1_firstpass_simple_motion_search_early_term(
+              cpi, x, pc_tree, mi_row, mi_col, bsize, &this_rdc,
+              &do_square_split);
+        }
       }
     }
 
@@ -2788,79 +2602,9 @@ static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
   }
 }
 
-#define FEATURE_SIZE 19
-static const float two_pass_split_partition_weights_128[FEATURE_SIZE + 1] = {
-  2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f,
-  0.125296f, -1.134961f, 0.862757f,  -0.418799f, -0.637666f,
-  0.016232f, 0.345013f,  0.018823f,  -0.393394f, -1.130700f,
-  0.695357f, 0.112569f,  -0.341975f, -0.513882f, 5.7488966f,
-};
-
-static const float two_pass_split_partition_weights_64[FEATURE_SIZE + 1] = {
-  2.990993f,  0.423273f,  -0.926544f, 0.454646f,  -0.292698f,
-  -1.311632f, -0.284432f, 0.717141f,  -0.419257f, -0.574760f,
-  -0.674444f, 0.669047f,  -0.374255f, 0.380624f,  -0.804036f,
-  0.264021f,  0.004163f,  1.896802f,  0.924287f,  0.13490619f,
-};
-
-static const float two_pass_split_partition_weights_32[FEATURE_SIZE + 1] = {
-  2.795181f,  -0.136943f, -0.924842f, 0.405330f,  -0.463505f,
-  -0.584076f, -0.831472f, 0.382985f,  -0.597544f, -0.138915f,
-  -1.354350f, 0.466035f,  -0.553961f, 0.213202f,  -1.166429f,
-  0.010776f,  -0.096236f, 2.335084f,  1.699857f,  -0.58178353f,
-};
-
-static const float two_pass_split_partition_weights_16[FEATURE_SIZE + 1] = {
-  1.987888f,  -0.431100f, -1.687703f, 0.262602f,  -0.425298f,
-  -0.463870f, -1.493457f, 0.470917f,  -0.528457f, -0.087700f,
-  -1.815092f, 0.152883f,  -0.337908f, 0.093679f,  -1.548267f,
-  -0.042387f, -0.000861f, 2.556746f,  1.619192f,  0.03643292f,
-};
-
-static const float two_pass_split_partition_weights_8[FEATURE_SIZE + 1] = {
-  2.188344f,  -0.817528f, -2.119219f, 0.000000f,  -0.348167f,
-  -0.658074f, -1.960362f, 0.000000f,  -0.403080f, 0.282699f,
-  -2.061088f, 0.000000f,  -0.431919f, -0.127960f, -1.099550f,
-  0.000000f,  0.121622f,  2.017455f,  2.058228f,  -0.15475988f,
-};
-
-static const float two_pass_none_partition_weights_128[FEATURE_SIZE + 1] = {
-  -1.006689f, 0.777908f,  4.461072f,  -0.395782f, -0.014610f,
-  -0.853863f, 0.729997f,  -0.420477f, 0.282429f,  -1.194595f,
-  3.181220f,  -0.511416f, 0.117084f,  -1.149348f, 1.507990f,
-  -0.477212f, 0.202963f,  -1.469581f, 0.624461f,  -0.89081228f,
-};
-
-static const float two_pass_none_partition_weights_64[FEATURE_SIZE + 1] = {
-  -1.241117f, 0.844878f,  5.638803f,  -0.489780f, -0.108796f,
-  -4.576821f, 1.540624f,  -0.477519f, 0.227791f,  -1.443968f,
-  1.586911f,  -0.505125f, 0.140764f,  -0.464194f, 1.466658f,
-  -0.641166f, 0.195412f,  1.427905f,  2.080007f,  -1.98272777f,
-};
-
-static const float two_pass_none_partition_weights_32[FEATURE_SIZE + 1] = {
-  -2.130825f, 0.476023f,  5.907343f,  -0.516002f, -0.097471f,
-  -2.662754f, 0.614858f,  -0.576728f, 0.085261f,  -0.031901f,
-  0.727842f,  -0.600034f, 0.079326f,  0.324328f,  0.504502f,
-  -0.547105f, -0.037670f, 0.304995f,  0.369018f,  -2.66299987f,
-};
-
-static const float two_pass_none_partition_weights_16[FEATURE_SIZE + 1] = {
-  -1.626410f, 0.872047f,  5.414965f,  -0.554781f, -0.084514f,
-  -3.020550f, 0.467632f,  -0.382280f, 0.199568f,  0.426220f,
-  0.829426f,  -0.467100f, 0.153098f,  0.662994f,  0.327545f,
-  -0.560106f, -0.141610f, 0.403372f,  0.523991f,  -3.02891231f,
-};
-
-static const float two_pass_none_partition_weights_8[FEATURE_SIZE + 1] = {
-  -1.463349f, 0.375376f,  4.751430f, 0.000000f, -0.184451f,
-  -1.655447f, 0.443214f,  0.000000f, 0.127961f, 0.152435f,
-  0.083288f,  0.000000f,  0.143105f, 0.438012f, 0.073238f,
-  0.000000f,  -0.278137f, 0.186134f, 0.073737f, -1.6494962f,
-};
-
 // split_score indicates confidence of picking split partition;
 // none_score indicates confidence of picking none partition;
+#define FEATURE_SIZE 19
 static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
                                           BLOCK_SIZE bsize, int *split_score,
                                           int *none_score) {
@@ -2870,24 +2614,24 @@ static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
   switch (bsize) {
     case BLOCK_4X4: break;
     case BLOCK_8X8:
-      split_weights = two_pass_split_partition_weights_8;
-      none_weights = two_pass_none_partition_weights_8;
+      split_weights = av1_2pass_split_partition_weights_8;
+      none_weights = av1_2pass_none_partition_weights_8;
       break;
     case BLOCK_16X16:
-      split_weights = two_pass_split_partition_weights_16;
-      none_weights = two_pass_none_partition_weights_16;
+      split_weights = av1_2pass_split_partition_weights_16;
+      none_weights = av1_2pass_none_partition_weights_16;
       break;
     case BLOCK_32X32:
-      split_weights = two_pass_split_partition_weights_32;
-      none_weights = two_pass_none_partition_weights_32;
+      split_weights = av1_2pass_split_partition_weights_32;
+      none_weights = av1_2pass_none_partition_weights_32;
       break;
     case BLOCK_64X64:
-      split_weights = two_pass_split_partition_weights_64;
-      none_weights = two_pass_none_partition_weights_64;
+      split_weights = av1_2pass_split_partition_weights_64;
+      none_weights = av1_2pass_none_partition_weights_64;
       break;
     case BLOCK_128X128:
-      split_weights = two_pass_split_partition_weights_128;
-      none_weights = two_pass_none_partition_weights_128;
+      split_weights = av1_2pass_split_partition_weights_128;
+      none_weights = av1_2pass_none_partition_weights_128;
       break;
     default: assert(0 && "Unexpected bsize.");
   }
@@ -2981,7 +2725,7 @@ static void ml_prune_rect_partition(const AV1_COMP *const cpi,
   // Variance ratios
   const MACROBLOCKD *const xd = &x->e_mbd;
   int whole_block_variance;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     whole_block_variance = av1_high_get_sby_perpixel_variance(
         cpi, &x->plane[0].src, bsize, xd->bd);
   } else {
@@ -2999,7 +2743,7 @@ static void ml_prune_rect_partition(const AV1_COMP *const cpi,
     const int x_idx = (i & 1) * bw / 2;
     const int y_idx = (i >> 1) * bw / 2;
     buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
       split_variance[i] =
           av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd);
     } else {
@@ -3181,7 +2925,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
           src + i * block_size_high[horz_4_bs] * src_stride;
       const uint8_t *vert_src = src + i * block_size_wide[vert_4_bs];
       unsigned int horz_var, vert_var, sse;
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (is_cur_buf_hbd(xd)) {
         switch (xd->bd) {
           case 10:
             horz_var = cpi->fn_ptr[horz_4_bs].vf(
@@ -3340,204 +3084,32 @@ static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
 }
 #undef FEATURES
 
-#if CONFIG_ONE_PASS_SVM
-#define FEATURES 24
-static void ml_op_svm_early_term(const AV1_COMP *const cpi,
-                                 const MACROBLOCK *const x,
-                                 const MACROBLOCKD *const xd,
-                                 const PICK_MODE_CONTEXT *ctx_none,
-                                 const RD_STATS *none_rdc, int pb_source_var,
-                                 BLOCK_SIZE bsize, float *const score) {
-  const float *ml_weights = NULL, *ml_mean = NULL, *ml_std = NULL;
-  if (bsize == BLOCK_128X128) {
-    ml_weights = av1_op_svm_early_term_weights_128;
-    ml_mean = av1_op_svm_early_term_mean_128;
-    ml_std = av1_op_svm_early_term_std_128;
-  } else if (bsize == BLOCK_64X64) {
-    ml_weights = av1_op_svm_early_term_weights_64;
-    ml_mean = av1_op_svm_early_term_mean_64;
-    ml_std = av1_op_svm_early_term_std_64;
-  } else if (bsize == BLOCK_32X32) {
-    ml_weights = av1_op_svm_early_term_weights_32;
-    ml_mean = av1_op_svm_early_term_mean_32;
-    ml_std = av1_op_svm_early_term_std_32;
-  } else if (bsize == BLOCK_16X16) {
-    ml_weights = av1_op_svm_early_term_weights_16;
-    ml_mean = av1_op_svm_early_term_mean_16;
-    ml_std = av1_op_svm_early_term_std_16;
-  } else {
-    assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 ||
-           bsize == BLOCK_32X32 || bsize == BLOCK_8X8);
-  }
-  if (ml_weights != NULL) {
-    // Compute some features
-
-    float features[FEATURES] = { 0 };
-    int f_idx = 0;
-    int r_idx = 0;
-
-    // None features
-    // Get none stats
-    features[f_idx++] = none_rdc->rate;
-    features[f_idx++] = none_rdc->dist;
-    features[f_idx++] = none_rdc->rdcost;
-    features[f_idx++] = ctx_none->skip;
-
-    // EOBS
-    features[f_idx++] = none_rdc->eob;
-    int scaled_eob = none_rdc->eob * 32 * 32;
-    features[f_idx++] = (1.0f + none_rdc->eob_0) / (4.0f + scaled_eob);
-    features[f_idx++] = (1.0f + none_rdc->eob_1) / (4.0f + scaled_eob);
-    features[f_idx++] = (1.0f + none_rdc->eob_2) / (4.0f + scaled_eob);
-    features[f_idx++] = (1.0f + none_rdc->eob_3) / (4.0f + scaled_eob);
-
-    // Y_RD
-    features[f_idx++] = none_rdc->rd;
-    int64_t scaled_rd = none_rdc->rd * 32 * 32;
-    features[f_idx++] = (1.0f + none_rdc->rd_0) / (4.0f + scaled_rd);
-    features[f_idx++] = (1.0f + none_rdc->rd_1) / (4.0f + scaled_rd);
-    features[f_idx++] = (1.0f + none_rdc->rd_2) / (4.0f + scaled_rd);
-    features[f_idx++] = (1.0f + none_rdc->rd_3) / (4.0f + scaled_rd);
-
-    // Q_SQUARED
-    features[f_idx++] =
-        (x->plane[0].dequant_QTX[0]) * (x->plane[0].dequant_QTX[0]);
-
-    // SIZE
-    // Get size of surrounding blocks
-    int above_size = 18, left_size = 18;
-    const MB_MODE_INFO *above_block = xd->above_mbmi;
-    const MB_MODE_INFO *left_block = xd->left_mbmi;
-
-    if (above_block) {
-      above_size = above_block->sb_type;
-    }
-    if (left_block) {
-      left_size = left_block->sb_type;
-    }
-
-    features[f_idx++] = left_size;
-    features[f_idx++] = left_size != 18;
-
-    features[f_idx++] = above_size;
-    features[f_idx++] = above_size != 18;
-
-    // Variance
-    // Get variance
-    int var = pb_source_var, var_reg[4] = { 0 };
-    const int bw = block_size_wide[bsize];
-    const int bh = block_size_high[bsize];
-    const BLOCK_SIZE split_size = get_partition_subsize(bsize, PARTITION_SPLIT);
-    struct buf_2d buf;
-    buf.stride = x->plane[0].src.stride;
-    for (int i = 0; i < 4; ++i) {
-      const int x_idx = (i & 1) * bw / 2;
-      const int y_idx = (i >> 1) * bh / 2;
-      buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        var_reg[i] =
-            av1_high_get_sby_perpixel_variance(cpi, &buf, split_size, xd->bd);
-      } else {
-        var_reg[i] = av1_get_sby_perpixel_variance(cpi, &buf, split_size);
-      }
-    }
-
-    features[f_idx++] = var;
-    for (r_idx = 0; r_idx < 4; r_idx++) {
-      features[f_idx] = (var_reg[r_idx] + 1.0f) / (var + 4.0f);
-      f_idx++;
-    }
-
-    assert(f_idx == FEATURES);
-
-    // Calculate the score
-    *score = 0.0f;
-    for (f_idx = 0; f_idx < FEATURES; f_idx++) {
-      *score += ml_weights[f_idx] * (features[f_idx] - ml_mean[f_idx]) /
-                ml_std[f_idx];
-    }
-    // Dont forget the bias
-    *score += ml_weights[FEATURES];
-  }
-}
-#undef FEATURES
-#endif
-
-// Performs a full_pixel_motion_search with a single reference frame and extract
-// the variance of residues. Here features is assumed to be a length 6 array.
-// After this function is called, we will store the following in to features:
-// features[0] = log(1 + dc_q**2/256)
-// features[1] = log(1 + variance_of_residue)
-// for i in [2, 3, 4, 5]:
-//  features[i] = log(1 + variance_of_residue_in_block[i]/variance_of_residue)
-static void get_res_var_features(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
-                                 int mi_col, BLOCK_SIZE bsize,
-                                 float *features) {
-  // TODO(chiyotsai@google.com): The data this model trained on did not also use
-  // SIMPLE_TRANSLATION to build the inter_predictor. Retraining and tuning the
-  // model with the correct data should give better performance.
+// Record the ref frames that have been selected by square partition blocks.
+static void update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+                                          BLOCK_SIZE bsize, int mib_size,
+                                          int mi_row, int mi_col) {
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-
-  MACROBLOCKD *xd = &x->e_mbd;
-  DECLARE_ALIGNED(16, uint16_t, pred_buffer[MAX_SB_SQUARE]);
-  int pred_stride = 128;
-
-  // Perform a single motion search in Y_PLANE to make a prediction
-  const MV_REFERENCE_FRAME ref =
-      cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
-  const int use_subpixel = 0;
-  const int num_planes = 1;
-
-  uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-                                ? CONVERT_TO_BYTEPTR(pred_buffer)
-                                : (uint8_t *)pred_buffer;
-  xd->plane[0].dst.buf = pred_buf;
-  xd->plane[0].dst.stride = pred_stride;
-
-  simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, num_planes,
-                       use_subpixel);
-
-  // Start getting the features
-  int f_idx = 0;
-
-  // Q_INDEX
-  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
-  features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
-
-  // VARIANCE
-  const uint8_t *src = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  unsigned int sse = 0;
-
-  // Whole block
-  const unsigned int var =
-      cpi->fn_ptr[bsize].vf(src, src_stride, pred_buf, pred_stride, &sse);
-  features[f_idx++] = logf(1.0f + (float)var);
-
-  // Regional
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-  int r_idx = 0;
-  for (r_idx = 0; r_idx < 4; r_idx++) {
-    const int x_idx = (r_idx & 1) * bw / 2;
-    const int y_idx = (r_idx >> 1) * bh / 2;
-    const int src_offset = y_idx * src_stride + x_idx;
-    const int pred_offset = y_idx * pred_stride + x_idx;
-    const unsigned int sub_var =
-        cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
-                                pred_buf + pred_offset, pred_stride, &sse);
-    const float var_ratio = (1.0f + (float)sub_var) / (4.0f + (float)var);
-    features[f_idx++] = var_ratio;
+  const int sb_size_mask = mib_size - 1;
+  const int mi_row_in_sb = mi_row & sb_size_mask;
+  const int mi_col_in_sb = mi_col & sb_size_mask;
+  const int mi_size = mi_size_wide[bsize];
+  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) {
+    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) {
+      x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type;
+    }
   }
 }
 
-// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// TODO(jinging,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
+// TODO(chiyotsai@google.com): Move these ml related varables to a seprate file
+// to separate low level ml logic from partition logic
+#define NUM_SIMPLE_MOTION_FEATURES 28
 static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
                               TileDataEnc *tile_data, TOKENEXTRA **tp,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              BLOCK_SIZE max_sq_part, BLOCK_SIZE min_sq_part,
                               RD_STATS *rd_cost, int64_t best_rd,
                               PC_TREE *pc_tree, int64_t *none_rd) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -3560,11 +3132,14 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
   const int *partition_cost =
       pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
 
-  int do_rectangular_split = 1;
+  int do_rectangular_split = cpi->oxcf.enable_rect_partitions;
   int64_t cur_none_rd = 0;
   int64_t split_rd[4] = { 0, 0, 0, 0 };
   int64_t horz_rd[2] = { 0, 0 };
   int64_t vert_rd[2] = { 0, 0 };
+  int prune_horz = 0;
+  int prune_vert = 0;
+  int terminate_partition_search = 0;
 
   int split_ctx_is_ready[2] = { 0, 0 };
   int horz_ctx_is_ready = 0;
@@ -3585,22 +3160,26 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
   const int xss = x->e_mbd.plane[1].subsampling_x;
   const int yss = x->e_mbd.plane[1].subsampling_y;
 
-  BLOCK_SIZE min_size = x->min_partition_size;
-  BLOCK_SIZE max_size = x->max_partition_size;
-
   if (none_rd) *none_rd = 0;
-
-#if CONFIG_FP_MB_STATS
-  unsigned int src_diff_var = UINT_MAX;
-  int none_complexity = 0;
-#endif
-
   int partition_none_allowed = has_rows && has_cols;
-  int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
-  int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+  int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 &&
+                               cpi->oxcf.enable_rect_partitions;
+  int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 &&
+                               cpi->oxcf.enable_rect_partitions;
 
   (void)*tp_orig;
 
+#if CONFIG_COLLECT_PARTITION_STATS
+  int partition_decisions[EXT_PARTITION_TYPES] = { 0 };
+  int partition_attempts[EXT_PARTITION_TYPES] = { 0 };
+  int64_t partition_times[EXT_PARTITION_TYPES] = { 0 };
+  struct aom_usec_timer partition_timer = { 0 };
+  int partition_timer_on = 0;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+  PartitionStats *part_stats = &cpi->partition_stats;
+#endif
+#endif
+
   // Override partition costs at the edges of the frame in the same
   // way as in read_partition (see decodeframe.c)
   if (!(has_rows && has_cols)) {
@@ -3625,6 +3204,7 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
     }
 
     partition_cost = tmp_partition_cost;
+    do_square_split &= partition_cost[PARTITION_SPLIT] != INT_MAX;
   }
 
 #ifndef NDEBUG
@@ -3647,35 +3227,12 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
   if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
     x->mb_energy = av1_log_block_var(cpi, x, bsize);
 
-  if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
-    const int cb_partition_search_ctrl =
-        ((pc_tree->index == 0 || pc_tree->index == 3) +
-         get_chessboard_index(cm->current_frame.frame_number)) &
-        0x1;
-
-    if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
-      set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
-  }
-
-  // Determine partition types in search according to the speed features.
-  // The threshold set here has to be of square block size.
-  if (cpi->sf.auto_min_max_partition_size) {
-    const int no_partition_allowed = (bsize <= max_size && bsize >= min_size);
-    // Note: Further partitioning is NOT allowed when bsize == min_size already.
-    const int partition_allowed = (bsize <= max_size && bsize > min_size);
-    partition_none_allowed &= no_partition_allowed;
-    partition_horz_allowed &= partition_allowed || !has_rows;
-    partition_vert_allowed &= partition_allowed || !has_cols;
-    do_square_split &= bsize > min_size;
-  }
-
   if (bsize > cpi->sf.use_square_partition_only_threshold) {
     partition_horz_allowed &= !has_rows;
     partition_vert_allowed &= !has_cols;
   }
 
-  if (bsize > BLOCK_4X4 && x->use_cb_search_range &&
-      cpi->sf.auto_min_max_partition_size == 0) {
+  if (bsize > BLOCK_4X4 && x->use_cb_search_range) {
     int split_score = 0;
     int none_score = 0;
     const int score_valid = ml_prune_2pass_split_partition(
@@ -3720,8 +3277,10 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
         partition_horz_allowed == 0 && partition_vert_allowed == 0) {
       do_square_split = bsize_at_least_8x8;
       partition_none_allowed = has_rows && has_cols;
-      partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
-      partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+      partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 &&
+                               cpi->oxcf.enable_rect_partitions;
+      partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 &&
+                               cpi->oxcf.enable_rect_partitions;
     }
   }
 
@@ -3730,127 +3289,91 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
   save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-    src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row,
-                                                  mi_col, bsize);
-  }
-
-  // Decide whether we shall split directly and skip searching NONE by using
-  // the first pass block statistics
-  if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_square_split &&
-      partition_none_allowed && src_diff_var > 4 &&
-      cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
-    int mb_row = mi_row >> 1;
-    int mb_col = mi_col >> 1;
-    int mb_row_end =
-        AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
-    int mb_col_end =
-        AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
-    int r, c;
-
-    // compute a complexity measure, basically measure inconsistency of motion
-    // vectors obtained from the first pass in the current block
-    for (r = mb_row; r < mb_row_end; r++) {
-      for (c = mb_col; c < mb_col_end; c++) {
-        const int mb_index = r * cm->mb_cols + c;
-
-        MOTION_DIRECTION this_mv;
-        MOTION_DIRECTION right_mv;
-        MOTION_DIRECTION bottom_mv;
-
-        this_mv =
-            get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
-
-        // to its right
-        if (c != mb_col_end - 1) {
-          right_mv = get_motion_direction_fp(
-              cpi->twopass.this_frame_mb_stats[mb_index + 1]);
-          none_complexity += get_motion_inconsistency(this_mv, right_mv);
-        }
-
-        // to its bottom
-        if (r != mb_row_end - 1) {
-          bottom_mv = get_motion_direction_fp(
-              cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
-          none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
-        }
-
-        // do not count its left and top neighbors to avoid double counting
-      }
-    }
-
-    if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
-      partition_none_allowed = 0;
-    }
-  }
-#endif
-
-  // Ref frames picked in the [i_th] quarter subblock during square partition
-  // RD search. It may be used to prune ref frame selection of rect partitions.
-  int ref_frames_used[4] = {
-    0,
-  };
-
-  MB_MODE_INFO *split_mbmi[4] = { 0 };
-
-  // Perform a full_pixel_search and use the residue to estimate whether we
-  // should split directly.
-  // TODO(chiyotsai@google.com): Try the algorithm on hbd and speed 0.
-  // Also try pruning PARTITION_SPLIT
-  if (cpi->sf.full_pixel_motion_search_based_split && bsize >= BLOCK_8X8 &&
+  // Use simple_motion_search to prune partitions. This must be done prior to
+  // PARTITION_SPLIT to propagate the initial mvs to a smaller blocksize.
+  const int try_split_only =
+      cpi->sf.simple_motion_search_split_only && bsize >= BLOCK_8X8 &&
       do_square_split && mi_row + mi_size_high[bsize] <= cm->mi_rows &&
       mi_col + mi_size_wide[bsize] <= cm->mi_cols && !frame_is_intra_only(cm) &&
-      !cm->seq_params.enable_superres) {
-    const NN_CONFIG *nn_config = NULL;
-    float split_only_thresh = 0.0f;
-    if (bsize == BLOCK_128X128) {
-      nn_config = &full_pixel_motion_search_based_split_nn_config_128;
-      split_only_thresh = full_pixel_motion_search_based_split_thresh_128;
-    } else if (bsize == BLOCK_64X64) {
-      nn_config = &full_pixel_motion_search_based_split_nn_config_64;
-      split_only_thresh = full_pixel_motion_search_based_split_thresh_64;
-    } else if (bsize == BLOCK_32X32) {
-      nn_config = &full_pixel_motion_search_based_split_nn_config_32;
-      split_only_thresh = full_pixel_motion_search_based_split_thresh_32;
-    } else if (bsize == BLOCK_16X16) {
-      nn_config = &full_pixel_motion_search_based_split_nn_config_16;
-      split_only_thresh = full_pixel_motion_search_based_split_thresh_16;
-    } else if (bsize == BLOCK_8X8) {
-#if !CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8
-      // Disable BLOCK_8X8 for now
-      nn_config = &full_pixel_motion_search_based_split_nn_config_8;
-      split_only_thresh = full_pixel_motion_search_based_split_thresh_8;
-#endif
-    } else {
-      assert(0 && "Unexpected block size in full_pixel_motion_based_split");
-    }
-    if (nn_config) {
-      float features[6] = { 0 };
-      float score = 0;
-      get_res_var_features(cpi, x, mi_row, mi_col, bsize, features);
-      av1_nn_predict(features, nn_config, &score);
-
-      if (score > split_only_thresh) {
-        partition_none_allowed = 0;
-        partition_horz_allowed = 0;
-        partition_vert_allowed = 0;
-        do_rectangular_split = 0;
-      }
-    }
-  }
+      !av1_superres_scaled(cm);
+
+  if (try_split_only) {
+    av1_simple_motion_search_based_split(
+        cpi, x, mi_row, mi_col, bsize, &partition_none_allowed,
+        &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
+        &do_square_split);
+  }
+
+  const int try_prune_rect =
+      cpi->sf.simple_motion_search_prune_rect && !frame_is_intra_only(cm) &&
+      do_rectangular_split &&
+      (do_square_split || partition_none_allowed ||
+       (prune_horz && prune_vert)) &&
+      (partition_horz_allowed || partition_vert_allowed) && bsize >= BLOCK_8X8;
+
+  float simple_motion_features[NUM_SIMPLE_MOTION_FEATURES] = { 0.0f };
+  int simple_motion_features_are_valid = 0;
+
+  if (try_prune_rect) {
+    av1_simple_motion_search_prune_part(
+        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_none_allowed,
+        &partition_horz_allowed, &partition_vert_allowed, &do_square_split,
+        &do_rectangular_split, &prune_horz, &prune_vert, simple_motion_features,
+        &simple_motion_features_are_valid);
+  }
+
+  // Max and min square partition levels are defined as the partition nodes that
+  // the recursive function rd_pick_partition() can reach. To implement this:
+  // only PARTITION_NONE is allowed if the current node equals min_sq_part,
+  // only PARTITION_SPLIT is allowed if the current node exceeds max_sq_part.
+  assert(block_size_wide[min_sq_part] == block_size_high[min_sq_part]);
+  assert(block_size_wide[max_sq_part] == block_size_high[max_sq_part]);
+  assert(min_sq_part <= max_sq_part);
+  assert(block_size_wide[bsize] == block_size_high[bsize]);
+  const int max_partition_size = block_size_wide[max_sq_part];
+  const int min_partition_size = block_size_wide[min_sq_part];
+  const int blksize = block_size_wide[bsize];
+  assert(min_partition_size <= max_partition_size);
+  const int is_le_min_sq_part = blksize <= min_partition_size;
+  const int is_gt_max_sq_part = blksize > max_partition_size;
+  if (is_gt_max_sq_part) {
+    // If current block size is larger than max, only allow split.
+    partition_none_allowed = 0;
+    partition_horz_allowed = 0;
+    partition_vert_allowed = 0;
+    do_square_split = 1;
+  } else if (is_le_min_sq_part) {
+    // If current block size is less or equal to min, only allow none if valid
+    // block large enough; only allow split otherwise.
+    partition_horz_allowed = 0;
+    partition_vert_allowed = 0;
+    // only disable square split when current block is not at the picture
+    // boundary. otherwise, inherit the square split flag from previous logic
+    if (has_rows && has_cols) do_square_split = 0;
+    partition_none_allowed = !do_square_split;
+  }
+  do_square_split &= partition_cost[PARTITION_SPLIT] != INT_MAX;
 
 BEGIN_PARTITION_SEARCH:
   if (x->must_find_valid_partition) {
+    do_square_split =
+        bsize_at_least_8x8 && partition_cost[PARTITION_SPLIT] != INT_MAX;
     partition_none_allowed = has_rows && has_cols;
-    partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
-    partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+    partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 &&
+                             cpi->oxcf.enable_rect_partitions;
+    partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 &&
+                             cpi->oxcf.enable_rect_partitions;
+    terminate_partition_search = 0;
   }
 
   // Partition block source pixel variance.
   unsigned int pb_source_variance = UINT_MAX;
 
+  // Partition block sse after simple motion compensation, not in use now,
+  // but will be used for upcoming speed features
+  unsigned int pb_simple_motion_pred_sse = UINT_MAX;
+  (void)pb_simple_motion_pred_sse;
+
 #if CONFIG_DIST_8X8
   if (x->using_dist_8x8) {
     if (block_size_high[bsize] <= 8) partition_horz_allowed = 0;
@@ -3861,7 +3384,9 @@ BEGIN_PARTITION_SEARCH:
 #endif
 
   // PARTITION_NONE
-  if (partition_none_allowed) {
+  if (is_le_min_sq_part && has_rows && has_cols) partition_none_allowed = 1;
+  if (!terminate_partition_search && partition_none_allowed &&
+      !is_gt_max_sq_part) {
     int pt_cost = 0;
     if (bsize_at_least_8x8) {
       pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
@@ -3872,17 +3397,32 @@ BEGIN_PARTITION_SEARCH:
     const int64_t best_remain_rdcost =
         (best_rdc.rdcost == INT64_MAX) ? INT64_MAX
                                        : (best_rdc.rdcost - partition_rd_cost);
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
-                     PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_remain_rdcost >= 0) {
+      partition_attempts[PARTITION_NONE] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
+                  bsize, ctx_none, best_remain_rdcost, 0);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_NONE] += time;
+      partition_timer_on = 0;
+    }
+#endif
     pb_source_variance = x->source_variance;
+    pb_simple_motion_pred_sse = x->simple_motion_pred_sse;
     if (none_rd) *none_rd = this_rdc.rdcost;
     cur_none_rd = this_rdc.rdcost;
     if (this_rdc.rate != INT_MAX) {
       if (cpi->sf.prune_ref_frame_for_rect_partitions) {
         const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
-        for (int i = 0; i < 4; ++i) {
-          ref_frames_used[i] |= (1 << ref_type);
-        }
+        update_picked_ref_frames_mask(x, ref_type, bsize,
+                                      cm->seq_params.mib_size, mi_row, mi_col);
       }
       if (bsize_at_least_8x8) {
         this_rdc.rate += pt_cost;
@@ -3902,25 +3442,6 @@ BEGIN_PARTITION_SEARCH:
         best_rdc = this_rdc;
         if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
 
-#if CONFIG_ONE_PASS_SVM
-        // Use ML if the block size is square and >= 16X16
-        if (bsize >= BLOCK_16X16 && !frame_is_intra_only(cm) &&
-            this_rdc.rate < INT_MAX && this_rdc.rate >= 0 &&
-            !ctx_none->seg_feat) {
-          // Model Prediction
-          float score = 0.0f;
-          ml_op_svm_early_term(cpi, x, xd, ctx_none, &this_rdc,
-                               pb_source_variance, bsize, &score);
-
-          // Decide if we want to terminate early
-          if (score >= 0) {
-            do_square_split = 0;
-            do_rectangular_split = 0;
-            partition_horz_allowed = 0;
-            partition_vert_allowed = 0;
-          }
-        }
-#endif
         if ((do_square_split || do_rectangular_split) &&
             !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
           const int use_ml_based_breakout =
@@ -3946,51 +3467,17 @@ BEGIN_PARTITION_SEARCH:
           }
         }
 
-#if CONFIG_FP_MB_STATS
-        // Check if every 16x16 first pass block statistics has zero
-        // motion and the corresponding first pass residue is small enough.
-        // If that is the case, check the difference variance between the
-        // current frame and the last frame. If the variance is small enough,
-        // stop further splitting in RD optimization
-        if (cpi->use_fp_mb_stats && do_square_split &&
-            cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
-          int mb_row = mi_row >> 1;
-          int mb_col = mi_col >> 1;
-          int mb_row_end =
-              AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
-          int mb_col_end =
-              AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
-          int r, c;
-
-          int skip = 1;
-          for (r = mb_row; r < mb_row_end; r++) {
-            for (c = mb_col; c < mb_col_end; c++) {
-              const int mb_index = r * cm->mb_cols + c;
-              if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
-                    FPMB_MOTION_ZERO_MASK) ||
-                  !(cpi->twopass.this_frame_mb_stats[mb_index] &
-                    FPMB_ERROR_SMALL_MASK)) {
-                skip = 0;
-                break;
-              }
-            }
-            if (skip == 0) {
-              break;
-            }
-          }
-          if (skip) {
-            if (src_diff_var == UINT_MAX) {
-              set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-              src_diff_var = get_sby_perpixel_diff_variance(
-                  cpi, &x->plane[0].src, mi_row, mi_col, bsize);
-            }
-            if (src_diff_var < 8) {
-              do_square_split = 0;
-              do_rectangular_split = 0;
-            }
-          }
+        if (cpi->sf.simple_motion_search_early_term_none && cm->show_frame &&
+            !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 &&
+            mi_row + mi_step < cm->mi_rows && mi_col + mi_step < cm->mi_cols &&
+            this_rdc.rdcost < INT64_MAX && this_rdc.rdcost >= 0 &&
+            this_rdc.rate < INT_MAX && this_rdc.rate >= 0 &&
+            (do_square_split || do_rectangular_split)) {
+          av1_simple_motion_search_early_term_none(
+              cpi, x, pc_tree, mi_row, mi_col, bsize, &this_rdc,
+              &terminate_partition_search, simple_motion_features,
+              &simple_motion_features_are_valid);
         }
-#endif
       }
     }
 
@@ -4001,13 +3488,20 @@ BEGIN_PARTITION_SEARCH:
   if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
 
   // PARTITION_SPLIT
-  if (do_square_split) {
+  if ((!terminate_partition_search && do_square_split) || is_gt_max_sq_part) {
     av1_init_rd_stats(&sum_rdc);
     subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
     sum_rdc.rate = partition_cost[PARTITION_SPLIT];
     sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
 
     int idx;
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+      partition_attempts[PARTITION_SPLIT] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
     for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
       const int x_idx = (idx & 1) * mi_step;
       const int y_idx = (idx >> 1) * mi_step;
@@ -4022,11 +3516,9 @@ BEGIN_PARTITION_SEARCH:
       const int64_t best_remain_rdcost =
           best_rdc.rdcost == INT64_MAX ? INT64_MAX
                                        : (best_rdc.rdcost - sum_rdc.rdcost);
-      if (cpi->sf.prune_ref_frame_for_rect_partitions)
-        pc_tree->split[idx]->none.rate = INT_MAX;
       rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
-                        subsize, &this_rdc, best_remain_rdcost,
-                        pc_tree->split[idx], p_split_rd);
+                        subsize, max_sq_part, min_sq_part, &this_rdc,
+                        best_remain_rdcost, pc_tree->split[idx], p_split_rd);
 
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
@@ -4035,16 +3527,6 @@ BEGIN_PARTITION_SEARCH:
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
-        if (cpi->sf.prune_ref_frame_for_rect_partitions &&
-            pc_tree->split[idx]->none.rate != INT_MAX) {
-          const int ref_type =
-              av1_ref_frame_type(pc_tree->split[idx]->none.mic.ref_frame);
-          ref_frames_used[idx] |= (1 << ref_type);
-
-          if (cpi->sf.prune_ref_mode_for_partitions) {
-            split_mbmi[idx] = &pc_tree->split[idx]->none.mic;
-          }
-        }
         if (idx <= 1 && (bsize <= BLOCK_8X8 ||
                          pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
           const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic;
@@ -4056,6 +3538,14 @@ BEGIN_PARTITION_SEARCH:
         }
       }
     }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_SPLIT] += time;
+      partition_timer_on = 0;
+    }
+#endif
     const int reached_last_index = (idx == 4);
 
     if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
@@ -4075,108 +3565,19 @@ BEGIN_PARTITION_SEARCH:
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }  // if (do_split)
 
-  pc_tree->horizontal[0].skip_ref_frame_mask = 0;
-  pc_tree->horizontal[1].skip_ref_frame_mask = 0;
-  pc_tree->vertical[0].skip_ref_frame_mask = 0;
-  pc_tree->vertical[1].skip_ref_frame_mask = 0;
-  if (cpi->sf.prune_ref_frame_for_rect_partitions) {
-    int used_frames;
-    used_frames = ref_frames_used[0] | ref_frames_used[1];
-    if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames;
-    used_frames = ref_frames_used[2] | ref_frames_used[3];
-    if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames;
-    used_frames = ref_frames_used[0] | ref_frames_used[2];
-    if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames;
-    used_frames = ref_frames_used[1] | ref_frames_used[3];
-    if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames;
-  }
-
-  for (int i = 0; i < 2; ++i) {
-    pc_tree->horizontal[i].ref_selected[0] =
-        pc_tree->horizontal[i].ref_selected[1] = NONE_FRAME;
-    pc_tree->horizontal[i].mode_selected = -1;
-    pc_tree->vertical[i].ref_selected[0] =
-        pc_tree->vertical[i].ref_selected[1] = NONE_FRAME;
-    pc_tree->vertical[i].mode_selected = -1;
-  }
-
-  if (cpi->sf.prune_ref_mode_for_partitions) {
-    // horizontal partition
-    for (int idx = 0; idx < 4; idx += 2) {
-      const int horz_idx = idx / 2;
-      if (split_mbmi[idx] && split_mbmi[idx + 1] &&
-          split_mbmi[idx]->ref_frame[0] > INTRA_FRAME) {
-        if (!has_second_ref(split_mbmi[idx])) {
-          // Single ref
-          if (split_mbmi[idx]->ref_frame[0] ==
-                  split_mbmi[idx + 1]->ref_frame[0] &&
-              !has_second_ref(split_mbmi[idx + 1])) {
-            const int ref_type = av1_ref_frame_type(split_mbmi[idx]->ref_frame);
-            // Overwrite skip_ref_frame_mask for the current block
-            const int used_frames = (1 << ref_type);
-            pc_tree->horizontal[horz_idx].skip_ref_frame_mask = ~used_frames;
-            pc_tree->horizontal[horz_idx].ref_selected[0] =
-                split_mbmi[idx]->ref_frame[0];
-#if 0
-            // TODO(zoeliu@gmail.com): To consider the scenario of obmc
-            if (split_mbmi[idx]->motion_mode ==
-                    split_mbmi[idx + 1]->motion_mode &&
-                split_mbmi[idx]->motion_mode == SIMPLE_TRANSLATION &&
-                split_mbmi[idx]->use_wedge_interintra == 0) {
-              pc_tree->horizontal[horz_idx].mode_selected = SIMPLE_TRANSLATION;
-            }
-#endif  // 0
-          }
-        } else {
-          // TODO(zoeliu@gmail.com): To handle comp ref
-        }
-      }
-    }
-    // vertical partition
-    for (int idx = 0; idx < 2; ++idx) {
-      const int vert_idx = idx;
-      if (split_mbmi[idx] && split_mbmi[idx + 2] &&
-          split_mbmi[idx]->ref_frame[0] > INTRA_FRAME) {
-        if (!has_second_ref(split_mbmi[idx])) {
-          // Single ref
-          if (split_mbmi[idx]->ref_frame[0] ==
-                  split_mbmi[idx + 2]->ref_frame[0] &&
-              !has_second_ref(split_mbmi[idx + 2])) {
-            const int ref_type = av1_ref_frame_type(split_mbmi[idx]->ref_frame);
-            // Overwrite skip_ref_frame_mask for the current block
-            const int used_frames = (1 << ref_type);
-            pc_tree->vertical[vert_idx].skip_ref_frame_mask = ~used_frames;
-            pc_tree->vertical[vert_idx].ref_selected[0] =
-                split_mbmi[idx]->ref_frame[0];
-#if 0
-            // TODO(zoeliu@gmail.com): To consider the scenario of obmc
-            if (split_mbmi[idx]->motion_mode ==
-                    split_mbmi[idx + 2]->motion_mode &&
-                split_mbmi[idx]->motion_mode == SIMPLE_TRANSLATION &&
-                split_mbmi[idx]->use_wedge_interintra == 0) {
-              pc_tree->vertical[vert_idx].mode_selected = SIMPLE_TRANSLATION;
-            }
-#endif  // 0
-          }
-        } else {
-          // TODO(zoeliu@gmail.com): To handle comp ref
-        }
-      }
-    }
-  }
-
-  int prune_horz = 0;
-  int prune_vert = 0;
   if (cpi->sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
-      (partition_horz_allowed || partition_vert_allowed)) {
+      (partition_horz_allowed || partition_vert_allowed) &&
+      !(prune_horz || prune_vert)) {
     av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
     ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
                             split_rd, &prune_horz, &prune_vert);
   }
 
   // PARTITION_HORZ
-  if (partition_horz_allowed && !prune_horz &&
-      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz_allowed));
+  if (!terminate_partition_search && partition_horz_allowed && !prune_horz &&
+      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
+      !is_gt_max_sq_part) {
     av1_init_rd_stats(&sum_rdc);
     subsize = get_partition_subsize(bsize, PARTITION_HORZ);
     if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
@@ -4185,14 +3586,20 @@ BEGIN_PARTITION_SEARCH:
       pc_tree->horizontal[0].pred_interp_filter =
           av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
     }
+    sum_rdc.rate = partition_cost[PARTITION_HORZ];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
     const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
                                            ? INT64_MAX
                                            : (best_rdc.rdcost - sum_rdc.rdcost);
-    sum_rdc.rate = partition_cost[PARTITION_HORZ];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
-                     PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
-                     best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_remain_rdcost >= 0) {
+      partition_attempts[PARTITION_HORZ] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_HORZ,
+                  subsize, &pc_tree->horizontal[0], best_remain_rdcost, 0);
 
     if (this_rdc.rate == INT_MAX) {
       sum_rdc.rdcost = INT64_MAX;
@@ -4222,9 +3629,9 @@ BEGIN_PARTITION_SEARCH:
         pc_tree->horizontal[1].pred_interp_filter =
             av1_extract_interp_filter(ctx_h->mic.interp_filters, 0);
       }
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
-                       PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                       best_rdc.rdcost - sum_rdc.rdcost);
+      pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+                    PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+                    best_rdc.rdcost - sum_rdc.rdcost, 0);
       horz_rd[1] = this_rdc.rdcost;
 
       if (this_rdc.rate == INT_MAX) {
@@ -4235,6 +3642,14 @@ BEGIN_PARTITION_SEARCH:
         sum_rdc.rdcost += this_rdc.rdcost;
       }
     }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_HORZ] += time;
+      partition_timer_on = 0;
+    }
+#endif
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -4248,8 +3663,10 @@ BEGIN_PARTITION_SEARCH:
   }
 
   // PARTITION_VERT
-  if (partition_vert_allowed && !prune_vert &&
-      (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) {
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert_allowed));
+  if (!terminate_partition_search && partition_vert_allowed && !prune_vert &&
+      (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step)) &&
+      !is_gt_max_sq_part) {
     av1_init_rd_stats(&sum_rdc);
     subsize = get_partition_subsize(bsize, PARTITION_VERT);
 
@@ -4265,9 +3682,15 @@ BEGIN_PARTITION_SEARCH:
     const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
                                            ? INT64_MAX
                                            : (best_rdc.rdcost - sum_rdc.rdcost);
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
-                     PARTITION_VERT, subsize, &pc_tree->vertical[0],
-                     best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_remain_rdcost >= 0) {
+      partition_attempts[PARTITION_VERT] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_VERT,
+                  subsize, &pc_tree->vertical[0], best_remain_rdcost, 0);
 
     if (this_rdc.rate == INT_MAX) {
       sum_rdc.rdcost = INT64_MAX;
@@ -4296,9 +3719,9 @@ BEGIN_PARTITION_SEARCH:
         pc_tree->vertical[1].pred_interp_filter =
             av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
       }
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
-                       PARTITION_VERT, subsize, &pc_tree->vertical[1],
-                       best_rdc.rdcost - sum_rdc.rdcost);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[1],
+                    best_rdc.rdcost - sum_rdc.rdcost, 0);
       vert_rd[1] = this_rdc.rdcost;
 
       if (this_rdc.rate == INT_MAX) {
@@ -4309,6 +3732,14 @@ BEGIN_PARTITION_SEARCH:
         sum_rdc.rdcost += this_rdc.rdcost;
       }
     }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_VERT] += time;
+      partition_timer_on = 0;
+    }
+#endif
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -4323,7 +3754,7 @@ BEGIN_PARTITION_SEARCH:
 
   if (pb_source_variance == UINT_MAX) {
     av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
       pb_source_variance = av1_high_get_sby_perpixel_variance(
           cpi, &x->plane[0].src, bsize, xd->bd);
     } else {
@@ -4332,13 +3763,26 @@ BEGIN_PARTITION_SEARCH:
     }
   }
 
+  if (use_pb_simple_motion_pred_sse(cpi) &&
+      pb_simple_motion_pred_sse == UINT_MAX) {
+    const MV ref_mv_full = { .row = 0, .col = 0 };
+    unsigned int var = 0;
+
+    av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full, 0,
+                              &pb_simple_motion_pred_sse, &var);
+  }
+
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !do_rectangular_split));
+
   const int ext_partition_allowed =
       do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
 
   // The standard AB partitions are allowed whenever ext-partition-types are
   // allowed
-  int horzab_partition_allowed = ext_partition_allowed;
-  int vertab_partition_allowed = ext_partition_allowed;
+  int horzab_partition_allowed =
+      ext_partition_allowed & cpi->oxcf.enable_ab_partitions;
+  int vertab_partition_allowed =
+      ext_partition_allowed & cpi->oxcf.enable_ab_partitions;
 
 #if CONFIG_DIST_8X8
   if (x->using_dist_8x8) {
@@ -4414,9 +3858,9 @@ BEGIN_PARTITION_SEARCH:
 
   if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed &&
       partition_horz_allowed && partition_vert_allowed) {
-    // TODO(huisu@google.com): x->source_variance may not be the current block's
-    // variance. The correct one to use is pb_source_variance.
-    // Need to re-train the model to fix it.
+    // TODO(huisu@google.com): x->source_variance may not be the current
+    // block's variance. The correct one to use is pb_source_variance. Need to
+    // re-train the model to fix it.
     ml_prune_ab_partition(bsize, pc_tree->partitioning,
                           get_unsigned_bits(x->source_variance),
                           best_rdc.rdcost, horz_rd, vert_rd, split_rd,
@@ -4424,8 +3868,14 @@ BEGIN_PARTITION_SEARCH:
                           &verta_partition_allowed, &vertb_partition_allowed);
   }
 
+  horza_partition_allowed &= cpi->oxcf.enable_ab_partitions;
+  horzb_partition_allowed &= cpi->oxcf.enable_ab_partitions;
+  verta_partition_allowed &= cpi->oxcf.enable_ab_partitions;
+  vertb_partition_allowed &= cpi->oxcf.enable_ab_partitions;
+
   // PARTITION_HORZ_A
-  if (partition_horz_allowed && horza_partition_allowed) {
+  if (!terminate_partition_search && partition_horz_allowed &&
+      horza_partition_allowed && !is_gt_max_sq_part) {
     subsize = get_partition_subsize(bsize, PARTITION_HORZ_A);
     pc_tree->horizontala[0].rd_mode_is_ready = 0;
     pc_tree->horizontala[1].rd_mode_is_ready = 0;
@@ -4441,56 +3891,37 @@ BEGIN_PARTITION_SEARCH:
         pc_tree->horizontala[1].rd_mode_is_ready = 1;
       }
     }
-    for (int i = 0; i < 3; ++i) {
-      pc_tree->horizontala[i].skip_ref_frame_mask = 0;
-      pc_tree->horizontala[i].ref_selected[0] =
-          pc_tree->horizontala[i].ref_selected[1] = NONE_FRAME;
-    }
-    if (cpi->sf.prune_ref_frame_for_rect_partitions) {
-      int used_frames;
-      used_frames = ref_frames_used[0];
-      if (used_frames)
-        pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames;
-      used_frames = ref_frames_used[1];
-      if (used_frames)
-        pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames;
-      used_frames = ref_frames_used[2] | ref_frames_used[3];
-      if (used_frames)
-        pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames;
-    }
-    if (cpi->sf.prune_ref_mode_for_partitions) {
-      // Overwrite skip_ref_frame_mask for the current block
-      if (split_mbmi[0] && split_mbmi[0]->ref_frame[0] > INTRA_FRAME &&
-          !has_second_ref(split_mbmi[0])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0];
-        pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames;
-        pc_tree->horizontala[0].ref_selected[0] = split_mbmi[0]->ref_frame[0];
-      }
-      if (split_mbmi[1] && split_mbmi[1]->ref_frame[0] > INTRA_FRAME &&
-          !has_second_ref(split_mbmi[1])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[1]->ref_frame[0];
-        pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames;
-        pc_tree->horizontala[1].ref_selected[0] = split_mbmi[1]->ref_frame[0];
-      }
-      if (split_mbmi[2] && split_mbmi[3] &&
-          split_mbmi[2]->ref_frame[0] > INTRA_FRAME &&
-          split_mbmi[2]->ref_frame[0] == split_mbmi[3]->ref_frame[0] &&
-          !has_second_ref(split_mbmi[2]) &&
-          !has_second_ref(split_mbmi[3])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[2]->ref_frame[0];
-        pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames;
-        pc_tree->horizontala[2].ref_selected[0] = split_mbmi[2]->ref_frame[0];
+#if CONFIG_COLLECT_PARTITION_STATS
+    {
+      RD_STATS tmp_sum_rdc;
+      av1_init_rd_stats(&tmp_sum_rdc);
+      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_A];
+      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+      if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+        partition_attempts[PARTITION_HORZ_A] += 1;
+        aom_usec_timer_start(&partition_timer);
+        partition_timer_on = 1;
       }
     }
+#endif
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
                        PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row,
                        mi_col + mi_step, bsize2, mi_row + mi_step, mi_col,
                        subsize);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_HORZ_A] += time;
+      partition_timer_on = 0;
+    }
+#endif
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
   // PARTITION_HORZ_B
-  if (partition_horz_allowed && horzb_partition_allowed) {
+  if (!terminate_partition_search && partition_horz_allowed &&
+      horzb_partition_allowed && !is_gt_max_sq_part) {
     subsize = get_partition_subsize(bsize, PARTITION_HORZ_B);
     pc_tree->horizontalb[0].rd_mode_is_ready = 0;
     pc_tree->horizontalb[1].rd_mode_is_ready = 0;
@@ -4500,57 +3931,39 @@ BEGIN_PARTITION_SEARCH:
       pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B;
       pc_tree->horizontalb[0].rd_mode_is_ready = 1;
     }
-    for (int i = 0; i < 3; ++i) {
-      pc_tree->horizontalb[i].skip_ref_frame_mask = 0;
-      pc_tree->horizontalb[i].ref_selected[0] =
-          pc_tree->horizontalb[i].ref_selected[1] = NONE_FRAME;
-    }
-    if (cpi->sf.prune_ref_frame_for_rect_partitions) {
-      int used_frames;
-      used_frames = ref_frames_used[0] | ref_frames_used[1];
-      if (used_frames)
-        pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames;
-      used_frames = ref_frames_used[2];
-      if (used_frames)
-        pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames;
-      used_frames = ref_frames_used[3];
-      if (used_frames)
-        pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames;
-    }
-    if (cpi->sf.prune_ref_mode_for_partitions) {
-      // Overwrite skip_ref_frame_mask for the current block
-      if (split_mbmi[0] && split_mbmi[1] &&
-          split_mbmi[0]->ref_frame[0] > INTRA_FRAME &&
-          split_mbmi[0]->ref_frame[0] == split_mbmi[1]->ref_frame[0] &&
-          !has_second_ref(split_mbmi[0]) &&
-          !has_second_ref(split_mbmi[1])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0];
-        pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames;
-        pc_tree->horizontalb[0].ref_selected[0] = split_mbmi[0]->ref_frame[0];
-      }
-      if (split_mbmi[2] && split_mbmi[2]->ref_frame[0] > INTRA_FRAME &&
-          !has_second_ref(split_mbmi[2])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[2]->ref_frame[0];
-        pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames;
-        pc_tree->horizontalb[1].ref_selected[0] = split_mbmi[2]->ref_frame[0];
-      }
-      if (split_mbmi[3] && split_mbmi[3]->ref_frame[0] > INTRA_FRAME &&
-          !has_second_ref(split_mbmi[3])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[3]->ref_frame[0];
-        pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames;
-        pc_tree->horizontalb[2].ref_selected[0] = split_mbmi[3]->ref_frame[0];
+#if CONFIG_COLLECT_PARTITION_STATS
+    {
+      RD_STATS tmp_sum_rdc;
+      av1_init_rd_stats(&tmp_sum_rdc);
+      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_B];
+      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+      if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+        partition_attempts[PARTITION_HORZ_B] += 1;
+        aom_usec_timer_start(&partition_timer);
+        partition_timer_on = 1;
       }
     }
+#endif
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
                        PARTITION_HORZ_B, mi_row, mi_col, subsize,
                        mi_row + mi_step, mi_col, bsize2, mi_row + mi_step,
                        mi_col + mi_step, bsize2);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_HORZ_B] += time;
+      partition_timer_on = 0;
+    }
+#endif
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
   // PARTITION_VERT_A
-  if (partition_vert_allowed && verta_partition_allowed) {
+  if (!terminate_partition_search && partition_vert_allowed &&
+      verta_partition_allowed && !is_gt_max_sq_part) {
     subsize = get_partition_subsize(bsize, PARTITION_VERT_A);
     pc_tree->verticala[0].rd_mode_is_ready = 0;
     pc_tree->verticala[1].rd_mode_is_ready = 0;
@@ -4560,53 +3973,37 @@ BEGIN_PARTITION_SEARCH:
       pc_tree->verticala[0].mic.partition = PARTITION_VERT_A;
       pc_tree->verticala[0].rd_mode_is_ready = 1;
     }
-    for (int i = 0; i < 3; ++i) {
-      pc_tree->verticala[i].skip_ref_frame_mask = 0;
-      pc_tree->verticala[i].ref_selected[0] =
-          pc_tree->verticala[i].ref_selected[1] = NONE_FRAME;
-    }
-    if (cpi->sf.prune_ref_frame_for_rect_partitions) {
-      int used_frames;
-      used_frames = ref_frames_used[0];
-      if (used_frames) pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames;
-      used_frames = ref_frames_used[2];
-      if (used_frames) pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames;
-      used_frames = ref_frames_used[1] | ref_frames_used[3];
-      if (used_frames) pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames;
-    }
-    if (cpi->sf.prune_ref_mode_for_partitions) {
-      // Overwrite skip_ref_frame_mask for the current block
-      if (split_mbmi[0] && split_mbmi[0]->ref_frame[0] > INTRA_FRAME &&
-          !has_second_ref(split_mbmi[0])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0];
-        pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames;
-        pc_tree->verticala[0].ref_selected[0] = split_mbmi[0]->ref_frame[0];
-      }
-      if (split_mbmi[2] && split_mbmi[2]->ref_frame[0] > INTRA_FRAME &&
-          !has_second_ref(split_mbmi[2])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[2]->ref_frame[0];
-        pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames;
-        pc_tree->verticala[1].ref_selected[0] = split_mbmi[2]->ref_frame[0];
-      }
-      if (split_mbmi[1] && split_mbmi[3] &&
-          split_mbmi[1]->ref_frame[0] > INTRA_FRAME &&
-          split_mbmi[1]->ref_frame[0] == split_mbmi[3]->ref_frame[0] &&
-          !has_second_ref(split_mbmi[1]) &&
-          !has_second_ref(split_mbmi[3])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[1]->ref_frame[0];
-        pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames;
-        pc_tree->verticala[2].ref_selected[0] = split_mbmi[1]->ref_frame[0];
+#if CONFIG_COLLECT_PARTITION_STATS
+    {
+      RD_STATS tmp_sum_rdc;
+      av1_init_rd_stats(&tmp_sum_rdc);
+      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_A];
+      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+      if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+        partition_attempts[PARTITION_VERT_A] += 1;
+        aom_usec_timer_start(&partition_timer);
+        partition_timer_on = 1;
       }
     }
+#endif
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
                        PARTITION_VERT_A, mi_row, mi_col, bsize2,
                        mi_row + mi_step, mi_col, bsize2, mi_row,
                        mi_col + mi_step, subsize);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_VERT_A] += time;
+      partition_timer_on = 0;
+    }
+#endif
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
   // PARTITION_VERT_B
-  if (partition_vert_allowed && vertb_partition_allowed) {
+  if (!terminate_partition_search && partition_vert_allowed &&
+      vertb_partition_allowed && !is_gt_max_sq_part) {
     subsize = get_partition_subsize(bsize, PARTITION_VERT_B);
     pc_tree->verticalb[0].rd_mode_is_ready = 0;
     pc_tree->verticalb[1].rd_mode_is_ready = 0;
@@ -4616,58 +4013,44 @@ BEGIN_PARTITION_SEARCH:
       pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B;
       pc_tree->verticalb[0].rd_mode_is_ready = 1;
     }
-    for (int i = 0; i < 3; ++i) {
-      pc_tree->verticalb[i].skip_ref_frame_mask = 0;
-      pc_tree->verticalb[i].ref_selected[0] =
-          pc_tree->verticalb[i].ref_selected[1] = NONE_FRAME;
-    }
-    if (cpi->sf.prune_ref_frame_for_rect_partitions) {
-      int used_frames;
-      used_frames = ref_frames_used[0] | ref_frames_used[2];
-      if (used_frames) pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames;
-      used_frames = ref_frames_used[1];
-      if (used_frames) pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames;
-      used_frames = ref_frames_used[3];
-      if (used_frames) pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames;
-    }
-    if (cpi->sf.prune_ref_mode_for_partitions) {
-      // Overwrite skip_ref_frame_mask for the current block
-      if (split_mbmi[0] && split_mbmi[2] &&
-          split_mbmi[0]->ref_frame[0] > INTRA_FRAME &&
-          split_mbmi[0]->ref_frame[0] == split_mbmi[2]->ref_frame[0] &&
-          !has_second_ref(split_mbmi[0]) &&
-          !has_second_ref(split_mbmi[2])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0];
-        pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames;
-        pc_tree->verticalb[0].ref_selected[0] = split_mbmi[0]->ref_frame[0];
-      }
-      if (split_mbmi[1] && split_mbmi[1]->ref_frame[0] > INTRA_FRAME &&
-          !has_second_ref(split_mbmi[1])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[1]->ref_frame[0];
-        pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames;
-        pc_tree->verticalb[1].ref_selected[0] = split_mbmi[1]->ref_frame[0];
-      }
-      if (split_mbmi[3] && split_mbmi[3]->ref_frame[0] > INTRA_FRAME &&
-          !has_second_ref(split_mbmi[3])) {  // single ref
-        const int used_frames = 1 << (int)split_mbmi[3]->ref_frame[0];
-        pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames;
-        pc_tree->verticalb[2].ref_selected[0] = split_mbmi[3]->ref_frame[0];
+#if CONFIG_COLLECT_PARTITION_STATS
+    {
+      RD_STATS tmp_sum_rdc;
+      av1_init_rd_stats(&tmp_sum_rdc);
+      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_B];
+      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+      if (!frame_is_intra_only(cm) &&
+          best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+        partition_attempts[PARTITION_VERT_B] += 1;
+        aom_usec_timer_start(&partition_timer);
+        partition_timer_on = 1;
       }
     }
+#endif
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
                        PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row,
                        mi_col + mi_step, bsize2, mi_row + mi_step,
                        mi_col + mi_step, bsize2);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_VERT_B] += time;
+      partition_timer_on = 0;
+    }
+#endif
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
   // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
   // PARTITION_VERT_4 for this block. This is almost the same as
-  // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks,
-  // so we require that bsize is not BLOCK_128X128.
-  const int partition4_allowed =
-      ext_partition_allowed && bsize != BLOCK_128X128;
+  // ext_partition_allowed, except that we don't allow 128x32 or 32x128
+  // blocks, so we require that bsize is not BLOCK_128X128.
+  const int partition4_allowed = cpi->oxcf.enable_1to4_partitions &&
+                                 ext_partition_allowed &&
+                                 bsize != BLOCK_128X128;
+
   int partition_horz4_allowed = partition4_allowed && partition_horz_allowed;
   int partition_vert4_allowed = partition4_allowed && partition_vert_allowed;
   if (cpi->sf.prune_ext_partition_types_search_level == 2) {
@@ -4699,9 +4082,16 @@ BEGIN_PARTITION_SEARCH:
   }
 #endif
 
+  if (blksize < (min_partition_size << 2)) {
+    partition_horz4_allowed = 0;
+    partition_vert4_allowed = 0;
+  }
+
   // PARTITION_HORZ_4
-  if (partition_horz4_allowed && has_rows &&
-      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz4_allowed));
+  if (!terminate_partition_search && partition_horz4_allowed && has_rows &&
+      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
+      !is_gt_max_sq_part) {
     av1_init_rd_stats(&sum_rdc);
     const int quarter_step = mi_size_high[bsize] / 4;
     PICK_MODE_CONTEXT *ctx_prev = ctx_none;
@@ -4710,6 +4100,13 @@ BEGIN_PARTITION_SEARCH:
     sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
     sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
 
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+      partition_attempts[PARTITION_HORZ_4] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
     for (int i = 0; i < 4; ++i) {
       const int this_mi_row = mi_row + i * quarter_step;
 
@@ -4718,13 +4115,6 @@ BEGIN_PARTITION_SEARCH:
       PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
 
       ctx_this->rd_mode_is_ready = 0;
-      ctx_this->skip_ref_frame_mask = 0;
-      if (cpi->sf.prune_ref_frame_for_rect_partitions) {
-        const int used_frames = i <= 1
-                                    ? (ref_frames_used[0] | ref_frames_used[1])
-                                    : (ref_frames_used[2] | ref_frames_used[3]);
-        if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
-      }
       if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
                            mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
                            PARTITION_HORZ_4, ctx_prev, ctx_this))
@@ -4740,12 +4130,23 @@ BEGIN_PARTITION_SEARCH:
         pc_tree->partitioning = PARTITION_HORZ_4;
       }
     }
+
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_HORZ_4] += time;
+      partition_timer_on = 0;
+    }
+#endif
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
   // PARTITION_VERT_4
-  if (partition_vert4_allowed && has_cols &&
-      (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step))) {
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert4_allowed));
+  if (!terminate_partition_search && partition_vert4_allowed && has_cols &&
+      (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step)) &&
+      !is_gt_max_sq_part) {
     av1_init_rd_stats(&sum_rdc);
     const int quarter_step = mi_size_wide[bsize] / 4;
     PICK_MODE_CONTEXT *ctx_prev = ctx_none;
@@ -4754,6 +4155,13 @@ BEGIN_PARTITION_SEARCH:
     sum_rdc.rate = partition_cost[PARTITION_VERT_4];
     sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
 
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+      partition_attempts[PARTITION_VERT_4] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
     for (int i = 0; i < 4; ++i) {
       const int this_mi_col = mi_col + i * quarter_step;
 
@@ -4762,13 +4170,6 @@ BEGIN_PARTITION_SEARCH:
       PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
 
       ctx_this->rd_mode_is_ready = 0;
-      ctx_this->skip_ref_frame_mask = 0;
-      if (cpi->sf.prune_ref_frame_for_rect_partitions) {
-        const int used_frames = i <= 1
-                                    ? (ref_frames_used[0] | ref_frames_used[2])
-                                    : (ref_frames_used[1] | ref_frames_used[3]);
-        if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
-      }
       if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
                            this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
                            PARTITION_VERT_4, ctx_prev, ctx_this))
@@ -4784,6 +4185,14 @@ BEGIN_PARTITION_SEARCH:
         pc_tree->partitioning = PARTITION_VERT_4;
       }
     }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_VERT_4] += time;
+      partition_timer_on = 0;
+    }
+#endif
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
@@ -4791,6 +4200,9 @@ BEGIN_PARTITION_SEARCH:
     // Did not find a valid partition, go back and search again, with less
     // constraint on which partition types to search.
     x->must_find_valid_partition = 1;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+    part_stats->partition_redo += 1;
+#endif
     goto BEGIN_PARTITION_SEARCH;
   }
 
@@ -4801,6 +4213,44 @@ BEGIN_PARTITION_SEARCH:
   (void)best_rd;
   *rd_cost = best_rdc;
 
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+    partition_decisions[pc_tree->partitioning] += 1;
+  }
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 1
+  // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
+  // prediction block
+  FILE *f = fopen("data.csv", "a");
+  fprintf(f, "%d,%d,%d,", bsize, cm->show_frame, frame_is_intra_only(cm));
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", partition_decisions[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", partition_attempts[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%ld,", partition_times[idx]);
+  }
+  fprintf(f, "\n");
+  fclose(f);
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+  // If CONFIG_COLLECTION_PARTITION_STATS is 2, then we print out the stats for
+  // the whole clip. So we need to pass the information upstream to the encoder
+  const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
+  int *agg_attempts = part_stats->partition_attempts[bsize_idx];
+  int *agg_decisions = part_stats->partition_decisions[bsize_idx];
+  int64_t *agg_times = part_stats->partition_times[bsize_idx];
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    agg_attempts[idx] += partition_attempts[idx];
+    agg_decisions[idx] += partition_decisions[idx];
+    agg_times[idx] += partition_times[idx];
+  }
+#endif
+
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       pc_tree->index != 3) {
     if (bsize == cm->seq_params.sb_size) {
@@ -4820,19 +4270,23 @@ BEGIN_PARTITION_SEARCH:
     assert(tp_orig == *tp);
   }
 }
+#undef NUM_SIMPLE_MOTION_FEATURES
 
 // Set all the counters as max.
 static void init_first_partition_pass_stats_tables(
-    FIRST_PARTITION_PASS_STATS *stats) {
+    AV1_COMP *cpi, FIRST_PARTITION_PASS_STATS *stats) {
   for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
     memset(stats[i].ref0_counts, 0xff, sizeof(stats[i].ref0_counts));
     memset(stats[i].ref1_counts, 0xff, sizeof(stats[i].ref1_counts));
     stats[i].sample_counts = INT_MAX;
+    if (cpi->sf.use_first_partition_pass_interintra_stats)
+      memset(stats[i].interintra_motion_mode_count, 0xff,
+             sizeof(stats[i].interintra_motion_mode_count));
   }
 }
 
-// Minimum number of samples to trigger the
-// mode_pruning_based_on_two_pass_partition_search feature.
+// Minimum number of samples to trigger the mode pruning in
+// two_pass_partition_search feature.
 #define FIRST_PARTITION_PASS_MIN_SAMPLES 16
 
 static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
@@ -4847,7 +4301,6 @@ static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
   int row, col;
 
   int dr = 0;
-  int count = 0;
   double r0, rk, beta;
 
   if (tpl_frame->is_valid == 0) return orig_rdmult;
@@ -4864,8 +4317,6 @@ static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
 
       intra_cost += this_stats->intra_cost;
       mc_dep_cost += this_stats->mc_dep_cost;
-
-      ++count;
     }
   }
 
@@ -4955,8 +4406,7 @@ static void first_partition_search_pass(AV1_COMP *cpi, ThreadData *td,
 
   const SPEED_FEATURES *const sf = &cpi->sf;
   // Reset the stats tables.
-  if (sf->mode_pruning_based_on_two_pass_partition_search)
-    av1_zero(x->first_partition_pass_stats);
+  av1_zero(x->first_partition_pass_stats);
 
   AV1_COMMON *const cm = &cpi->common;
   const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
@@ -4968,6 +4418,7 @@ static void first_partition_search_pass(AV1_COMP *cpi, ThreadData *td,
   x->cb_partition_scan = 0;
 
   x->source_variance = UINT_MAX;
+  x->simple_motion_pred_sse = UINT_MAX;
   if (sf->adaptive_pred_interp_filter) {
     const int leaf_nodes = 256;
     for (int i = 0; i < leaf_nodes; ++i) {
@@ -4996,29 +4447,208 @@ static void first_partition_search_pass(AV1_COMP *cpi, ThreadData *td,
 
   x->use_cb_search_range = 1;
 
-  if (sf->mode_pruning_based_on_two_pass_partition_search) {
-    for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
-      FIRST_PARTITION_PASS_STATS *const stat =
-          &x->first_partition_pass_stats[i];
-      if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
-        // If there are not enough samples collected, make all available.
-        memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts));
-        memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts));
-      } else if (sf->selective_ref_frame < 3) {
-        // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the
-        // initial partition scan, so we don't eliminate them.
-        stat->ref0_counts[ALTREF2_FRAME] = 0xff;
-        stat->ref1_counts[ALTREF2_FRAME] = 0xff;
-        stat->ref0_counts[BWDREF_FRAME] = 0xff;
-        stat->ref1_counts[BWDREF_FRAME] = 0xff;
+  for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
+    FIRST_PARTITION_PASS_STATS *const stat = &x->first_partition_pass_stats[i];
+    if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
+      // If there are not enough samples collected, make all available.
+      memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts));
+      memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts));
+      if (cpi->sf.use_first_partition_pass_interintra_stats)
+        memset(stat->interintra_motion_mode_count, 0xff,
+               sizeof(stat->interintra_motion_mode_count));
+    } else if (sf->selective_ref_frame < 3) {
+      // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the
+      // initial partition scan, so we don't eliminate them.
+      stat->ref0_counts[ALTREF2_FRAME] = 0xff;
+      stat->ref1_counts[ALTREF2_FRAME] = 0xff;
+      stat->ref0_counts[BWDREF_FRAME] = 0xff;
+      stat->ref1_counts[BWDREF_FRAME] = 0xff;
+      if (cpi->sf.use_first_partition_pass_interintra_stats) {
+        stat->interintra_motion_mode_count[ALTREF2_FRAME] = 0xff;
+        stat->interintra_motion_mode_count[BWDREF_FRAME] = 0xff;
       }
     }
   }
 }
 
-static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
-                             TileDataEnc *tile_data, int mi_row,
-                             TOKENEXTRA **tp) {
+#define AVG_CDF_WEIGHT_LEFT 3
+#define AVG_CDF_WEIGHT_TOP_RIGHT 1
+
+static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr,
+                           int num_cdfs, int cdf_stride, int nsymbs,
+                           int wt_left, int wt_tr) {
+  for (int i = 0; i < num_cdfs; i++) {
+    for (int j = 0; j <= nsymbs; j++) {
+      cdf_ptr_left[i * cdf_stride + j] =
+          (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left +
+                          (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr +
+                          ((wt_left + wt_tr) / 2)) /
+                         (wt_left + wt_tr));
+      assert(cdf_ptr_left[i * cdf_stride + j] >= 0 &&
+             cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP);
+    }
+  }
+}
+
+#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \
+  AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs))
+
+#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride)           \
+  do {                                                                     \
+    aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left;               \
+    aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr;                   \
+    int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob);       \
+    int num_cdfs = array_size / cdf_stride;                                \
+    avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \
+                   wt_left, wt_tr);                                        \
+  } while (0)
+
+static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left,
+                    int wt_tr) {
+  AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4);
+  for (int i = 0; i < 2; i++) {
+    AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf,
+                MV_CLASSES);
+    AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf,
+                nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE);
+    AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE);
+    AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2);
+    AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf,
+                nmv_tr->comps[i].class0_hp_cdf, 2);
+    AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2);
+    AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf,
+                CLASS0_SIZE);
+    AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2);
+  }
+}
+
+// In case of row-based multi-threading of encoder, since we always
+// keep a top - right sync, we can average the top - right SB's CDFs and
+// the left SB's CDFs and use the same for current SB's encoding to
+// improve the performance. This function facilitates the averaging
+// of CDF and used only when row-mt is enabled in encoder.
+static void avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+                            int wt_left, int wt_tr) {
+  AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2);
+  AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2);
+  AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11);
+  AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3);
+  AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4);
+  AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE);
+  AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2);
+  AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2);
+  AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2);
+  AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2);
+  AVERAGE_CDF(ctx_left->inter_compound_mode_cdf,
+              ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
+  AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf,
+              MASKED_COMPOUND_TYPES);
+  AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16);
+  AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2);
+  AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2);
+  AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf,
+              INTERINTRA_MODES);
+  AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES);
+  AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2);
+  AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf,
+              PALETTE_SIZES);
+  AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf,
+              PALETTE_SIZES);
+  for (int j = 0; j < PALETTE_SIZES; j++) {
+    int nsymbs = j + PALETTE_MIN_SIZE;
+    AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j],
+                   ctx_tr->palette_y_color_index_cdf[j], nsymbs,
+                   CDF_SIZE(PALETTE_COLORS));
+    AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j],
+                   ctx_tr->palette_uv_color_index_cdf[j], nsymbs,
+                   CDF_SIZE(PALETTE_COLORS));
+  }
+  AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2);
+  AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2);
+  AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2);
+  AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2);
+  AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2);
+  AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2);
+  AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2);
+  AVERAGE_CDF(ctx_left->skip_cdfs, ctx_tr->skip_cdfs, 2);
+  AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2);
+  avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr);
+  avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr);
+  AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2);
+  AVERAGE_CDF(ctx_left->seg.tree_cdf, ctx_tr->seg.tree_cdf, MAX_SEGMENTS);
+  AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2);
+  AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf,
+              ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
+  AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2);
+  AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf,
+              FILTER_INTRA_MODES);
+  AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf,
+              RESTORE_SWITCHABLE_TYPES);
+  AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2);
+  AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2);
+  AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES);
+  AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0],
+                 UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES));
+  AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES);
+  for (int i = 0; i < PARTITION_CONTEXTS; i++) {
+    if (i < 4) {
+      AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4,
+                     CDF_SIZE(10));
+    } else if (i < 16) {
+      AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10);
+    } else {
+      AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8,
+                     CDF_SIZE(10));
+    }
+  }
+  AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf,
+              SWITCHABLE_FILTERS);
+  AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES);
+  AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf,
+              2 * MAX_ANGLE_DELTA + 1);
+  AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH,
+                 CDF_SIZE(MAX_TX_DEPTH + 1));
+  AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1],
+              MAX_TX_DEPTH + 1);
+  AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2],
+              MAX_TX_DEPTH + 1);
+  AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3],
+              MAX_TX_DEPTH + 1);
+  AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1);
+  AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1);
+  for (int i = 0; i < FRAME_LF_COUNT; i++) {
+    AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i],
+                DELTA_LF_PROBS + 1);
+  }
+  AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2,
+                 CDF_SIZE(TX_TYPES));
+  AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS);
+  AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf,
+              CFL_ALPHABET_SIZE);
+}
+
+static void encode_sb_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+                          int mi_row, TOKENEXTRA **tp, int use_nonrd_mode) {
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   const TileInfo *const tile_info = &tile_data->tile_info;
@@ -5032,6 +4662,10 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
   const int mib_size_log2 = cm->seq_params.mib_size_log2;
   const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, encode_sb_time);
+#endif
+
   // Initialize the left context for the new SB row
   av1_zero_left_context(xd);
 
@@ -5049,13 +4683,48 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
        mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) {
     (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row,
                                    sb_col_in_tile);
-    if ((cpi->row_mt == 1) && (tile_info->mi_col_start == mi_col) &&
+    if (tile_data->allow_update_cdf && (cpi->row_mt == 1) &&
         (tile_info->mi_row_start != mi_row)) {
-      // restore frame context of 1st column sb
-      memcpy(xd->tile_ctx, x->backup_tile_ctx, sizeof(*xd->tile_ctx));
+      if ((tile_info->mi_col_start == mi_col)) {
+        // restore frame context of 1st column sb
+        memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx));
+      } else {
+        int wt_left = AVG_CDF_WEIGHT_LEFT;
+        int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT;
+        if (tile_info->mi_col_end > (mi_col + mib_size))
+          avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile, wt_left,
+                          wt_tr);
+        else
+          avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1,
+                          wt_left, wt_tr);
+      }
+    }
+
+    switch (cpi->oxcf.coeff_cost_upd_freq) {
+      case COST_UPD_TILE:  // Tile level
+        if (mi_row != tile_info->mi_row_start) break;
+        AOM_FALLTHROUGH_INTENDED;
+      case COST_UPD_SBROW:  // SB row level in tile
+        if (mi_col != tile_info->mi_col_start) break;
+        AOM_FALLTHROUGH_INTENDED;
+      case COST_UPD_SB:  // SB level
+        av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
+        break;
+      default: assert(0);
+    }
+
+    switch (cpi->oxcf.mode_cost_upd_freq) {
+      case COST_UPD_TILE:  // Tile level
+        if (mi_row != tile_info->mi_row_start) break;
+        AOM_FALLTHROUGH_INTENDED;
+      case COST_UPD_SBROW:  // SB row level in tile
+        if (mi_col != tile_info->mi_col_start) break;
+        AOM_FALLTHROUGH_INTENDED;
+      case COST_UPD_SB:  // SB level
+        av1_fill_mode_rates(cm, x, xd->tile_ctx);
+        break;
+      default: assert(0);
     }
-    av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
-    av1_fill_mode_rates(cm, x, xd->tile_ctx);
 
     if (sf->adaptive_pred_interp_filter) {
       for (int i = 0; i < leaf_nodes; ++i) {
@@ -5068,16 +4737,27 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
 
     x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
 
-    av1_zero(x->txb_rd_record_8X8);
-    av1_zero(x->txb_rd_record_16X16);
-    av1_zero(x->txb_rd_record_32X32);
-    av1_zero(x->txb_rd_record_64X64);
-    av1_zero(x->txb_rd_record_intra);
+    if (!use_nonrd_mode) {
+      av1_zero(x->txb_rd_record_8X8);
+      av1_zero(x->txb_rd_record_16X16);
+      av1_zero(x->txb_rd_record_32X32);
+      av1_zero(x->txb_rd_record_64X64);
+      av1_zero(x->txb_rd_record_intra);
+    }
+
+    av1_zero(x->picked_ref_frames_mask);
 
     av1_zero(x->pred_mv);
     PC_TREE *const pc_root = td->pc_root[mib_size_log2 - MIN_MIB_SIZE_LOG2];
     pc_root->index = 0;
 
+    if ((sf->simple_motion_search_prune_rect ||
+         sf->simple_motion_search_early_term_none ||
+         sf->firstpass_simple_motion_search_early_term) &&
+        !frame_is_intra_only(cm)) {
+      init_simple_motion_search_mvs(pc_root);
+    }
+
     const struct segmentation *const seg = &cm->seg;
     int seg_skip = 0;
     if (seg->enabled) {
@@ -5099,6 +4779,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
     const int idx_str = cm->mi_stride * mi_row + mi_col;
     MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
     x->source_variance = UINT_MAX;
+    x->simple_motion_pred_sse = UINT_MAX;
     if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
       set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
       const BLOCK_SIZE bsize = seg_skip ? sb_size : sf->always_this_block_size;
@@ -5112,6 +4793,13 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
                        &dummy_rate, &dummy_dist, 1, pc_root);
+    } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
+               use_nonrd_mode) {
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+      av1_choose_var_based_partitioning(cpi, tile_info, x, mi_row, mi_col);
+      nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                          &dummy_rate, &dummy_dist, 1, pc_root);
+
     } else {
       const int orig_rdmult = cpi->rd.RDMULT;
       x->cb_rdmult = orig_rdmult;
@@ -5124,58 +4812,87 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
         x->rdmult = x->cb_rdmult;
       }
 
-      // If required set upper and lower partition size limits
-      if (sf->auto_min_max_partition_size) {
-        set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
-        rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
-                                &x->min_partition_size, &x->max_partition_size);
-      }
-
       reset_partition(pc_root, sb_size);
       x->use_cb_search_range = 0;
-      init_first_partition_pass_stats_tables(x->first_partition_pass_stats);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, first_partition_search_pass_time);
+#endif
+      init_first_partition_pass_stats_tables(cpi,
+                                             x->first_partition_pass_stats);
       // Do the first pass if we need two pass partition search
-      if (cpi->sf.two_pass_partition_search &&
+      if (cpi->two_pass_partition_search &&
           cpi->sf.use_square_partition_only_threshold > BLOCK_4X4 &&
-          mi_row + mi_size_high[sb_size] < cm->mi_rows &&
-          mi_col + mi_size_wide[sb_size] < cm->mi_cols &&
+          mi_row + mi_size_high[sb_size] <= cm->mi_rows &&
+          mi_col + mi_size_wide[sb_size] <= cm->mi_cols &&
           cm->current_frame.frame_type != KEY_FRAME) {
         first_partition_search_pass(cpi, td, tile_data, mi_row, mi_col, tp);
       }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, first_partition_search_pass_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, rd_pick_partition_time);
+#endif
+      BLOCK_SIZE max_sq_size = BLOCK_128X128;
+      switch (cpi->oxcf.max_partition_size) {
+        case 4: max_sq_size = BLOCK_4X4; break;
+        case 8: max_sq_size = BLOCK_8X8; break;
+        case 16: max_sq_size = BLOCK_16X16; break;
+        case 32: max_sq_size = BLOCK_32X32; break;
+        case 64: max_sq_size = BLOCK_64X64; break;
+        case 128: max_sq_size = BLOCK_128X128; break;
+        default: assert(0); break;
+      }
+      max_sq_size = AOMMIN(max_sq_size, sb_size);
+
+      BLOCK_SIZE min_sq_size = BLOCK_4X4;
+      switch (cpi->oxcf.min_partition_size) {
+        case 4: min_sq_size = BLOCK_4X4; break;
+        case 8: min_sq_size = BLOCK_8X8; break;
+        case 16: min_sq_size = BLOCK_16X16; break;
+        case 32: min_sq_size = BLOCK_32X32; break;
+        case 64: min_sq_size = BLOCK_64X64; break;
+        case 128: min_sq_size = BLOCK_128X128; break;
+        default: assert(0); break;
+      }
+
+      if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
+        float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
+
+        av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
+        max_sq_size =
+            AOMMIN(av1_predict_max_partition(cpi, x, features), max_sq_size);
+      }
+
+      min_sq_size = AOMMIN(min_sq_size, max_sq_size);
 
       rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                        &dummy_rdc, INT64_MAX, pc_root, NULL);
+                        max_sq_size, min_sq_size, &dummy_rdc, INT64_MAX,
+                        pc_root, NULL);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, rd_pick_partition_time);
+#endif
     }
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
     // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
-    if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
+    if (cpi->sf.inter_mode_rd_model_estimation == 1 && cm->tile_cols == 1 &&
         cm->tile_rows == 1) {
       av1_inter_mode_data_fit(tile_data, x->rdmult);
     }
-#endif
-    // Context update for row based multi-threading of encoder is done based on
-    // the following conditions:
-    // 1. If mib_size_log2==5, context of top-right superblock is used
-    // for context modelling. If top-right is not available (in case of tile
-    // with width == mib_size_log2==5), top superblock's context is used.
-    // 2. If mib_size_log2==4, context of next superblock to top-right
-    // superblock is used. Using context of top-right superblock in this case
-    // gives high BD Rate drop for smaller resolutions.
-    if (cpi->row_mt == 1) {
-      int update_context = 0;
-      if (mib_size_log2 == 5) {
-        update_context = sb_cols_in_tile == 1 || sb_col_in_tile == 1;
-      } else if (mib_size_log2 == 4) {
-        update_context = sb_cols_in_tile == 1 ||
-                         (sb_cols_in_tile == 2 && sb_col_in_tile == 1) ||
-                         sb_col_in_tile == 2;
-      }
-      if (update_context)
-        memcpy(x->backup_tile_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx));
+    if (tile_data->allow_update_cdf && (cpi->row_mt == 1) &&
+        (tile_info->mi_row_end > (mi_row + mib_size))) {
+      if (sb_cols_in_tile == 1)
+        memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx));
+      else if (sb_col_in_tile >= 1)
+        memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx,
+               sizeof(*xd->tile_ctx));
     }
     (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
                                     sb_col_in_tile, sb_cols_in_tile);
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_sb_time);
+#endif
 }
 
 static void init_encode_frame_mb_context(AV1_COMP *cpi) {
@@ -5193,18 +4910,18 @@ static void init_encode_frame_mb_context(AV1_COMP *cpi) {
 }
 
 static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
-  if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME;
-  // We will not update the golden frame with an internal overlay frame
-  else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) ||
-           cpi->rc.is_src_frame_ext_arf)
+  if (frame_is_intra_only(&cpi->common)) {
+    return INTRA_FRAME;
+  } else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) ||
+             cpi->rc.is_src_frame_internal_arf) {
+    // We will not update the golden frame with an internal overlay frame
     return ALTREF_FRAME;
-  else if (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
-           cpi->refresh_alt_ref_frame)
+  } else if (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+             cpi->refresh_alt_ref_frame) {
     return GOLDEN_FRAME;
-  else
-    // TODO(zoeliu): To investigate whether a frame_type other than
-    // INTRA/ALTREF/GOLDEN/LAST needs to be specified seperately.
+  } else {
     return LAST_FRAME;
+  }
 }
 
 static TX_MODE select_tx_mode(const AV1_COMP *cpi) {
@@ -5238,7 +4955,6 @@ void av1_alloc_tile_data(AV1_COMP *cpi) {
       for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
         for (j = 0; j < MAX_MODES; ++j) {
           tile_data->thresh_freq_fact[i][j] = 32;
-          tile_data->mode_map[i][j] = j;
         }
       }
     }
@@ -5296,7 +5012,7 @@ void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
                 cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
   cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok;
 
-  encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+  encode_sb_row(cpi, td, this_tile, mi_row, &tok, cpi->sf.use_nonrd_pick_mode);
 
   cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok;
   cpi->tplist[tile_row][tile_col][sb_row_in_tile].count =
@@ -5321,9 +5037,7 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
   const TileInfo *const tile_info = &this_tile->tile_info;
   int mi_row;
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
   av1_inter_mode_data_init(this_tile);
-#endif
 
   av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
                          tile_info->mi_col_end, tile_row);
@@ -5363,28 +5077,12 @@ static void encode_tiles(AV1_COMP *cpi) {
       cpi->td.intrabc_used = 0;
       cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
       cpi->td.mb.tile_pb_ctx = &this_tile->tctx;
-      cpi->td.mb.backup_tile_ctx = &this_tile->backup_tctx;
       av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
       cpi->intrabc_used |= cpi->td.intrabc_used;
     }
   }
 }
 
-#if CONFIG_FP_MB_STATS
-static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
-                            AV1_COMMON *cm, uint8_t **this_frame_mb_stats) {
-  uint8_t *mb_stats_in =
-      firstpass_mb_stats->mb_stats_start +
-      cm->current_frame.frame_number * cm->MBs * sizeof(uint8_t);
-
-  if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF;
-
-  *this_frame_mb_stats = mb_stats_in;
-
-  return 1;
-}
-#endif
-
 #define GLOBAL_TRANS_TYPES_ENC 3  // highest motion model to search
 static int gm_get_params_cost(const WarpedMotionParams *gm,
                               const WarpedMotionParams *ref_gm, int allow_hp) {
@@ -5441,123 +5139,73 @@ static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm,
   (void)frame;
   switch (sf->gm_search_type) {
     case GM_FULL_SEARCH: return 1;
-    case GM_REDUCED_REF_SEARCH:
+    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3:
       return !(frame == LAST2_FRAME || frame == LAST3_FRAME);
+    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2:
+      return !(frame == LAST2_FRAME || frame == LAST3_FRAME ||
+               (frame == ALTREF2_FRAME));
     case GM_DISABLE_SEARCH: return 0;
     default: assert(0);
   }
   return 1;
 }
 
-static const uint8_t ref_frame_flag_list[REF_FRAMES] = { 0,
-                                                         AOM_LAST_FLAG,
-                                                         AOM_LAST2_FLAG,
-                                                         AOM_LAST3_FLAG,
-                                                         AOM_GOLD_FLAG,
-                                                         AOM_BWD_FLAG,
-                                                         AOM_ALT2_FLAG,
-                                                         AOM_ALT_FLAG };
-
-// Enforce the number of references for each arbitrary frame limited to
-// (INTER_REFS_PER_FRAME - 1)
+static int get_max_allowed_ref_frames(const AV1_COMP *cpi) {
+  const unsigned int max_allowed_refs_for_given_speed =
+      (cpi->sf.selective_ref_frame >= 3) ? INTER_REFS_PER_FRAME - 1
+                                         : INTER_REFS_PER_FRAME;
+  return AOMMIN(max_allowed_refs_for_given_speed,
+                cpi->oxcf.max_reference_frames);
+}
+
+// Enforce the number of references for each arbitrary frame based on user
+// options and speed.
 static void enforce_max_ref_frames(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
   MV_REFERENCE_FRAME ref_frame;
   int total_valid_refs = 0;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
       total_valid_refs++;
+    }
   }
 
-  // NOTE(zoeliu): When all the possible reference frames are availble, we
-  // reduce the number of reference frames by 1, following the rules of:
-  // (1) Retain GOLDEN_FARME/ALTEF_FRAME;
-  // (2) Check the earliest 2 remaining reference frames, and remove the one
-  //     with the lower quality factor, otherwise if both have been coded at
-  //     the same quality level, remove the earliest reference frame.
-
-  if (total_valid_refs == INTER_REFS_PER_FRAME) {
-    unsigned int min_ref_order_hint = UINT_MAX;
-    unsigned int second_min_ref_order_hint = UINT_MAX;
-    MV_REFERENCE_FRAME earliest_ref_frames[2] = { LAST3_FRAME, LAST2_FRAME };
-    const RefCntBuffer *earliest_bufs[2] = { NULL };
-
-    // Locate the earliest two reference frames except GOLDEN/ALTREF.
-    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-      // Retain GOLDEN/ALTERF
-      if (ref_frame == GOLDEN_FRAME || ref_frame == ALTREF_FRAME) continue;
-
-      const RefCntBuffer *const buf =
-          cm->current_frame.frame_refs[ref_frame - LAST_FRAME].buf;
-      if (buf != NULL) {
-        const unsigned int ref_order_hint = buf->order_hint;
-
-        if (min_ref_order_hint == UINT_MAX) {
-          min_ref_order_hint = ref_order_hint;
-          earliest_ref_frames[0] = ref_frame;
-          earliest_bufs[0] = buf;
-        } else {
-          if (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint,
-                                min_ref_order_hint) < 0) {
-            second_min_ref_order_hint = min_ref_order_hint;
-            earliest_ref_frames[1] = earliest_ref_frames[0];
-            earliest_bufs[1] = earliest_bufs[0];
-
-            min_ref_order_hint = ref_order_hint;
-            earliest_ref_frames[0] = ref_frame;
-            earliest_bufs[0] = buf;
-          } else if (second_min_ref_order_hint == UINT_MAX ||
-                     get_relative_dist(&cm->seq_params.order_hint_info,
-                                       ref_order_hint,
-                                       second_min_ref_order_hint) < 0) {
-            second_min_ref_order_hint = ref_order_hint;
-            earliest_ref_frames[1] = ref_frame;
-            earliest_bufs[1] = buf;
-          }
-        }
-      }
+  const int max_allowed_refs = get_max_allowed_ref_frames(cpi);
+
+  // When more than 'max_allowed_refs' are available, we reduce the number of
+  // reference frames one at a time based on this order.
+  const MV_REFERENCE_FRAME disable_order[] = {
+    LAST3_FRAME,
+    LAST2_FRAME,
+    ALTREF2_FRAME,
+    GOLDEN_FRAME,
+  };
+
+  for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
+    const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
+
+    if (!(cpi->ref_frame_flags &
+          av1_ref_frame_flag_list[ref_frame_to_disable])) {
+      continue;
     }
-    // Check the coding quality factors of the two earliest reference frames.
-    RATE_FACTOR_LEVEL ref_rf_level[2];
-    double ref_rf_deltas[2];
-    for (int i = 0; i < 2; ++i) {
-      ref_rf_level[i] = earliest_bufs[i]->frame_rf_level;
-      ref_rf_deltas[i] = rate_factor_deltas[ref_rf_level[i]];
-    }
-    (void)ref_rf_level;
-    (void)ref_rf_deltas;
-
-#define USE_RF_LEVEL_TO_ENFORCE 1
-#if USE_RF_LEVEL_TO_ENFORCE
-    // If both earliest two reference frames are coded using the same rate-
-    // factor, disable the earliest reference frame; Otherwise disable the
-    // reference frame that uses a lower rate-factor delta.
-    const MV_REFERENCE_FRAME ref_frame_to_disable =
-        (ref_rf_deltas[0] <= ref_rf_deltas[1]) ? earliest_ref_frames[0]
-                                               : earliest_ref_frames[1];
-#else
-    // Always disable the earliest reference frame
-    const MV_REFERENCE_FRAME ref_frame_to_disable = earliest_ref_frames[0];
-#endif  // USE_RF_LEVEL_TO_ENFORCE
-#undef USE_RF_LEVEL_TO_ENFORCE
 
     switch (ref_frame_to_disable) {
-      case LAST_FRAME: cpi->ref_frame_flags &= ~AOM_LAST_FLAG; break;
-      case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break;
       case LAST3_FRAME: cpi->ref_frame_flags &= ~AOM_LAST3_FLAG; break;
-      case BWDREF_FRAME: cpi->ref_frame_flags &= ~AOM_BWD_FLAG; break;
+      case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break;
       case ALTREF2_FRAME: cpi->ref_frame_flags &= ~AOM_ALT2_FLAG; break;
-      default: break;
+      case GOLDEN_FRAME: cpi->ref_frame_flags &= ~AOM_GOLD_FLAG; break;
+      default: assert(0);
     }
+    --total_valid_refs;
   }
+  assert(total_valid_refs <= max_allowed_refs);
 }
 
 static INLINE int av1_refs_are_one_sided(const AV1_COMMON *cm) {
   assert(!frame_is_intra_only(cm));
 
   int one_sided_refs = 1;
-  for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
-    const RefCntBuffer *const buf = cm->current_frame.frame_refs[ref].buf;
+  for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
     if (buf == NULL) continue;
 
     const int ref_order_hint = buf->order_hint;
@@ -5577,9 +5225,9 @@ static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm,
   if (!skip_mode_info->skip_mode_allowed) return;
 
   const RefCntBuffer *const buf_0 =
-      cm->current_frame.frame_refs[skip_mode_info->ref_frame_idx_0].buf;
+      get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_0);
   const RefCntBuffer *const buf_1 =
-      cm->current_frame.frame_refs[skip_mode_info->ref_frame_idx_1].buf;
+      get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_1);
   assert(buf_0 != NULL && buf_1 != NULL);
 
   ref_order_hint[0] = buf_0->order_hint;
@@ -5666,9 +5314,10 @@ static void encode_frame_internal(AV1_COMP *cpi) {
 
   av1_zero(*td->counts);
   av1_zero(rdc->comp_pred_diff);
+  // Two pass partition search can be enabled/disabled for different frames.
+  // Reset this data at frame level to avoid any incorrect usage.
+  init_first_partition_pass_stats_tables(cpi, x->first_partition_pass_stats);
 
-  // Allow intrabc when screen content tools are enabled.
-  cm->allow_intrabc = cm->allow_screen_content_tools;
   // Reset the flag.
   cpi->intrabc_used = 0;
   // Need to disable intrabc when superres is selected
@@ -5676,6 +5325,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     cm->allow_intrabc = 0;
   }
 
+  cm->allow_intrabc &= (cpi->oxcf.enable_intrabc);
+
   if (cpi->oxcf.pass != 1 && av1_use_hash_me(cm)) {
     // add to hash table
     const int pic_width = cpi->source->y_crop_width;
@@ -5760,7 +5411,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     if (xd->lossless[i]) {
       cpi->optimize_seg_arr[i] = 0;
     } else {
-      cpi->optimize_seg_arr[i] = cpi->optimize_speed_feature;
+      cpi->optimize_seg_arr[i] = cpi->sf.optimize_coefficients;
     }
   }
   cm->coded_lossless = is_coded_lossless(cm, xd);
@@ -5775,7 +5426,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {
   cm->delta_q_info.delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
   cm->delta_q_info.delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF;
   cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
-  // update delta_q_present_flag and delta_lf_present_flag based on base_qindex
+  // update delta_q_present_flag and delta_lf_present_flag based on
+  // base_qindex
   cm->delta_q_info.delta_q_present_flag &= cm->base_qindex > 0;
   cm->delta_q_info.delta_lf_present_flag &= cm->base_qindex > 0;
 
@@ -5801,8 +5453,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     aom_clear_system_state();
 
     if (tpl_frame->is_valid)
-      cpi->rd.r0 =
-          (double)intra_cost_base / (intra_cost_base + mc_dep_cost_base);
+      cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
   }
 
   av1_frame_init_quantizer(cpi);
@@ -5815,7 +5466,6 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     cm->last_frame_seg_map = cm->prev_frame->seg_map;
   else
     cm->last_frame_seg_map = NULL;
-  cm->current_frame_seg_map = cm->cur_frame->seg_map;
   if (cm->allow_intrabc || cm->coded_lossless) {
     av1_set_default_ref_deltas(cm->lf.ref_deltas);
     av1_set_default_mode_deltas(cm->lf.mode_deltas);
@@ -5831,14 +5481,17 @@ static void encode_frame_internal(AV1_COMP *cpi) {
   cm->prev_mi = cm->allow_ref_frame_mvs ? cm->prev_mip : NULL;
 
   x->txb_split_count = 0;
+#if CONFIG_SPEED_STATS
+  x->tx_search_count = 0;
+#endif  // CONFIG_SPEED_STATS
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_compute_global_motion_time);
+#endif
   av1_zero(rdc->global_motion_used);
   av1_zero(cpi->gmparams_cost);
-#if !CONFIG_GLOBAL_MOTION_SEARCH
-  cpi->global_motion_search_done = 1;
-#endif  // !CONFIG_GLOBAL_MOTION_SEARCH
   if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
-      !cpi->global_motion_search_done) {
+      cpi->oxcf.enable_global_motion && !cpi->global_motion_search_done) {
     YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
     int frame;
     double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)];
@@ -5853,7 +5506,9 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     int num_refs_using_gm = 0;
 
     for (frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
-      ref_buf[frame] = get_ref_frame_buffer(cpi, frame);
+      ref_buf[frame] = NULL;
+      RefCntBuffer *buf = get_ref_frame_buf(cm, frame);
+      if (buf != NULL) ref_buf[frame] = &buf->buf;
       int pframe;
       cm->global_motion[frame] = default_warp_params;
       const WarpedMotionParams *ref_params =
@@ -5872,15 +5527,26 @@ static void encode_frame_internal(AV1_COMP *cpi) {
                  do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame) &&
                  !(cpi->sf.selective_ref_gm && skip_gm_frame(cm, frame))) {
         TransformationType model;
-        const int64_t ref_frame_error =
-            av1_frame_error(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
-                            ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride,
-                            cpi->source->y_buffer, cpi->source->y_width,
-                            cpi->source->y_height, cpi->source->y_stride);
+        const int64_t ref_frame_error = av1_frame_error(
+            is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
+            ref_buf[frame]->y_stride, cpi->source->y_buffer,
+            cpi->source->y_width, cpi->source->y_height, cpi->source->y_stride);
 
         if (ref_frame_error == 0) continue;
 
         aom_clear_system_state();
+
+        // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1
+        const int do_adaptive_gm_estimation = 0;
+
+        const int ref_frame_dist = get_relative_dist(
+            &cm->seq_params.order_hint_info, cm->current_frame.order_hint,
+            cm->cur_frame->ref_order_hints[frame - LAST_FRAME]);
+        const GlobalMotionEstimationType gm_estimation_type =
+            cm->seq_params.order_hint_info.enable_order_hint &&
+                    abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation
+                ? GLOBAL_MOTION_DISFLOW_BASED
+                : GLOBAL_MOTION_FEATURE_BASED;
         for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
           int64_t best_warp_error = INT64_MAX;
           // Initially set all params to identity.
@@ -5891,8 +5557,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {
 
           av1_compute_global_motion(model, cpi->source, ref_buf[frame],
                                     cpi->common.seq_params.bit_depth,
-                                    inliers_by_motion, params_by_motion,
-                                    RANSAC_NUM_MOTIONS);
+                                    gm_estimation_type, inliers_by_motion,
+                                    params_by_motion, RANSAC_NUM_MOTIONS);
 
           for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
             if (inliers_by_motion[i] == 0) continue;
@@ -5902,17 +5568,17 @@ static void encode_frame_internal(AV1_COMP *cpi) {
 
             if (tmp_wm_params.wmtype != IDENTITY) {
               const int64_t warp_error = av1_refine_integerized_param(
-                  &tmp_wm_params, tmp_wm_params.wmtype,
-                  xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
-                  ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
+                  &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd),
+                  xd->bd, ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
                   ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
                   cpi->source->y_buffer, cpi->source->y_width,
                   cpi->source->y_height, cpi->source->y_stride, 5,
                   best_warp_error);
               if (warp_error < best_warp_error) {
                 best_warp_error = warp_error;
-                // Save the wm_params modified by av1_refine_integerized_param()
-                // rather than motion index to avoid rerunning refine() below.
+                // Save the wm_params modified by
+                // av1_refine_integerized_param() rather than motion index to
+                // avoid rerunning refine() below.
                 memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
                        sizeof(WarpedMotionParams));
               }
@@ -5956,7 +5622,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     // clear disabled ref_frames
     for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
       const int ref_disabled =
-          !(cpi->ref_frame_flags & ref_frame_flag_list[frame]);
+          !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]);
       if (ref_disabled && cpi->sf.recode_loop != DISALLOW_RECODE) {
         cpi->gmparams_cost[frame] = 0;
         cm->global_motion[frame] = default_warp_params;
@@ -5966,8 +5632,17 @@ static void encode_frame_internal(AV1_COMP *cpi) {
   }
   memcpy(cm->cur_frame->global_motion, cm->global_motion,
          REF_FRAMES * sizeof(WarpedMotionParams));
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_compute_global_motion_time);
+#endif
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_setup_motion_field_time);
+#endif
   av1_setup_motion_field(cm);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_setup_motion_field_time);
+#endif
 
   cpi->all_one_sided_refs =
       frame_is_intra_only(cm) ? 0 : av1_refs_are_one_sided(cm);
@@ -5976,16 +5651,6 @@ static void encode_frame_internal(AV1_COMP *cpi) {
       check_skip_mode_enabled(cpi);
 
   {
-    struct aom_usec_timer emr_timer;
-    aom_usec_timer_start(&emr_timer);
-
-#if CONFIG_FP_MB_STATS
-    if (cpi->use_fp_mb_stats) {
-      input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
-                       &cpi->twopass.this_frame_mb_stats);
-    }
-#endif
-
     cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read_dummy;
     cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write_dummy;
     cpi->row_mt = 0;
@@ -6000,9 +5665,6 @@ static void encode_frame_internal(AV1_COMP *cpi) {
       else
         encode_tiles(cpi);
     }
-
-    aom_usec_timer_mark(&emr_timer);
-    cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer);
   }
 
   // If intrabc is allowed but never selected, reset the allow_intrabc flag.
@@ -6016,21 +5678,7 @@ void av1_encode_frame(AV1_COMP *cpi) {
   const int num_planes = av1_num_planes(cm);
   // Indicates whether or not to use a default reduced set for ext-tx
   // rather than the potential full set of 16 transforms
-  cm->reduced_tx_set_used = 0;
-
-  if (cm->show_frame == 0) {
-    int arf_offset = AOMMIN(
-        (MAX_GF_INTERVAL - 1),
-        cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]);
-    int brf_offset =
-        cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index];
-    arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
-    current_frame->order_hint = current_frame->frame_number + arf_offset;
-  } else {
-    current_frame->order_hint = current_frame->frame_number;
-  }
-  current_frame->order_hint %=
-      (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
+  cm->reduced_tx_set_used = cpi->oxcf.reduced_tx_type_set;
 
   // Make sure segment_id is no larger than last_active_segid.
   if (cm->seg.enabled && cm->seg.update_map) {
@@ -6047,7 +5695,7 @@ void av1_encode_frame(AV1_COMP *cpi) {
   }
 
   av1_setup_frame_buf_refs(cm);
-  if (cpi->sf.selective_ref_frame >= 3) enforce_max_ref_frames(cpi);
+  enforce_max_ref_frames(cpi);
   av1_setup_frame_sign_bias(cm);
 
 #if CONFIG_MISMATCH_DEBUG
@@ -6056,8 +5704,6 @@ void av1_encode_frame(AV1_COMP *cpi) {
   (void)num_planes;
 #endif
 
-  cpi->allow_comp_inter_inter = !frame_is_intra_only(cm);
-
   if (cpi->sf.frame_parameter_update) {
     int i;
     RD_OPT *const rd_opt = &cpi->rd;
@@ -6079,7 +5725,7 @@ void av1_encode_frame(AV1_COMP *cpi) {
 
     /* prediction (compound, single or hybrid) mode selection */
     // NOTE: "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames
-    if (is_alt_ref || !cpi->allow_comp_inter_inter)
+    if (is_alt_ref || frame_is_intra_only(cm))
       current_frame->reference_mode = SINGLE_REFERENCE;
     else
       current_frame->reference_mode = REFERENCE_MODE_SELECT;
@@ -6106,7 +5752,8 @@ void av1_encode_frame(AV1_COMP *cpi) {
 #endif  // CONFIG_ENTROPY_STATS
       }
     }
-    // Re-check on the skip mode status as reference mode may have been changed.
+    // Re-check on the skip mode status as reference mode may have been
+    // changed.
     SkipModeInfo *const skip_mode_info = &current_frame->skip_mode_info;
     if (frame_is_intra_only(cm) ||
         current_frame->reference_mode == SINGLE_REFERENCE) {
@@ -6287,8 +5934,7 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
   const int mi_height = mi_size_high[bsize];
   const int is_inter = is_inter_block(mbmi);
 
-  if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
-      x->cb_partition_scan) {
+  if (cpi->two_pass_partition_search && x->cb_partition_scan) {
     for (int row = mi_row; row < mi_row + mi_width;
          row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
       for (int col = mi_col; col < mi_col + mi_height;
@@ -6302,8 +5948,15 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
         if (stats->ref0_counts[mbmi->ref_frame[0]] < 255)
           ++stats->ref0_counts[mbmi->ref_frame[0]];
         if (mbmi->ref_frame[1] >= 0 &&
-            stats->ref1_counts[mbmi->ref_frame[0]] < 255)
+            stats->ref1_counts[mbmi->ref_frame[1]] < 255)
           ++stats->ref1_counts[mbmi->ref_frame[1]];
+        if (cpi->sf.use_first_partition_pass_interintra_stats) {
+          // Increase the counter for interintra_motion_mode_count
+          if (mbmi->motion_mode == 0 && mbmi->ref_frame[1] == INTRA_FRAME &&
+              stats->interintra_motion_mode_count[mbmi->ref_frame[0]] < 255) {
+            ++stats->interintra_motion_mode_count[mbmi->ref_frame[0]];
+          }
+        }
       }
     }
   }
@@ -6351,15 +6004,19 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
 
     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
     for (ref = 0; ref < 1 + is_compound; ++ref) {
-      YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+      const YV12_BUFFER_CONFIG *cfg =
+          get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]);
       assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
       av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
-                           &xd->block_refs[ref]->sf, num_planes);
+                           xd->block_ref_scale_factors[ref], num_planes);
     }
 
-    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
-    if (mbmi->motion_mode == OBMC_CAUSAL)
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                  av1_num_planes(cm) - 1);
+    if (mbmi->motion_mode == OBMC_CAUSAL) {
+      assert(cpi->oxcf.enable_obmc == 1);
       av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+    }
 
 #if CONFIG_MISMATCH_DEBUG
     if (dry_run == OUTPUT_ENABLED) {
diff --git a/libaom/av1/encoder/encodemb.c b/libaom/av1/encoder/encodemb.c
index e0c0370..8e9da61 100644
--- a/libaom/av1/encoder/encodemb.c
+++ b/libaom/av1/encoder/encodemb.c
@@ -43,7 +43,7 @@ static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
                            const uint8_t *src8, ptrdiff_t src_stride,
                            const uint8_t *pred8, ptrdiff_t pred_stride) {
   if (check_subtract_block_size(rows, cols)) {
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
       aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8,
                                   src_stride, pred8, pred_stride, xd->bd);
       return;
@@ -54,7 +54,7 @@ static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
     return;
   }
 
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
                               pred8, pred_stride, xd->bd);
     return;
@@ -111,16 +111,15 @@ int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
     return eob;
   }
 
-  (void)fast_mode;
   return av1_optimize_txb_new(cpi, mb, plane, block, tx_size, tx_type, txb_ctx,
-                              rate_cost, cpi->oxcf.sharpness);
+                              rate_cost, cpi->oxcf.sharpness, fast_mode);
 }
 
-typedef enum QUANT_FUNC {
+enum {
   QUANT_FUNC_LOWBD = 0,
   QUANT_FUNC_HIGHBD = 1,
   QUANT_FUNC_TYPES = 2
-} QUANT_FUNC;
+} UENUM1BYTE(QUANT_FUNC);
 
 static AV1_QUANT_FACADE
     quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
@@ -163,6 +162,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
   qparam.tx_size = tx_size;
   qparam.qmatrix = qmatrix;
   qparam.iqmatrix = iqmatrix;
+  qparam.use_quant_b_adapt = cm->use_quant_b_adapt;
   TxfmParam txfm_param;
   txfm_param.tx_type = tx_type;
   txfm_param.tx_size = tx_size;
@@ -171,7 +171,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
       txfm_param.tx_size, is_inter_block(mbmi), cm->reduced_tx_set_used);
 
   txfm_param.bd = xd->bd;
-  txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
+  txfm_param.is_hbd = is_cur_buf_hbd(xd);
 
   av1_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);
 
@@ -184,7 +184,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
       av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
     }
   }
-  // NOTE: optimize_b_following is ture means av1_optimze_b will be called
+  // NOTE: optimize_b_following is true means av1_optimze_b will be called
   // When the condition of doing optimize_b is changed,
   // this flag need update simultaneously
   const int optimize_b_following =
@@ -226,13 +226,17 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
   if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) {
     TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
                                       tx_size, cm->reduced_tx_set_used);
-    if (args->enable_optimize_b) {
-      av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
-                      tx_size, tx_type, AV1_XFORM_QUANT_FP);
+    if (args->enable_optimize_b != NO_TRELLIS_OPT) {
+      av1_xform_quant(
+          cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+          USE_B_QUANT_NO_TRELLIS &&
+                  (args->enable_optimize_b == FINAL_PASS_TRELLIS_OPT)
+              ? AV1_XFORM_QUANT_B
+              : AV1_XFORM_QUANT_FP);
       TXB_CTX txb_ctx;
       get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
-      av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
-                     &dummy_rate_cost);
+      av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+                     args->cpi->sf.trellis_eob_fast, &dummy_rate_cost);
     } else {
       av1_xform_quant(
           cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
@@ -255,12 +259,12 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
                                 cm->reduced_tx_set_used);
   }
 
+  // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0
+  // case. It is possible that certain collision in hash index would cause
+  // the assertion failure. To further optimize the rate-distortion
+  // performance, we need to re-visit this part and enable this assert
+  // again.
   if (p->eobs[block] == 0 && plane == 0) {
-    // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0
-    // case. It is possible that certain collision in hash index would cause
-    // the assertion failure. To further optimize the rate-distortion
-    // performance, we need to re-visit this part and enable this assert
-    // again.
 #if 0
     if (args->cpi->oxcf.aq_mode == NO_AQ &&
         args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
@@ -431,7 +435,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
 
   if (p->eobs[block] > 0) {
     txfm_param.bd = xd->bd;
-    txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
+    txfm_param.is_hbd = is_cur_buf_hbd(xd);
     txfm_param.tx_type = DCT_DCT;
     txfm_param.tx_size = tx_size;
     txfm_param.eob = p->eobs[block];
@@ -578,13 +582,17 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
 
     const ENTROPY_CONTEXT *a = &args->ta[blk_col];
     const ENTROPY_CONTEXT *l = &args->tl[blk_row];
-    if (args->enable_optimize_b) {
-      av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
-                      tx_size, tx_type, AV1_XFORM_QUANT_FP);
+    if (args->enable_optimize_b != NO_TRELLIS_OPT) {
+      av1_xform_quant(
+          cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+          USE_B_QUANT_NO_TRELLIS &&
+                  (args->enable_optimize_b == FINAL_PASS_TRELLIS_OPT)
+              ? AV1_XFORM_QUANT_B
+              : AV1_XFORM_QUANT_FP);
       TXB_CTX txb_ctx;
       get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
-      av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
-                     &dummy_rate_cost);
+      av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+                     args->cpi->sf.trellis_eob_fast, &dummy_rate_cost);
     } else {
       av1_xform_quant(
           cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
@@ -597,12 +605,12 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
                                 dst_stride, *eob, cm->reduced_tx_set_used);
   }
 
+  // TODO(jingning): Temporarily disable txk_type check for eob=0 case.
+  // It is possible that certain collision in hash index would cause
+  // the assertion failure. To further optimize the rate-distortion
+  // performance, we need to re-visit this part and enable this assert
+  // again.
   if (*eob == 0 && plane == 0) {
-    // TODO(jingning): Temporarily disable txk_type check for eob=0 case.
-    // It is possible that certain collision in hash index would cause
-    // the assertion failure. To further optimize the rate-distortion
-    // performance, we need to re-visit this part and enable this assert
-    // again.
 #if 0
     if (args->cpi->oxcf.aq_mode == NO_AQ
         && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
diff --git a/libaom/av1/encoder/encodemb.h b/libaom/av1/encoder/encodemb.h
index 39080de..d4394cf 100644
--- a/libaom/av1/encoder/encodemb.h
+++ b/libaom/av1/encoder/encodemb.h
@@ -37,13 +37,13 @@ struct encode_b_args {
   int8_t enable_optimize_b;
 };
 
-typedef enum AV1_XFORM_QUANT {
+enum {
   AV1_XFORM_QUANT_FP = 0,
   AV1_XFORM_QUANT_B = 1,
   AV1_XFORM_QUANT_DC = 2,
   AV1_XFORM_QUANT_SKIP_QUANT,
   AV1_XFORM_QUANT_TYPES,
-} AV1_XFORM_QUANT;
+} UENUM1BYTE(AV1_XFORM_QUANT);
 
 void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                    int mi_row, int mi_col, RUN_TYPE dry_run);
diff --git a/libaom/av1/encoder/encoder.c b/libaom/av1/encoder/encoder.c
index 7652029..818e43c 100644
--- a/libaom/av1/encoder/encoder.c
+++ b/libaom/av1/encoder/encoder.c
@@ -33,9 +33,9 @@
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
 #include "aom_scale/aom_scale.h"
-#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#if CONFIG_BITSTREAM_DEBUG
 #include "aom_util/debug_util.h"
-#endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#endif  // CONFIG_BITSTREAM_DEBUG
 
 #include "av1/common/alloccommon.h"
 #include "av1/common/cdef.h"
@@ -54,6 +54,7 @@
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encodetxb.h"
 #include "av1/encoder/ethread.h"
@@ -61,6 +62,7 @@
 #include "av1/encoder/grain_test_vectors.h"
 #include "av1/encoder/hash_motion.h"
 #include "av1/encoder/mbgraph.h"
+#include "av1/encoder/pass2_strategy.h"
 #include "av1/encoder/picklpf.h"
 #include "av1/encoder/pickrst.h"
 #include "av1/encoder/random.h"
@@ -69,14 +71,11 @@
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/speed_features.h"
-#include "av1/encoder/temporal_filter.h"
 #include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
 
 #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
 
-// av1 uses 10,000,000 ticks/second as time stamp
-#define TICKS_PER_SEC 10000000LL
-
 #if CONFIG_ENTROPY_STATS
 FRAME_COUNTS aggregate_fc;
 #endif  // CONFIG_ENTROPY_STATS
@@ -100,30 +99,6 @@ FILE *yuv_rec_file;
 #define FILE_NAME_LEN 100
 #endif
 
-// Estimate if the source frame is screen content, based on the portion of
-// blocks that have no more than 4 (experimentally selected) luma colors.
-static int is_screen_content(const uint8_t *src, int use_hbd, int bd,
-                             int stride, int width, int height) {
-  assert(src != NULL);
-  int counts = 0;
-  const int blk_w = 16;
-  const int blk_h = 16;
-  const int limit = 4;
-  for (int r = 0; r + blk_h <= height; r += blk_h) {
-    for (int c = 0; c + blk_w <= width; c += blk_w) {
-      int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
-      const int n_colors =
-          use_hbd ? av1_count_colors_highbd(src + r * stride + c, stride, blk_w,
-                                            blk_h, bd, count_buf)
-                  : av1_count_colors(src + r * stride + c, stride, blk_w, blk_h,
-                                     count_buf);
-      if (n_colors > 1 && n_colors <= limit) counts++;
-    }
-  }
-  // The threshold is 10%.
-  return counts * blk_h * blk_w * 10 > width * height;
-}
-
 static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
   switch (mode) {
     case NORMAL:
@@ -269,7 +244,7 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
 // by calculuating the 16x4 Horizontal DCT. This is to be used to
 // decide the superresolution parameters.
 void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
-  uint64_t freq_energy[8] = { 0 };
+  uint64_t freq_energy[16] = { 0 };
   const YV12_BUFFER_CONFIG *buf = cpi->source;
   const int bd = cpi->td.mb.e_mbd.bd;
   const int width = buf->y_crop_width;
@@ -283,14 +258,13 @@ void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
       for (int j = 0; j < width - 16; j += 16) {
         av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
                             H_DCT, bd);
-        for (int k = 8; k < 16; ++k) {
+        for (int k = 1; k < 16; ++k) {
           const uint64_t this_energy =
               ((int64_t)coeff[k] * coeff[k]) +
               ((int64_t)coeff[k + 16] * coeff[k + 16]) +
               ((int64_t)coeff[k + 32] * coeff[k + 32]) +
               ((int64_t)coeff[k + 48] * coeff[k + 48]);
-          freq_energy[k - 8] +=
-              ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
+          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
         }
         n++;
       }
@@ -305,24 +279,24 @@ void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
             src16[ii * 16 + jj] =
                 buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)];
         av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd);
-        for (int k = 8; k < 16; ++k) {
+        for (int k = 1; k < 16; ++k) {
           const uint64_t this_energy =
               ((int64_t)coeff[k] * coeff[k]) +
               ((int64_t)coeff[k + 16] * coeff[k + 16]) +
               ((int64_t)coeff[k + 32] * coeff[k + 32]) +
               ((int64_t)coeff[k + 48] * coeff[k + 48]);
-          freq_energy[k - 8] += ROUND_POWER_OF_TWO(this_energy, 2);
+          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2);
         }
         n++;
       }
     }
   }
   if (n) {
-    for (int k = 0; k < 8; ++k) energy[k] = (double)freq_energy[k] / n;
+    for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n;
     // Convert to cumulative energy
-    for (int k = 6; k >= 0; --k) energy[k] += energy[k + 1];
+    for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
   } else {
-    for (int k = 0; k < 8; ++k) energy[k] = 1e+20;
+    for (int k = 1; k < 16; ++k) energy[k] = 1e+20;
   }
 }
 
@@ -358,6 +332,9 @@ static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
   // When superres / resize is on, 'cm->width / height' can change between
   // calls, so we don't apply this heuristic there. Also, this heuristic gives
   // compression gain for speed >= 2 only.
+  // Things break if superblock size changes per-frame which is why this
+  // heuristic is set based on configured speed rather than actual
+  // speed-features (which may change per-frame in future)
   if (cpi->oxcf.superres_mode == SUPERRES_NONE &&
       cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 2) {
     return (cm->width >= 480 && cm->height >= 360) ? BLOCK_128X128
@@ -375,64 +352,28 @@ static void setup_frame(AV1_COMP *cpi) {
   // other inter-frames the encoder currently uses only two contexts;
   // context 1 for ALTREF frames and context 0 for the others.
 
-  cm->primary_ref_frame = PRIMARY_REF_NONE;
   if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
-      cm->force_primary_ref_none) {
+      cpi->ext_use_primary_ref_none) {
     av1_setup_past_independence(cm);
-    for (int i = 0; i < REF_FRAMES; i++) {
-      cm->fb_of_context_type[i] = -1;
-    }
-    cm->fb_of_context_type[REGULAR_FRAME] =
-        cm->show_frame ? get_ref_frame_map_idx(cpi, GOLDEN_FRAME)
-                       : get_ref_frame_map_idx(cpi, ALTREF_FRAME);
-    cm->frame_context_idx = REGULAR_FRAME;
-  } else {
-    const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-    if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE)
-      cm->frame_context_idx = EXT_ARF_FRAME;
-    else if (cpi->refresh_alt_ref_frame)
-      cm->frame_context_idx = ARF_FRAME;
-    else if (cpi->rc.is_src_frame_alt_ref)
-      cm->frame_context_idx = OVERLAY_FRAME;
-    else if (cpi->refresh_golden_frame)
-      cm->frame_context_idx = GLD_FRAME;
-    else if (cpi->refresh_bwd_ref_frame)
-      cm->frame_context_idx = BRF_FRAME;
-    else
-      cm->frame_context_idx = REGULAR_FRAME;
-    int wanted_fb = cm->fb_of_context_type[cm->frame_context_idx];
-    for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-      int fb = get_ref_frame_map_idx(cpi, ref_frame);
-      if (fb == wanted_fb) {
-        cm->primary_ref_frame = ref_frame - LAST_FRAME;
-      }
-    }
   }
 
   if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) {
-    cpi->refresh_golden_frame = 1;
-    cpi->refresh_alt_ref_frame = 1;
-    av1_zero(cpi->interp_filter_selected);
     set_sb_size(&cm->seq_params, select_sb_size(cpi));
   } else if (frame_is_sframe(cm)) {
-    cpi->refresh_golden_frame = 1;
-    cpi->refresh_alt_ref_frame = 1;
-    av1_zero(cpi->interp_filter_selected);
     set_sb_size(&cm->seq_params, select_sb_size(cpi));
   } else {
-    if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
-        cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) {
+    const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm);
+    if (primary_ref_buf == NULL) {
       av1_setup_past_independence(cm);
       cm->seg.update_map = 1;
       cm->seg.update_data = 1;
     } else {
-      *cm->fc = cm->current_frame.frame_refs[cm->primary_ref_frame]
-                    .buf->frame_context;
+      *cm->fc = primary_ref_buf->frame_context;
     }
-    av1_zero(cpi->interp_filter_selected[0]);
   }
 
-  cm->prev_frame = get_prev_frame(cm);
+  av1_zero(cm->cur_frame->interp_filter_selected);
+  cm->prev_frame = get_primary_ref_frame_buf(cm);
   cpi->vaq_refresh = 0;
 }
 
@@ -526,6 +467,20 @@ static void alloc_context_buffers_ext(AV1_COMP *cpi) {
                   aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
 }
 
+static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
+  pars->num_cr_points = 0;
+  pars->cr_mult = 0;
+  pars->cr_luma_mult = 0;
+  memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr));
+  memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr));
+  pars->num_cb_points = 0;
+  pars->cb_mult = 0;
+  pars->cb_luma_mult = 0;
+  pars->chroma_scaling_from_luma = 0;
+  memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb));
+  memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
+}
+
 static void update_film_grain_parameters(struct AV1_COMP *cpi,
                                          const AV1EncoderConfig *oxcf) {
   AV1_COMMON *const cm = &cpi->common;
@@ -543,20 +498,27 @@ static void update_film_grain_parameters(struct AV1_COMP *cpi,
       memcpy(&cm->film_grain_params,
              film_grain_test_vectors + oxcf->film_grain_test_vector - 1,
              sizeof(cm->film_grain_params));
-
+      if (oxcf->monochrome)
+        reset_film_grain_chroma_params(&cm->film_grain_params);
       cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
       if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
         cm->film_grain_params.clip_to_restricted_range = 0;
       }
     }
   } else if (oxcf->film_grain_table_filename) {
+    cm->seq_params.film_grain_params_present = 1;
+
     cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
     memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t));
 
     aom_film_grain_table_read(cpi->film_grain_table,
                               oxcf->film_grain_table_filename, &cm->error);
   } else {
+#if CONFIG_DENOISE
+    cm->seq_params.film_grain_params_present = (cpi->oxcf.noise_level > 0);
+#else
     cm->seq_params.film_grain_params_present = 0;
+#endif
     memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
   }
 }
@@ -589,10 +551,8 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
   aom_free(cpi->td.mb.wsrc_buf);
   cpi->td.mb.wsrc_buf = NULL;
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
   aom_free(cpi->td.mb.inter_modes_info);
   cpi->td.mb.inter_modes_info = NULL;
-#endif
 
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++) {
@@ -809,7 +769,7 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
 static void update_reference_segmentation_map(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MB_MODE_INFO **mi_4x4_ptr = cm->mi_grid_visible;
-  uint8_t *cache_ptr = cm->current_frame_seg_map;
+  uint8_t *cache_ptr = cm->cur_frame->seg_map;
   int row, col;
 
   for (row = 0; row < cm->mi_rows; row++) {
@@ -827,11 +787,13 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
   const SequenceHeader *const seq_params = &cm->seq_params;
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
 
-  if (!cpi->lookahead)
-    cpi->lookahead =
-        av1_lookahead_init(oxcf->width, oxcf->height, seq_params->subsampling_x,
-                           seq_params->subsampling_y,
-                           seq_params->use_highbitdepth, oxcf->lag_in_frames);
+  if (!cpi->lookahead) {
+    int is_scale = (oxcf->resize_mode || oxcf->superres_mode);
+    cpi->lookahead = av1_lookahead_init(
+        oxcf->width, oxcf->height, seq_params->subsampling_x,
+        seq_params->subsampling_y, seq_params->use_highbitdepth,
+        oxcf->lag_in_frames, oxcf->border_in_pixels, is_scale);
+  }
   if (!cpi->lookahead)
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
@@ -840,7 +802,7 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
   if (aom_realloc_frame_buffer(
           &cpi->alt_ref_buffer, oxcf->width, oxcf->height,
           seq_params->subsampling_x, seq_params->subsampling_y,
-          seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          seq_params->use_highbitdepth, oxcf->border_in_pixels,
           cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
@@ -852,7 +814,7 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) {
   if (aom_realloc_frame_buffer(
           &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+          cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
@@ -860,21 +822,21 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) {
           &cpi->trial_frame_rst, cm->superres_upscaled_width,
           cm->superres_upscaled_height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+          AOM_RESTORATION_FRAME_BORDER, cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate trial restored frame buffer");
 
   if (aom_realloc_frame_buffer(
           &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+          cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
   if (aom_realloc_frame_buffer(
           &cpi->scaled_last_source, cm->width, cm->height,
           seq_params->subsampling_x, seq_params->subsampling_y,
-          seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
           cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate scaled last source buffer");
@@ -978,10 +940,9 @@ static void update_frame_size(AV1_COMP *cpi) {
 static void init_buffer_indices(AV1_COMP *cpi) {
   int fb_idx;
   for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
-    cpi->remapped_ref_idx[fb_idx] = fb_idx;
+    cpi->common.remapped_ref_idx[fb_idx] = fb_idx;
   cpi->rate_index = 0;
   cpi->rate_size = 0;
-  cpi->cur_poc = -1;
 }
 
 static INLINE int does_level_match(int width, int height, double fps,
@@ -1003,77 +964,58 @@ static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm,
   // and max display sample rates.
   // Need to add checks for max bit rate, max decoded luma sample rate, header
   // rate, etc. that are not covered by this function.
-  (void)oxcf;
-  BitstreamLevel bl = { 9, 3 };
+  AV1_LEVEL level = SEQ_LEVEL_MAX;
   if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 512,
                        288, 30.0, 4)) {
-    bl.major = 2;
-    bl.minor = 0;
+    level = SEQ_LEVEL_2_0;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               704, 396, 30.0, 4)) {
-    bl.major = 2;
-    bl.minor = 1;
+    level = SEQ_LEVEL_2_1;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               1088, 612, 30.0, 4)) {
-    bl.major = 3;
-    bl.minor = 0;
+    level = SEQ_LEVEL_3_0;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               1376, 774, 30.0, 4)) {
-    bl.major = 3;
-    bl.minor = 1;
+    level = SEQ_LEVEL_3_1;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               2048, 1152, 30.0, 3)) {
-    bl.major = 4;
-    bl.minor = 0;
+    level = SEQ_LEVEL_4_0;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               2048, 1152, 60.0, 3)) {
-    bl.major = 4;
-    bl.minor = 1;
+    level = SEQ_LEVEL_4_1;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               4096, 2176, 30.0, 2)) {
-    bl.major = 5;
-    bl.minor = 0;
+    level = SEQ_LEVEL_5_0;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               4096, 2176, 60.0, 2)) {
-    bl.major = 5;
-    bl.minor = 1;
+    level = SEQ_LEVEL_5_1;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               4096, 2176, 120.0, 2)) {
-    bl.major = 5;
-    bl.minor = 2;
+    level = SEQ_LEVEL_5_2;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               8192, 4352, 30.0, 2)) {
-    bl.major = 6;
-    bl.minor = 0;
+    level = SEQ_LEVEL_6_0;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               8192, 4352, 60.0, 2)) {
-    bl.major = 6;
-    bl.minor = 1;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               8192, 4352, 120.0, 2)) {
-    bl.major = 6;
-    bl.minor = 2;
+    level = SEQ_LEVEL_6_2;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               16384, 8704, 30.0, 2)) {
-    bl.major = 7;
-    bl.minor = 0;
+    level = SEQ_LEVEL_7_0;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               16384, 8704, 60.0, 2)) {
-    bl.major = 7;
-    bl.minor = 1;
+    level = SEQ_LEVEL_7_1;
   } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
                               16384, 8704, 120.0, 2)) {
-    bl.major = 7;
-    bl.minor = 2;
+    level = SEQ_LEVEL_7_2;
   }
   for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
-    seq->level[i] = bl;
-    seq->tier[i] = 0;  // setting main tier by default
+    seq->seq_level_idx[i] = level;
     // Set the maximum parameters for bitrate and buffer size for this profile,
     // level, and tier
     cm->op_params[i].bitrate = max_level_bitrate(
-        cm->seq_params.profile, major_minor_to_seq_level_idx(seq->level[i]),
-        seq->tier[i]);
+        cm->seq_params.profile, seq->seq_level_idx[i], seq->tier[i]);
     // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
     // check
     if (cm->op_params[i].bitrate == 0)
@@ -1106,9 +1048,24 @@ static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
           ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1
           : -1;
 
+  seq->max_frame_width =
+      oxcf->forced_max_frame_width ? oxcf->forced_max_frame_width : oxcf->width;
+  seq->max_frame_height = oxcf->forced_max_frame_height
+                              ? oxcf->forced_max_frame_height
+                              : oxcf->height;
+  seq->num_bits_width =
+      (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1;
+  seq->num_bits_height =
+      (seq->max_frame_height > 1) ? get_msb(seq->max_frame_height - 1) + 1 : 1;
+  assert(seq->num_bits_width <= 16);
+  assert(seq->num_bits_height <= 16);
+
+  seq->frame_id_length = FRAME_ID_LENGTH;
+  seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
+
   seq->enable_dual_filter = oxcf->enable_dual_filter;
-  seq->order_hint_info.enable_jnt_comp = oxcf->enable_jnt_comp;
-  seq->order_hint_info.enable_jnt_comp &=
+  seq->order_hint_info.enable_dist_wtd_comp = oxcf->enable_dist_wtd_comp;
+  seq->order_hint_info.enable_dist_wtd_comp &=
       seq->order_hint_info.enable_order_hint;
   seq->order_hint_info.enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs;
   seq->order_hint_info.enable_ref_frame_mvs &=
@@ -1117,10 +1074,10 @@ static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
   seq->enable_cdef = oxcf->enable_cdef;
   seq->enable_restoration = oxcf->enable_restoration;
   seq->enable_warped_motion = oxcf->enable_warped_motion;
-  seq->enable_interintra_compound = 1;
-  seq->enable_masked_compound = 1;
-  seq->enable_intra_edge_filter = 1;
-  seq->enable_filter_intra = 1;
+  seq->enable_interintra_compound = oxcf->enable_interintra_comp;
+  seq->enable_masked_compound = oxcf->enable_masked_comp;
+  seq->enable_intra_edge_filter = oxcf->enable_intra_edge_filter;
+  seq->enable_filter_intra = oxcf->enable_filter_intra;
 
   set_bitstream_level_tier(seq, cm, oxcf);
 
@@ -1317,14 +1274,14 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
   static unsigned int fnname##_bits8(                                       \
       const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
       int ref_stride, const uint8_t *second_pred,                           \
-      const JNT_COMP_PARAMS *jcp_param) {                                   \
+      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
     return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
                   jcp_param);                                               \
   }                                                                         \
   static unsigned int fnname##_bits10(                                      \
       const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
       int ref_stride, const uint8_t *second_pred,                           \
-      const JNT_COMP_PARAMS *jcp_param) {                                   \
+      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
     return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
                   jcp_param) >>                                             \
            2;                                                               \
@@ -1332,7 +1289,7 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
   static unsigned int fnname##_bits12(                                      \
       const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
       int ref_stride, const uint8_t *second_pred,                           \
-      const JNT_COMP_PARAMS *jcp_param) {                                   \
+      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
     return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
                   jcp_param) >>                                             \
            4;                                                               \
@@ -1406,28 +1363,28 @@ MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
 
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x128_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x128_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg)
 
 #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
   cpi->fn_ptr[BT].msdf = MCSDF;       \
@@ -1536,166 +1493,167 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                    aom_highbd_8_sub_pixel_variance64x16,
                    aom_highbd_8_sub_pixel_avg_variance64x16,
                    aom_highbd_sad64x16x4d_bits8,
-                   aom_highbd_jnt_sad64x16_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance64x16)
+                   aom_highbd_dist_wtd_sad64x16_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16)
 
         HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8,
                    aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64,
                    aom_highbd_8_sub_pixel_variance16x64,
                    aom_highbd_8_sub_pixel_avg_variance16x64,
                    aom_highbd_sad16x64x4d_bits8,
-                   aom_highbd_jnt_sad16x64_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance16x64)
+                   aom_highbd_dist_wtd_sad16x64_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64)
 
         HIGHBD_BFP(
             BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8,
             aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8,
             aom_highbd_8_sub_pixel_avg_variance32x8,
-            aom_highbd_sad32x8x4d_bits8, aom_highbd_jnt_sad32x8_avg_bits8,
-            aom_highbd_8_jnt_sub_pixel_avg_variance32x8)
+            aom_highbd_sad32x8x4d_bits8, aom_highbd_dist_wtd_sad32x8_avg_bits8,
+            aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8)
 
         HIGHBD_BFP(
             BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8,
             aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32,
             aom_highbd_8_sub_pixel_avg_variance8x32,
-            aom_highbd_sad8x32x4d_bits8, aom_highbd_jnt_sad8x32_avg_bits8,
-            aom_highbd_8_jnt_sub_pixel_avg_variance8x32)
+            aom_highbd_sad8x32x4d_bits8, aom_highbd_dist_wtd_sad8x32_avg_bits8,
+            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32)
 
         HIGHBD_BFP(
             BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8,
             aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4,
             aom_highbd_8_sub_pixel_avg_variance16x4,
-            aom_highbd_sad16x4x4d_bits8, aom_highbd_jnt_sad16x4_avg_bits8,
-            aom_highbd_8_jnt_sub_pixel_avg_variance16x4)
+            aom_highbd_sad16x4x4d_bits8, aom_highbd_dist_wtd_sad16x4_avg_bits8,
+            aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4)
 
         HIGHBD_BFP(
             BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8,
             aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16,
             aom_highbd_8_sub_pixel_avg_variance4x16,
-            aom_highbd_sad4x16x4d_bits8, aom_highbd_jnt_sad4x16_avg_bits8,
-            aom_highbd_8_jnt_sub_pixel_avg_variance4x16)
+            aom_highbd_sad4x16x4d_bits8, aom_highbd_dist_wtd_sad4x16_avg_bits8,
+            aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16)
 
         HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8,
                    aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16,
                    aom_highbd_8_sub_pixel_variance32x16,
                    aom_highbd_8_sub_pixel_avg_variance32x16,
                    aom_highbd_sad32x16x4d_bits8,
-                   aom_highbd_jnt_sad32x16_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance32x16)
+                   aom_highbd_dist_wtd_sad32x16_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16)
 
         HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8,
                    aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32,
                    aom_highbd_8_sub_pixel_variance16x32,
                    aom_highbd_8_sub_pixel_avg_variance16x32,
                    aom_highbd_sad16x32x4d_bits8,
-                   aom_highbd_jnt_sad16x32_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance16x32)
+                   aom_highbd_dist_wtd_sad16x32_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32)
 
         HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8,
                    aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32,
                    aom_highbd_8_sub_pixel_variance64x32,
                    aom_highbd_8_sub_pixel_avg_variance64x32,
                    aom_highbd_sad64x32x4d_bits8,
-                   aom_highbd_jnt_sad64x32_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance64x32)
+                   aom_highbd_dist_wtd_sad64x32_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32)
 
         HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8,
                    aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64,
                    aom_highbd_8_sub_pixel_variance32x64,
                    aom_highbd_8_sub_pixel_avg_variance32x64,
                    aom_highbd_sad32x64x4d_bits8,
-                   aom_highbd_jnt_sad32x64_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance32x64)
+                   aom_highbd_dist_wtd_sad32x64_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64)
 
         HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8,
                    aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32,
                    aom_highbd_8_sub_pixel_variance32x32,
                    aom_highbd_8_sub_pixel_avg_variance32x32,
                    aom_highbd_sad32x32x4d_bits8,
-                   aom_highbd_jnt_sad32x32_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance32x32)
+                   aom_highbd_dist_wtd_sad32x32_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32)
 
         HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8,
                    aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64,
                    aom_highbd_8_sub_pixel_variance64x64,
                    aom_highbd_8_sub_pixel_avg_variance64x64,
                    aom_highbd_sad64x64x4d_bits8,
-                   aom_highbd_jnt_sad64x64_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance64x64)
+                   aom_highbd_dist_wtd_sad64x64_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64)
 
         HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8,
                    aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16,
                    aom_highbd_8_sub_pixel_variance16x16,
                    aom_highbd_8_sub_pixel_avg_variance16x16,
                    aom_highbd_sad16x16x4d_bits8,
-                   aom_highbd_jnt_sad16x16_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance16x16)
+                   aom_highbd_dist_wtd_sad16x16_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16)
 
         HIGHBD_BFP(
             BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8,
             aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8,
             aom_highbd_8_sub_pixel_avg_variance16x8,
-            aom_highbd_sad16x8x4d_bits8, aom_highbd_jnt_sad16x8_avg_bits8,
-            aom_highbd_8_jnt_sub_pixel_avg_variance16x8)
+            aom_highbd_sad16x8x4d_bits8, aom_highbd_dist_wtd_sad16x8_avg_bits8,
+            aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8)
 
         HIGHBD_BFP(
             BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8,
             aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16,
             aom_highbd_8_sub_pixel_avg_variance8x16,
-            aom_highbd_sad8x16x4d_bits8, aom_highbd_jnt_sad8x16_avg_bits8,
-            aom_highbd_8_jnt_sub_pixel_avg_variance8x16)
-
-        HIGHBD_BFP(BLOCK_8X8, aom_highbd_sad8x8_bits8,
-                   aom_highbd_sad8x8_avg_bits8, aom_highbd_8_variance8x8,
-                   aom_highbd_8_sub_pixel_variance8x8,
-                   aom_highbd_8_sub_pixel_avg_variance8x8,
-                   aom_highbd_sad8x8x4d_bits8, aom_highbd_jnt_sad8x8_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance8x8)
-
-        HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits8,
-                   aom_highbd_sad8x4_avg_bits8, aom_highbd_8_variance8x4,
-                   aom_highbd_8_sub_pixel_variance8x4,
-                   aom_highbd_8_sub_pixel_avg_variance8x4,
-                   aom_highbd_sad8x4x4d_bits8, aom_highbd_jnt_sad8x4_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance8x4)
-
-        HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits8,
-                   aom_highbd_sad4x8_avg_bits8, aom_highbd_8_variance4x8,
-                   aom_highbd_8_sub_pixel_variance4x8,
-                   aom_highbd_8_sub_pixel_avg_variance4x8,
-                   aom_highbd_sad4x8x4d_bits8, aom_highbd_jnt_sad4x8_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance4x8)
-
-        HIGHBD_BFP(BLOCK_4X4, aom_highbd_sad4x4_bits8,
-                   aom_highbd_sad4x4_avg_bits8, aom_highbd_8_variance4x4,
-                   aom_highbd_8_sub_pixel_variance4x4,
-                   aom_highbd_8_sub_pixel_avg_variance4x4,
-                   aom_highbd_sad4x4x4d_bits8, aom_highbd_jnt_sad4x4_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance4x4)
+            aom_highbd_sad8x16x4d_bits8, aom_highbd_dist_wtd_sad8x16_avg_bits8,
+            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16)
+
+        HIGHBD_BFP(
+            BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8,
+            aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8,
+            aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x4d_bits8,
+            aom_highbd_dist_wtd_sad8x8_avg_bits8,
+            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8)
+
+        HIGHBD_BFP(
+            BLOCK_8X4, aom_highbd_sad8x4_bits8, aom_highbd_sad8x4_avg_bits8,
+            aom_highbd_8_variance8x4, aom_highbd_8_sub_pixel_variance8x4,
+            aom_highbd_8_sub_pixel_avg_variance8x4, aom_highbd_sad8x4x4d_bits8,
+            aom_highbd_dist_wtd_sad8x4_avg_bits8,
+            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4)
+
+        HIGHBD_BFP(
+            BLOCK_4X8, aom_highbd_sad4x8_bits8, aom_highbd_sad4x8_avg_bits8,
+            aom_highbd_8_variance4x8, aom_highbd_8_sub_pixel_variance4x8,
+            aom_highbd_8_sub_pixel_avg_variance4x8, aom_highbd_sad4x8x4d_bits8,
+            aom_highbd_dist_wtd_sad4x8_avg_bits8,
+            aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8)
 
         HIGHBD_BFP(
-            BLOCK_128X128, aom_highbd_sad128x128_bits8,
-            aom_highbd_sad128x128_avg_bits8, aom_highbd_8_variance128x128,
-            aom_highbd_8_sub_pixel_variance128x128,
-            aom_highbd_8_sub_pixel_avg_variance128x128,
-            aom_highbd_sad128x128x4d_bits8, aom_highbd_jnt_sad128x128_avg_bits8,
-            aom_highbd_8_jnt_sub_pixel_avg_variance128x128)
+            BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8,
+            aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4,
+            aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x4d_bits8,
+            aom_highbd_dist_wtd_sad4x4_avg_bits8,
+            aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4)
+
+        HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8,
+                   aom_highbd_sad128x128_avg_bits8,
+                   aom_highbd_8_variance128x128,
+                   aom_highbd_8_sub_pixel_variance128x128,
+                   aom_highbd_8_sub_pixel_avg_variance128x128,
+                   aom_highbd_sad128x128x4d_bits8,
+                   aom_highbd_dist_wtd_sad128x128_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128)
 
         HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8,
                    aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64,
                    aom_highbd_8_sub_pixel_variance128x64,
                    aom_highbd_8_sub_pixel_avg_variance128x64,
                    aom_highbd_sad128x64x4d_bits8,
-                   aom_highbd_jnt_sad128x64_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance128x64)
+                   aom_highbd_dist_wtd_sad128x64_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64)
 
         HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8,
                    aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128,
                    aom_highbd_8_sub_pixel_variance64x128,
                    aom_highbd_8_sub_pixel_avg_variance64x128,
                    aom_highbd_sad64x128x4d_bits8,
-                   aom_highbd_jnt_sad64x128_avg_bits8,
-                   aom_highbd_8_jnt_sub_pixel_avg_variance64x128)
+                   aom_highbd_dist_wtd_sad64x128_avg_bits8,
+                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128)
 
         HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
                     aom_highbd_8_masked_sub_pixel_variance128x128)
@@ -1815,148 +1773,148 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                    aom_highbd_10_sub_pixel_variance64x16,
                    aom_highbd_10_sub_pixel_avg_variance64x16,
                    aom_highbd_sad64x16x4d_bits10,
-                   aom_highbd_jnt_sad64x16_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance64x16);
+                   aom_highbd_dist_wtd_sad64x16_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16);
 
         HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10,
                    aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64,
                    aom_highbd_10_sub_pixel_variance16x64,
                    aom_highbd_10_sub_pixel_avg_variance16x64,
                    aom_highbd_sad16x64x4d_bits10,
-                   aom_highbd_jnt_sad16x64_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance16x64);
+                   aom_highbd_dist_wtd_sad16x64_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64);
 
         HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10,
                    aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8,
                    aom_highbd_10_sub_pixel_variance32x8,
                    aom_highbd_10_sub_pixel_avg_variance32x8,
                    aom_highbd_sad32x8x4d_bits10,
-                   aom_highbd_jnt_sad32x8_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance32x8);
+                   aom_highbd_dist_wtd_sad32x8_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8);
 
         HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10,
                    aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32,
                    aom_highbd_10_sub_pixel_variance8x32,
                    aom_highbd_10_sub_pixel_avg_variance8x32,
                    aom_highbd_sad8x32x4d_bits10,
-                   aom_highbd_jnt_sad8x32_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance8x32);
+                   aom_highbd_dist_wtd_sad8x32_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32);
 
         HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10,
                    aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4,
                    aom_highbd_10_sub_pixel_variance16x4,
                    aom_highbd_10_sub_pixel_avg_variance16x4,
                    aom_highbd_sad16x4x4d_bits10,
-                   aom_highbd_jnt_sad16x4_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance16x4);
+                   aom_highbd_dist_wtd_sad16x4_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4);
 
         HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10,
                    aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16,
                    aom_highbd_10_sub_pixel_variance4x16,
                    aom_highbd_10_sub_pixel_avg_variance4x16,
                    aom_highbd_sad4x16x4d_bits10,
-                   aom_highbd_jnt_sad4x16_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance4x16);
+                   aom_highbd_dist_wtd_sad4x16_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16);
 
         HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10,
                    aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16,
                    aom_highbd_10_sub_pixel_variance32x16,
                    aom_highbd_10_sub_pixel_avg_variance32x16,
                    aom_highbd_sad32x16x4d_bits10,
-                   aom_highbd_jnt_sad32x16_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance32x16);
+                   aom_highbd_dist_wtd_sad32x16_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16);
 
         HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10,
                    aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32,
                    aom_highbd_10_sub_pixel_variance16x32,
                    aom_highbd_10_sub_pixel_avg_variance16x32,
                    aom_highbd_sad16x32x4d_bits10,
-                   aom_highbd_jnt_sad16x32_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance16x32);
+                   aom_highbd_dist_wtd_sad16x32_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32);
 
         HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10,
                    aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32,
                    aom_highbd_10_sub_pixel_variance64x32,
                    aom_highbd_10_sub_pixel_avg_variance64x32,
                    aom_highbd_sad64x32x4d_bits10,
-                   aom_highbd_jnt_sad64x32_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance64x32);
+                   aom_highbd_dist_wtd_sad64x32_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32);
 
         HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10,
                    aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64,
                    aom_highbd_10_sub_pixel_variance32x64,
                    aom_highbd_10_sub_pixel_avg_variance32x64,
                    aom_highbd_sad32x64x4d_bits10,
-                   aom_highbd_jnt_sad32x64_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance32x64);
+                   aom_highbd_dist_wtd_sad32x64_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64);
 
         HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10,
                    aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32,
                    aom_highbd_10_sub_pixel_variance32x32,
                    aom_highbd_10_sub_pixel_avg_variance32x32,
                    aom_highbd_sad32x32x4d_bits10,
-                   aom_highbd_jnt_sad32x32_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance32x32);
+                   aom_highbd_dist_wtd_sad32x32_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32);
 
         HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10,
                    aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64,
                    aom_highbd_10_sub_pixel_variance64x64,
                    aom_highbd_10_sub_pixel_avg_variance64x64,
                    aom_highbd_sad64x64x4d_bits10,
-                   aom_highbd_jnt_sad64x64_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance64x64);
+                   aom_highbd_dist_wtd_sad64x64_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64);
 
         HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10,
                    aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16,
                    aom_highbd_10_sub_pixel_variance16x16,
                    aom_highbd_10_sub_pixel_avg_variance16x16,
                    aom_highbd_sad16x16x4d_bits10,
-                   aom_highbd_jnt_sad16x16_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance16x16);
+                   aom_highbd_dist_wtd_sad16x16_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16);
 
         HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10,
                    aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8,
                    aom_highbd_10_sub_pixel_variance16x8,
                    aom_highbd_10_sub_pixel_avg_variance16x8,
                    aom_highbd_sad16x8x4d_bits10,
-                   aom_highbd_jnt_sad16x8_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance16x8);
+                   aom_highbd_dist_wtd_sad16x8_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8);
 
         HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10,
                    aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16,
                    aom_highbd_10_sub_pixel_variance8x16,
                    aom_highbd_10_sub_pixel_avg_variance8x16,
                    aom_highbd_sad8x16x4d_bits10,
-                   aom_highbd_jnt_sad8x16_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance8x16);
+                   aom_highbd_dist_wtd_sad8x16_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16);
 
         HIGHBD_BFP(
             BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10,
             aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8,
             aom_highbd_10_sub_pixel_avg_variance8x8,
-            aom_highbd_sad8x8x4d_bits10, aom_highbd_jnt_sad8x8_avg_bits10,
-            aom_highbd_10_jnt_sub_pixel_avg_variance8x8);
+            aom_highbd_sad8x8x4d_bits10, aom_highbd_dist_wtd_sad8x8_avg_bits10,
+            aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8);
 
         HIGHBD_BFP(
             BLOCK_8X4, aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10,
             aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4,
             aom_highbd_10_sub_pixel_avg_variance8x4,
-            aom_highbd_sad8x4x4d_bits10, aom_highbd_jnt_sad8x4_avg_bits10,
-            aom_highbd_10_jnt_sub_pixel_avg_variance8x4);
+            aom_highbd_sad8x4x4d_bits10, aom_highbd_dist_wtd_sad8x4_avg_bits10,
+            aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4);
 
         HIGHBD_BFP(
             BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10,
             aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8,
             aom_highbd_10_sub_pixel_avg_variance4x8,
-            aom_highbd_sad4x8x4d_bits10, aom_highbd_jnt_sad4x8_avg_bits10,
-            aom_highbd_10_jnt_sub_pixel_avg_variance4x8);
+            aom_highbd_sad4x8x4d_bits10, aom_highbd_dist_wtd_sad4x8_avg_bits10,
+            aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8);
 
         HIGHBD_BFP(
             BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10,
             aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4,
             aom_highbd_10_sub_pixel_avg_variance4x4,
-            aom_highbd_sad4x4x4d_bits10, aom_highbd_jnt_sad4x4_avg_bits10,
-            aom_highbd_10_jnt_sub_pixel_avg_variance4x4);
+            aom_highbd_sad4x4x4d_bits10, aom_highbd_dist_wtd_sad4x4_avg_bits10,
+            aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4);
 
         HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10,
                    aom_highbd_sad128x128_avg_bits10,
@@ -1964,24 +1922,26 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                    aom_highbd_10_sub_pixel_variance128x128,
                    aom_highbd_10_sub_pixel_avg_variance128x128,
                    aom_highbd_sad128x128x4d_bits10,
-                   aom_highbd_jnt_sad128x128_avg_bits10,
-                   aom_highbd_10_jnt_sub_pixel_avg_variance128x128);
-
-        HIGHBD_BFP(
-            BLOCK_128X64, aom_highbd_sad128x64_bits10,
-            aom_highbd_sad128x64_avg_bits10, aom_highbd_10_variance128x64,
-            aom_highbd_10_sub_pixel_variance128x64,
-            aom_highbd_10_sub_pixel_avg_variance128x64,
-            aom_highbd_sad128x64x4d_bits10, aom_highbd_jnt_sad128x64_avg_bits10,
-            aom_highbd_10_jnt_sub_pixel_avg_variance128x64);
-
-        HIGHBD_BFP(
-            BLOCK_64X128, aom_highbd_sad64x128_bits10,
-            aom_highbd_sad64x128_avg_bits10, aom_highbd_10_variance64x128,
-            aom_highbd_10_sub_pixel_variance64x128,
-            aom_highbd_10_sub_pixel_avg_variance64x128,
-            aom_highbd_sad64x128x4d_bits10, aom_highbd_jnt_sad64x128_avg_bits10,
-            aom_highbd_10_jnt_sub_pixel_avg_variance64x128);
+                   aom_highbd_dist_wtd_sad128x128_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128);
+
+        HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10,
+                   aom_highbd_sad128x64_avg_bits10,
+                   aom_highbd_10_variance128x64,
+                   aom_highbd_10_sub_pixel_variance128x64,
+                   aom_highbd_10_sub_pixel_avg_variance128x64,
+                   aom_highbd_sad128x64x4d_bits10,
+                   aom_highbd_dist_wtd_sad128x64_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64);
+
+        HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10,
+                   aom_highbd_sad64x128_avg_bits10,
+                   aom_highbd_10_variance64x128,
+                   aom_highbd_10_sub_pixel_variance64x128,
+                   aom_highbd_10_sub_pixel_avg_variance64x128,
+                   aom_highbd_sad64x128x4d_bits10,
+                   aom_highbd_dist_wtd_sad64x128_avg_bits10,
+                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128);
 
         HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
                     aom_highbd_10_masked_sub_pixel_variance128x128)
@@ -2107,148 +2067,148 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                    aom_highbd_12_sub_pixel_variance64x16,
                    aom_highbd_12_sub_pixel_avg_variance64x16,
                    aom_highbd_sad64x16x4d_bits12,
-                   aom_highbd_jnt_sad64x16_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance64x16);
+                   aom_highbd_dist_wtd_sad64x16_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16);
 
         HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12,
                    aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64,
                    aom_highbd_12_sub_pixel_variance16x64,
                    aom_highbd_12_sub_pixel_avg_variance16x64,
                    aom_highbd_sad16x64x4d_bits12,
-                   aom_highbd_jnt_sad16x64_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance16x64);
+                   aom_highbd_dist_wtd_sad16x64_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64);
 
         HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12,
                    aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8,
                    aom_highbd_12_sub_pixel_variance32x8,
                    aom_highbd_12_sub_pixel_avg_variance32x8,
                    aom_highbd_sad32x8x4d_bits12,
-                   aom_highbd_jnt_sad32x8_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance32x8);
+                   aom_highbd_dist_wtd_sad32x8_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8);
 
         HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12,
                    aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32,
                    aom_highbd_12_sub_pixel_variance8x32,
                    aom_highbd_12_sub_pixel_avg_variance8x32,
                    aom_highbd_sad8x32x4d_bits12,
-                   aom_highbd_jnt_sad8x32_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance8x32);
+                   aom_highbd_dist_wtd_sad8x32_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32);
 
         HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12,
                    aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4,
                    aom_highbd_12_sub_pixel_variance16x4,
                    aom_highbd_12_sub_pixel_avg_variance16x4,
                    aom_highbd_sad16x4x4d_bits12,
-                   aom_highbd_jnt_sad16x4_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance16x4);
+                   aom_highbd_dist_wtd_sad16x4_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4);
 
         HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12,
                    aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16,
                    aom_highbd_12_sub_pixel_variance4x16,
                    aom_highbd_12_sub_pixel_avg_variance4x16,
                    aom_highbd_sad4x16x4d_bits12,
-                   aom_highbd_jnt_sad4x16_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance4x16);
+                   aom_highbd_dist_wtd_sad4x16_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16);
 
         HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12,
                    aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16,
                    aom_highbd_12_sub_pixel_variance32x16,
                    aom_highbd_12_sub_pixel_avg_variance32x16,
                    aom_highbd_sad32x16x4d_bits12,
-                   aom_highbd_jnt_sad32x16_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance32x16);
+                   aom_highbd_dist_wtd_sad32x16_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16);
 
         HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12,
                    aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32,
                    aom_highbd_12_sub_pixel_variance16x32,
                    aom_highbd_12_sub_pixel_avg_variance16x32,
                    aom_highbd_sad16x32x4d_bits12,
-                   aom_highbd_jnt_sad16x32_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance16x32);
+                   aom_highbd_dist_wtd_sad16x32_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32);
 
         HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12,
                    aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32,
                    aom_highbd_12_sub_pixel_variance64x32,
                    aom_highbd_12_sub_pixel_avg_variance64x32,
                    aom_highbd_sad64x32x4d_bits12,
-                   aom_highbd_jnt_sad64x32_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance64x32);
+                   aom_highbd_dist_wtd_sad64x32_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32);
 
         HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12,
                    aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64,
                    aom_highbd_12_sub_pixel_variance32x64,
                    aom_highbd_12_sub_pixel_avg_variance32x64,
                    aom_highbd_sad32x64x4d_bits12,
-                   aom_highbd_jnt_sad32x64_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance32x64);
+                   aom_highbd_dist_wtd_sad32x64_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64);
 
         HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12,
                    aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32,
                    aom_highbd_12_sub_pixel_variance32x32,
                    aom_highbd_12_sub_pixel_avg_variance32x32,
                    aom_highbd_sad32x32x4d_bits12,
-                   aom_highbd_jnt_sad32x32_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance32x32);
+                   aom_highbd_dist_wtd_sad32x32_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32);
 
         HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12,
                    aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64,
                    aom_highbd_12_sub_pixel_variance64x64,
                    aom_highbd_12_sub_pixel_avg_variance64x64,
                    aom_highbd_sad64x64x4d_bits12,
-                   aom_highbd_jnt_sad64x64_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance64x64);
+                   aom_highbd_dist_wtd_sad64x64_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64);
 
         HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12,
                    aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16,
                    aom_highbd_12_sub_pixel_variance16x16,
                    aom_highbd_12_sub_pixel_avg_variance16x16,
                    aom_highbd_sad16x16x4d_bits12,
-                   aom_highbd_jnt_sad16x16_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance16x16);
+                   aom_highbd_dist_wtd_sad16x16_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16);
 
         HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12,
                    aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8,
                    aom_highbd_12_sub_pixel_variance16x8,
                    aom_highbd_12_sub_pixel_avg_variance16x8,
                    aom_highbd_sad16x8x4d_bits12,
-                   aom_highbd_jnt_sad16x8_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance16x8);
+                   aom_highbd_dist_wtd_sad16x8_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8);
 
         HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12,
                    aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16,
                    aom_highbd_12_sub_pixel_variance8x16,
                    aom_highbd_12_sub_pixel_avg_variance8x16,
                    aom_highbd_sad8x16x4d_bits12,
-                   aom_highbd_jnt_sad8x16_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance8x16);
+                   aom_highbd_dist_wtd_sad8x16_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16);
 
         HIGHBD_BFP(
             BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12,
             aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8,
             aom_highbd_12_sub_pixel_avg_variance8x8,
-            aom_highbd_sad8x8x4d_bits12, aom_highbd_jnt_sad8x8_avg_bits12,
-            aom_highbd_12_jnt_sub_pixel_avg_variance8x8);
+            aom_highbd_sad8x8x4d_bits12, aom_highbd_dist_wtd_sad8x8_avg_bits12,
+            aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8);
 
         HIGHBD_BFP(
             BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12,
             aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4,
             aom_highbd_12_sub_pixel_avg_variance8x4,
-            aom_highbd_sad8x4x4d_bits12, aom_highbd_jnt_sad8x4_avg_bits12,
-            aom_highbd_12_jnt_sub_pixel_avg_variance8x4);
+            aom_highbd_sad8x4x4d_bits12, aom_highbd_dist_wtd_sad8x4_avg_bits12,
+            aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4);
 
         HIGHBD_BFP(
             BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12,
             aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8,
             aom_highbd_12_sub_pixel_avg_variance4x8,
-            aom_highbd_sad4x8x4d_bits12, aom_highbd_jnt_sad4x8_avg_bits12,
-            aom_highbd_12_jnt_sub_pixel_avg_variance4x8);
+            aom_highbd_sad4x8x4d_bits12, aom_highbd_dist_wtd_sad4x8_avg_bits12,
+            aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8);
 
         HIGHBD_BFP(
             BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12,
             aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4,
             aom_highbd_12_sub_pixel_avg_variance4x4,
-            aom_highbd_sad4x4x4d_bits12, aom_highbd_jnt_sad4x4_avg_bits12,
-            aom_highbd_12_jnt_sub_pixel_avg_variance4x4);
+            aom_highbd_sad4x4x4d_bits12, aom_highbd_dist_wtd_sad4x4_avg_bits12,
+            aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4);
 
         HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12,
                    aom_highbd_sad128x128_avg_bits12,
@@ -2256,24 +2216,26 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                    aom_highbd_12_sub_pixel_variance128x128,
                    aom_highbd_12_sub_pixel_avg_variance128x128,
                    aom_highbd_sad128x128x4d_bits12,
-                   aom_highbd_jnt_sad128x128_avg_bits12,
-                   aom_highbd_12_jnt_sub_pixel_avg_variance128x128);
-
-        HIGHBD_BFP(
-            BLOCK_128X64, aom_highbd_sad128x64_bits12,
-            aom_highbd_sad128x64_avg_bits12, aom_highbd_12_variance128x64,
-            aom_highbd_12_sub_pixel_variance128x64,
-            aom_highbd_12_sub_pixel_avg_variance128x64,
-            aom_highbd_sad128x64x4d_bits12, aom_highbd_jnt_sad128x64_avg_bits12,
-            aom_highbd_12_jnt_sub_pixel_avg_variance128x64);
-
-        HIGHBD_BFP(
-            BLOCK_64X128, aom_highbd_sad64x128_bits12,
-            aom_highbd_sad64x128_avg_bits12, aom_highbd_12_variance64x128,
-            aom_highbd_12_sub_pixel_variance64x128,
-            aom_highbd_12_sub_pixel_avg_variance64x128,
-            aom_highbd_sad64x128x4d_bits12, aom_highbd_jnt_sad64x128_avg_bits12,
-            aom_highbd_12_jnt_sub_pixel_avg_variance64x128);
+                   aom_highbd_dist_wtd_sad128x128_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128);
+
+        HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12,
+                   aom_highbd_sad128x64_avg_bits12,
+                   aom_highbd_12_variance128x64,
+                   aom_highbd_12_sub_pixel_variance128x64,
+                   aom_highbd_12_sub_pixel_avg_variance128x64,
+                   aom_highbd_sad128x64x4d_bits12,
+                   aom_highbd_dist_wtd_sad128x64_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64);
+
+        HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12,
+                   aom_highbd_sad64x128_avg_bits12,
+                   aom_highbd_12_variance64x128,
+                   aom_highbd_12_sub_pixel_variance64x128,
+                   aom_highbd_12_sub_pixel_avg_variance64x128,
+                   aom_highbd_sad64x128x4d_bits12,
+                   aom_highbd_dist_wtd_sad64x128_avg_bits12,
+                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128);
 
         HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
                     aom_highbd_12_masked_sub_pixel_variance128x128)
@@ -2433,6 +2395,16 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   assert(IMPLIES(seq_params->profile <= PROFILE_1,
                  seq_params->bit_depth <= AOM_BITS_10));
 
+  memcpy(cpi->target_seq_level_idx, oxcf->target_seq_level_idx,
+         sizeof(cpi->target_seq_level_idx));
+  cpi->keep_level_stats = 0;
+  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+    if (cpi->target_seq_level_idx[i] < SEQ_LEVELS) {
+      cpi->keep_level_stats = 1;
+      break;
+    }
+  }
+
   cm->timing_info_present = oxcf->timing_info_present;
   cm->timing_info.num_units_in_display_tick =
       oxcf->timing_info.num_units_in_display_tick;
@@ -2541,6 +2513,8 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   // Superblock size should not be updated after the first key frame.
   if (!cpi->seq_params_locked) {
     set_sb_size(&cm->seq_params, select_sb_size(cpi));
+    for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
+      seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;
   }
 
   if (cpi->initial_width || sb_size != seq_params->sb_size) {
@@ -2558,10 +2532,6 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   cpi->alt_ref_source = NULL;
   rc->is_src_frame_alt_ref = 0;
 
-  rc->is_bwd_ref_frame = 0;
-  rc->is_last_bipred_frame = 0;
-  rc->is_bipred_frame = 0;
-
   set_tile_info(cpi);
 
   cpi->ext_refresh_frame_flags_pending = 0;
@@ -2578,6 +2548,21 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   }
 }
 
+static void init_level_info(AV1LevelInfo *level_info) {
+  memset(level_info, 0, MAX_NUM_OPERATING_POINTS * sizeof(*level_info));
+  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+    AV1LevelSpec *const level_spec = &level_info[i].level_spec;
+    level_spec->level = SEQ_LEVEL_MAX;
+    AV1LevelStats *const level_stats = &level_info[i].level_stats;
+    level_stats->min_cropped_tile_width = INT_MAX;
+    level_stats->min_cropped_tile_height = INT_MAX;
+    level_stats->min_frame_width = INT_MAX;
+    level_stats->min_frame_height = INT_MAX;
+    level_stats->tile_width_is_valid = 1;
+    level_stats->min_cr = 1e8;
+  }
+}
+
 AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
                                 BufferPool *const pool) {
   unsigned int i;
@@ -2620,10 +2605,11 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
   av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
 
   cm->current_frame.frame_number = 0;
+  cm->current_frame_id = -1;
   cpi->seq_params_locked = 0;
   cpi->partition_search_skippable_frame = 0;
   cpi->tile_data = NULL;
-  cpi->last_show_frame_buf_idx = INVALID_IDX;
+  cpi->last_show_frame_buf = NULL;
   realloc_segmentation_maps(cpi);
 
   memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
@@ -2636,19 +2622,10 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
         aom_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
   }
 
-#if CONFIG_FP_MB_STATS
-  cpi->use_fp_mb_stats = 0;
-  if (cpi->use_fp_mb_stats) {
-    // a place holder used to store the first pass mb stats in the first pass
-    CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
-                    aom_calloc(cm->MBs * sizeof(uint8_t), 1));
-  } else {
-    cpi->twopass.frame_mb_stats_buf = NULL;
-  }
-#endif
-
   cpi->refresh_alt_ref_frame = 0;
 
+  init_level_info(cpi->level_info);
+
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
   cpi->b_calculate_blockiness = 1;
@@ -2659,6 +2636,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
 
   cpi->count = 0;
   cpi->bytes = 0;
+#if CONFIG_SPEED_STATS
+  cpi->tx_search_count = 0;
+#endif  // CONFIG_SPEED_STATS
 
   if (cpi->b_calculate_psnr) {
     cpi->total_sq_error = 0;
@@ -2707,19 +2687,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
     const size_t packet_sz = sizeof(FIRSTPASS_STATS);
     const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
 
-#if CONFIG_FP_MB_STATS
-    if (cpi->use_fp_mb_stats) {
-      const size_t psz = cpi->common.MBs * sizeof(uint8_t);
-      const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
-
-      cpi->twopass.firstpass_mb_stats.mb_stats_start =
-          oxcf->firstpass_mb_stats_in.buf;
-      cpi->twopass.firstpass_mb_stats.mb_stats_end =
-          cpi->twopass.firstpass_mb_stats.mb_stats_start +
-          (ps - 1) * cpi->common.MBs * sizeof(uint8_t);
-    }
-#endif
-
     cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
     cpi->twopass.stats_in = cpi->twopass.stats_in_start;
     cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
@@ -2740,11 +2707,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
                   (int32_t *)aom_memalign(
                       16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
   CHECK_MEM_ERROR(
       cm, cpi->td.mb.inter_modes_info,
       (InterModesInfo *)aom_malloc(sizeof(*cpi->td.mb.inter_modes_info)));
-#endif
 
   for (int x = 0; x < 2; x++)
     for (int y = 0; y < 2; y++)
@@ -2759,8 +2724,8 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
                   (int32_t *)aom_memalign(
                       16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
 
-  av1_set_speed_features_framesize_independent(cpi);
-  av1_set_speed_features_framesize_dependent(cpi);
+  av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
+  av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
 
   for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
     int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
@@ -2777,6 +2742,10 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
     cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
   }
 
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+  av1_zero(cpi->partition_stats);
+#endif
+
 #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
   cpi->fn_ptr[BT].sdf = SDF;                                    \
   cpi->fn_ptr[BT].sdaf = SDAF;                                  \
@@ -2789,103 +2758,109 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
 
   BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
       aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
-      aom_sad4x16x4d, aom_jnt_sad4x16_avg, aom_jnt_sub_pixel_avg_variance4x16)
+      aom_sad4x16x4d, aom_dist_wtd_sad4x16_avg,
+      aom_dist_wtd_sub_pixel_avg_variance4x16)
 
   BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
       aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
-      aom_sad16x4x4d, aom_jnt_sad16x4_avg, aom_jnt_sub_pixel_avg_variance16x4)
+      aom_sad16x4x4d, aom_dist_wtd_sad16x4_avg,
+      aom_dist_wtd_sub_pixel_avg_variance16x4)
 
   BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
       aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
-      aom_sad8x32x4d, aom_jnt_sad8x32_avg, aom_jnt_sub_pixel_avg_variance8x32)
+      aom_sad8x32x4d, aom_dist_wtd_sad8x32_avg,
+      aom_dist_wtd_sub_pixel_avg_variance8x32)
 
   BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
       aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
-      aom_sad32x8x4d, aom_jnt_sad32x8_avg, aom_jnt_sub_pixel_avg_variance32x8)
+      aom_sad32x8x4d, aom_dist_wtd_sad32x8_avg,
+      aom_dist_wtd_sub_pixel_avg_variance32x8)
 
   BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
       aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
-      aom_sad16x64x4d, aom_jnt_sad16x64_avg,
-      aom_jnt_sub_pixel_avg_variance16x64)
+      aom_sad16x64x4d, aom_dist_wtd_sad16x64_avg,
+      aom_dist_wtd_sub_pixel_avg_variance16x64)
 
   BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
       aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
-      aom_sad64x16x4d, aom_jnt_sad64x16_avg,
-      aom_jnt_sub_pixel_avg_variance64x16)
+      aom_sad64x16x4d, aom_dist_wtd_sad64x16_avg,
+      aom_dist_wtd_sub_pixel_avg_variance64x16)
 
   BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
       aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
-      aom_sad128x128x4d, aom_jnt_sad128x128_avg,
-      aom_jnt_sub_pixel_avg_variance128x128)
+      aom_sad128x128x4d, aom_dist_wtd_sad128x128_avg,
+      aom_dist_wtd_sub_pixel_avg_variance128x128)
 
   BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
       aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
-      aom_sad128x64x4d, aom_jnt_sad128x64_avg,
-      aom_jnt_sub_pixel_avg_variance128x64)
+      aom_sad128x64x4d, aom_dist_wtd_sad128x64_avg,
+      aom_dist_wtd_sub_pixel_avg_variance128x64)
 
   BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
       aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
-      aom_sad64x128x4d, aom_jnt_sad64x128_avg,
-      aom_jnt_sub_pixel_avg_variance64x128)
+      aom_sad64x128x4d, aom_dist_wtd_sad64x128_avg,
+      aom_dist_wtd_sub_pixel_avg_variance64x128)
 
   BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
       aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
-      aom_sad32x16x4d, aom_jnt_sad32x16_avg,
-      aom_jnt_sub_pixel_avg_variance32x16)
+      aom_sad32x16x4d, aom_dist_wtd_sad32x16_avg,
+      aom_dist_wtd_sub_pixel_avg_variance32x16)
 
   BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
       aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
-      aom_sad16x32x4d, aom_jnt_sad16x32_avg,
-      aom_jnt_sub_pixel_avg_variance16x32)
+      aom_sad16x32x4d, aom_dist_wtd_sad16x32_avg,
+      aom_dist_wtd_sub_pixel_avg_variance16x32)
 
   BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
       aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
-      aom_sad64x32x4d, aom_jnt_sad64x32_avg,
-      aom_jnt_sub_pixel_avg_variance64x32)
+      aom_sad64x32x4d, aom_dist_wtd_sad64x32_avg,
+      aom_dist_wtd_sub_pixel_avg_variance64x32)
 
   BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
       aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
-      aom_sad32x64x4d, aom_jnt_sad32x64_avg,
-      aom_jnt_sub_pixel_avg_variance32x64)
+      aom_sad32x64x4d, aom_dist_wtd_sad32x64_avg,
+      aom_dist_wtd_sub_pixel_avg_variance32x64)
 
   BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
       aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
-      aom_sad32x32x4d, aom_jnt_sad32x32_avg,
-      aom_jnt_sub_pixel_avg_variance32x32)
+      aom_sad32x32x4d, aom_dist_wtd_sad32x32_avg,
+      aom_dist_wtd_sub_pixel_avg_variance32x32)
 
   BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
       aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
-      aom_sad64x64x4d, aom_jnt_sad64x64_avg,
-      aom_jnt_sub_pixel_avg_variance64x64)
+      aom_sad64x64x4d, aom_dist_wtd_sad64x64_avg,
+      aom_dist_wtd_sub_pixel_avg_variance64x64)
 
   BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
       aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
-      aom_sad16x16x4d, aom_jnt_sad16x16_avg,
-      aom_jnt_sub_pixel_avg_variance16x16)
+      aom_sad16x16x4d, aom_dist_wtd_sad16x16_avg,
+      aom_dist_wtd_sub_pixel_avg_variance16x16)
 
   BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
       aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
-      aom_sad16x8x4d, aom_jnt_sad16x8_avg, aom_jnt_sub_pixel_avg_variance16x8)
+      aom_sad16x8x4d, aom_dist_wtd_sad16x8_avg,
+      aom_dist_wtd_sub_pixel_avg_variance16x8)
 
   BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
       aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
-      aom_sad8x16x4d, aom_jnt_sad8x16_avg, aom_jnt_sub_pixel_avg_variance8x16)
+      aom_sad8x16x4d, aom_dist_wtd_sad8x16_avg,
+      aom_dist_wtd_sub_pixel_avg_variance8x16)
 
   BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
       aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
-      aom_jnt_sad8x8_avg, aom_jnt_sub_pixel_avg_variance8x8)
+      aom_dist_wtd_sad8x8_avg, aom_dist_wtd_sub_pixel_avg_variance8x8)
 
   BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
       aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
-      aom_jnt_sad8x4_avg, aom_jnt_sub_pixel_avg_variance8x4)
+      aom_dist_wtd_sad8x4_avg, aom_dist_wtd_sub_pixel_avg_variance8x4)
 
   BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
       aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
-      aom_jnt_sad4x8_avg, aom_jnt_sub_pixel_avg_variance4x8)
+      aom_dist_wtd_sad4x8_avg, aom_dist_wtd_sub_pixel_avg_variance4x8)
 
   BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
       aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
-      aom_jnt_sad4x4_avg, aom_jnt_sub_pixel_avg_variance4x4)
+      aom_dist_wtd_sad4x4_avg, aom_dist_wtd_sub_pixel_avg_variance4x4)
 
 #define OBFP(BT, OSDF, OVF, OSVF) \
   cpi->fn_ptr[BT].osdf = OSDF;    \
@@ -3083,6 +3058,17 @@ void av1_remove_compressor(AV1_COMP *cpi) {
       fclose(f);
     }
 #endif  // CONFIG_INTERNAL_STATS
+#if CONFIG_SPEED_STATS
+    if (cpi->oxcf.pass != 1) {
+      fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count);
+    }
+#endif  // CONFIG_SPEED_STATS
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+    if (cpi->oxcf.pass != 1) {
+      av1_print_partition_stats(&cpi->partition_stats);
+    }
+#endif
   }
 
   for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
@@ -3090,7 +3076,7 @@ void av1_remove_compressor(AV1_COMP *cpi) {
     cpi->tpl_stats[frame].is_valid = 0;
   }
 
-  for (t = 0; t < cpi->num_workers; ++t) {
+  for (t = cpi->num_workers - 1; t >= 0; --t) {
     AVxWorker *const worker = &cpi->workers[t];
     EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
 
@@ -3099,7 +3085,7 @@ void av1_remove_compressor(AV1_COMP *cpi) {
 
     // Deallocate allocated thread data.
     if (cpi->row_mt == 1) aom_free(thread_data->td->tctx);
-    if (t < cpi->num_workers - 1) {
+    if (t > 0) {
       aom_free(thread_data->td->palette_buffer);
       aom_free(thread_data->td->tmp_conv_dst);
       for (int j = 0; j < 2; ++j) {
@@ -3109,9 +3095,7 @@ void av1_remove_compressor(AV1_COMP *cpi) {
       aom_free(thread_data->td->left_pred_buf);
       aom_free(thread_data->td->wsrc_buf);
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
       aom_free(thread_data->td->inter_modes_info);
-#endif
       for (int x = 0; x < 2; x++) {
         for (int y = 0; y < 2; y++) {
           aom_free(thread_data->td->hash_value_buffer[x][y]);
@@ -3148,12 +3132,6 @@ void av1_remove_compressor(AV1_COMP *cpi) {
     aom_free(cpi->mbgraph_stats[i].mb_stats);
   }
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    aom_free(cpi->twopass.frame_mb_stats_buf);
-    cpi->twopass.frame_mb_stats_buf = NULL;
-  }
-#endif
 #if CONFIG_INTERNAL_STATS
   aom_free(cpi->ssim_vars);
   cpi->ssim_vars = NULL;
@@ -3179,7 +3157,7 @@ static void generate_psnr_packet(AV1_COMP *cpi) {
   struct aom_codec_cx_pkt pkt;
   int i;
   PSNR_STATS psnr;
-  aom_calc_highbd_psnr(cpi->source, cpi->common.frame_to_show, &psnr,
+  aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr,
                        cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
 
   for (i = 0; i < 4; ++i) {
@@ -3198,15 +3176,6 @@ int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) {
   return 0;
 }
 
-void av1_update_reference(AV1_COMP *cpi, int ref_frame_upd_flags) {
-  cpi->ext_refresh_last_frame = (ref_frame_upd_flags & AOM_LAST_FLAG) != 0;
-  cpi->ext_refresh_golden_frame = (ref_frame_upd_flags & AOM_GOLD_FLAG) != 0;
-  cpi->ext_refresh_alt_ref_frame = (ref_frame_upd_flags & AOM_ALT_FLAG) != 0;
-  cpi->ext_refresh_bwd_ref_frame = (ref_frame_upd_flags & AOM_BWD_FLAG) != 0;
-  cpi->ext_refresh_alt2_ref_frame = (ref_frame_upd_flags & AOM_ALT2_FLAG) != 0;
-  cpi->ext_refresh_frame_flags_pending = 1;
-}
-
 int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
@@ -3269,62 +3238,6 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
 }
 #endif
 
-static void check_show_existing_frame(AV1_COMP *cpi) {
-  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-  AV1_COMMON *const cm = &cpi->common;
-  const FRAME_UPDATE_TYPE next_frame_update_type =
-      gf_group->update_type[gf_group->index];
-#if USE_SYMM_MULTI_LAYER
-  const int which_arf = (cpi->new_bwdref_update_rule == 1)
-                            ? gf_group->arf_update_idx[gf_group->index] > 0
-                            : gf_group->arf_update_idx[gf_group->index];
-#else
-  const int which_arf = gf_group->arf_update_idx[gf_group->index];
-#endif
-
-  if (cm->show_existing_frame == 1) {
-    cm->show_existing_frame = 0;
-  } else if (cpi->rc.is_last_bipred_frame) {
-#if USE_SYMM_MULTI_LAYER
-    // NOTE: When new structure is used, every bwdref will have one overlay
-    //       frame. Therefore, there is no need to find out which frame to
-    //       show in advance.
-    if (cpi->new_bwdref_update_rule == 0) {
-#endif
-      // NOTE: If the current frame is a last bi-predictive frame, it is
-      //       needed next to show the BWDREF_FRAME, which is pointed by
-      //       the last_fb_idxes[0] after reference frame buffer update
-      cpi->rc.is_last_bipred_frame = 0;
-      cm->show_existing_frame = 1;
-      cpi->existing_fb_idx_to_show = cpi->remapped_ref_idx[0];
-#if USE_SYMM_MULTI_LAYER
-    }
-#endif
-  } else if (cpi->is_arf_filter_off[which_arf] &&
-             (next_frame_update_type == OVERLAY_UPDATE ||
-              next_frame_update_type == INTNL_OVERLAY_UPDATE)) {
-#if USE_SYMM_MULTI_LAYER
-    const int bwdref_to_show =
-        (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
-#else
-    const int bwdref_to_show = ALTREF2_FRAME;
-#endif
-    // Other parameters related to OVERLAY_UPDATE will be taken care of
-    // in av1_rc_get_second_pass_params(cpi)
-    cm->show_existing_frame = 1;
-    cpi->rc.is_src_frame_alt_ref = 1;
-    cpi->existing_fb_idx_to_show =
-        (next_frame_update_type == OVERLAY_UPDATE)
-            ? get_ref_frame_map_idx(cpi, ALTREF_FRAME)
-            : get_ref_frame_map_idx(cpi, bwdref_to_show);
-#if USE_SYMM_MULTI_LAYER
-    if (cpi->new_bwdref_update_rule == 0)
-#endif
-      cpi->is_arf_filter_off[which_arf] = 0;
-  }
-  cpi->rc.is_src_frame_ext_arf = 0;
-}
-
 #ifdef OUTPUT_YUV_REC
 void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
   uint8_t *src = s->y_buffer;
@@ -3433,379 +3346,6 @@ static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q,
   return force_recode;
 }
 
-#define DUMP_REF_FRAME_IMAGES 0
-
-#if DUMP_REF_FRAME_IMAGES == 1
-static int dump_one_image(AV1_COMMON *cm,
-                          const YV12_BUFFER_CONFIG *const ref_buf,
-                          char *file_name) {
-  int h;
-  FILE *f_ref = NULL;
-
-  if (ref_buf == NULL) {
-    printf("Frame data buffer is NULL.\n");
-    return AOM_CODEC_MEM_ERROR;
-  }
-
-  if ((f_ref = fopen(file_name, "wb")) == NULL) {
-    printf("Unable to open file %s to write.\n", file_name);
-    return AOM_CODEC_MEM_ERROR;
-  }
-
-  // --- Y ---
-  for (h = 0; h < cm->height; ++h) {
-    fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref);
-  }
-  // --- U ---
-  for (h = 0; h < (cm->height >> 1); ++h) {
-    fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
-           f_ref);
-  }
-  // --- V ---
-  for (h = 0; h < (cm->height >> 1); ++h) {
-    fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
-           f_ref);
-  }
-
-  fclose(f_ref);
-
-  return AOM_CODEC_OK;
-}
-
-static void dump_ref_frame_images(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  MV_REFERENCE_FRAME ref_frame;
-
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    char file_name[256] = "";
-    snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv",
-             cm->current_frame.frame_number, ref_frame);
-    dump_one_image(cm, get_ref_frame_buffer(cpi, ref_frame), file_name);
-  }
-}
-#endif  // DUMP_REF_FRAME_IMAGES == 1
-
-// This function is used to shift the virtual indices of last reference frames
-// as follows:
-// LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
-// when the LAST_FRAME is updated.
-static INLINE void shift_last_ref_frames(AV1_COMP *cpi) {
-  // TODO(isbs): shift the scaled indices as well
-  for (int ref_frame = LAST3_FRAME; ref_frame > LAST_FRAME; --ref_frame) {
-    const int ref_idx = ref_frame - LAST_FRAME;
-    cpi->remapped_ref_idx[ref_idx] = cpi->remapped_ref_idx[ref_idx - 1];
-
-    if (!cpi->rc.is_src_frame_alt_ref) {
-      memcpy(cpi->interp_filter_selected[ref_frame],
-             cpi->interp_filter_selected[ref_frame - 1],
-             sizeof(cpi->interp_filter_selected[ref_frame - 1]));
-    }
-  }
-}
-
-#if USE_SYMM_MULTI_LAYER
-// This function is used to shift the virtual indices of bwd reference
-// frames as follows:
-// BWD_REF -> ALT2_REF -> EXT_REF
-// to clear a space to store the closest bwdref
-static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) {
-  // TODO(isbs): shift the scaled indices as well
-  static const int ordered_bwd[3] = { BWDREF_FRAME, ALTREF2_FRAME,
-                                      EXTREF_FRAME };
-
-  for (int i = 2; i > 0; --i) {
-    // [0] is allocated to the current coded frame, i.e. bwdref
-    memcpy(cpi->interp_filter_selected[ordered_bwd[i]],
-           cpi->interp_filter_selected[ordered_bwd[i - 1]],
-           sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1]]));
-
-    cpi->remapped_ref_idx[ordered_bwd[i] - LAST_FRAME] =
-        cpi->remapped_ref_idx[ordered_bwd[i - 1] - LAST_FRAME];
-  }
-}
-
-// This function is used to shift the virtual indices of bwd reference
-// frames as follows:
-// BWD_REF <- ALT2_REF <- EXT_REF
-// to update the bwd reference frame for coding the next frame.
-static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) {
-  // TODO(isbs): shift the scaled indices as well
-  static const int ordered_bwd[3] = { BWDREF_FRAME, ALTREF2_FRAME,
-                                      EXTREF_FRAME };
-
-  for (int i = 0; i < 2; ++i) {
-    // [0] is allocated to the current coded frame, i.e. bwdref
-    memcpy(cpi->interp_filter_selected[ordered_bwd[i]],
-           cpi->interp_filter_selected[ordered_bwd[i + 1]],
-           sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1]]));
-
-    cpi->remapped_ref_idx[ordered_bwd[i] - LAST_FRAME] =
-        cpi->remapped_ref_idx[ordered_bwd[i + 1] - LAST_FRAME];
-  }
-}
-#endif  // USE_SYMM_MULTI_LAYER
-
-static void update_reference_frames(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-
-  // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
-  //       for the purpose to verify no mismatch between encoder and decoder.
-  if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
-
-  // In the case of show_existing frame, we will not send fresh flag
-  // to decoder. Any change in the reference frame buffer can be done by
-  // switching the virtual indices.
-  if (cm->show_existing_frame) {
-    // If we are not indicating to the decoder that this frame is
-    // a show_existing_frame, which occurs in error_resilient mode,
-    // we still want to refresh the LAST_FRAME when the current frame
-    // was the source of an ext_arf.
-    cpi->refresh_last_frame =
-        !encode_show_existing_frame(cm) && cpi->rc.is_src_frame_ext_arf;
-    cpi->refresh_golden_frame = 0;
-    cpi->refresh_bwd_ref_frame = 0;
-    cpi->refresh_alt2_ref_frame = 0;
-    cpi->refresh_alt_ref_frame = 0;
-
-    cpi->rc.is_bwd_ref_frame = 0;
-    cpi->rc.is_last_bipred_frame = 0;
-    cpi->rc.is_bipred_frame = 0;
-  }
-
-  BufferPool *const pool = cm->buffer_pool;
-
-  // At this point the new frame has been encoded.
-  // If any buffer copy / swapping is signaled it should be done here.
-
-  // Only update all of the reference buffers if a KEY_FRAME is also a
-  // show_frame. This ensures a fwd keyframe does not update all of the buffers
-  if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
-      frame_is_sframe(cm)) {
-    for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
-      assign_frame_buffer(pool->frame_bufs,
-                          &cm->ref_frame_map[cpi->remapped_ref_idx[ref_frame]],
-                          cm->new_fb_idx);
-    }
-    return;
-  }
-
-  if (av1_preserve_existing_gf(cpi)) {
-    // We have decided to preserve the previously existing golden frame as our
-    // new ARF frame. However, in the short term in function
-    // av1_bitstream.c::get_refresh_mask() we left it in the GF slot and, if
-    // we're updating the GF with the current decoded frame, we save it to the
-    // ARF slot instead.
-    // We now have to update the ARF with the current frame and swap gld_fb_idx
-    // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF
-    // slot and, if we're updating the GF, the current frame becomes the new GF.
-    int tmp;
-
-    // ARF in general is a better reference than overlay. We shouldkeep ARF as
-    // reference instead of replacing it with overlay.
-
-    if (!cpi->preserve_arf_as_gld) {
-      assign_frame_buffer(
-          pool->frame_bufs,
-          &cm->ref_frame_map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)],
-          cm->new_fb_idx);
-    }
-
-    tmp = get_ref_frame_map_idx(cpi, ALTREF_FRAME);
-    cpi->remapped_ref_idx[ALTREF_FRAME - 1] =
-        get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
-    cpi->remapped_ref_idx[GOLDEN_FRAME - 1] = tmp;
-
-    // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
-    // cpi->interp_filter_selected[GOLDEN_FRAME]?
-  } else if (cpi->rc.is_src_frame_ext_arf && encode_show_existing_frame(cm)) {
-#if CONFIG_DEBUG
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE);
-#endif
-#if USE_SYMM_MULTI_LAYER
-    const int bwdref_to_show =
-        (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
-#else
-    const int bwdref_to_show = ALTREF2_FRAME;
-#endif
-    // Deal with the special case for showing existing internal ALTREF_FRAME
-    // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
-    // by updating the virtual indices.
-    const int last3_remapped_idx = get_ref_frame_map_idx(cpi, LAST3_FRAME);
-    shift_last_ref_frames(cpi);
-
-    cpi->remapped_ref_idx[LAST_FRAME - 1] =
-        get_ref_frame_map_idx(cpi, bwdref_to_show);
-
-    memcpy(cpi->interp_filter_selected[LAST_FRAME],
-           cpi->interp_filter_selected[bwdref_to_show],
-           sizeof(cpi->interp_filter_selected[bwdref_to_show]));
-#if USE_SYMM_MULTI_LAYER
-    if (cpi->new_bwdref_update_rule == 1) {
-      lshift_bwd_ref_frames(cpi);
-      // pass outdated forward reference frame (previous LAST3) to the
-      // spared space
-      cpi->remapped_ref_idx[EXTREF_FRAME - 1] = last3_remapped_idx;
-    } else {
-#endif
-      cpi->remapped_ref_idx[bwdref_to_show - 1] = last3_remapped_idx;
-#if USE_SYMM_MULTI_LAYER
-    }
-#endif
-  } else { /* For non key/golden frames */
-    // === ALTREF_FRAME ===
-    if (cpi->refresh_alt_ref_frame) {
-      int arf_idx = get_ref_frame_map_idx(cpi, ALTREF_FRAME);
-      assign_frame_buffer(pool->frame_bufs, &cm->ref_frame_map[arf_idx],
-                          cm->new_fb_idx);
-
-      memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
-             cpi->interp_filter_selected[0],
-             sizeof(cpi->interp_filter_selected[0]));
-    }
-
-    // === GOLDEN_FRAME ===
-    if (cpi->refresh_golden_frame) {
-      assign_frame_buffer(
-          pool->frame_bufs,
-          &cm->ref_frame_map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)],
-          cm->new_fb_idx);
-
-      memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
-             cpi->interp_filter_selected[0],
-             sizeof(cpi->interp_filter_selected[0]));
-    }
-
-    // === BWDREF_FRAME ===
-    if (cpi->refresh_bwd_ref_frame) {
-#if USE_SYMM_MULTI_LAYER
-      if (cpi->new_bwdref_update_rule) {
-        // We shift the backward reference frame as follows:
-        // BWDREF -> ALTREF2 -> EXTREF
-        // and assign the newly coded frame to BWDREF so that it always
-        // keeps the nearest future frame
-        int tmp = get_ref_frame_map_idx(cpi, EXTREF_FRAME);
-        assign_frame_buffer(pool->frame_bufs, &cm->ref_frame_map[tmp],
-                            cm->new_fb_idx);
-
-        rshift_bwd_ref_frames(cpi);
-        cpi->remapped_ref_idx[BWDREF_FRAME - 1] = tmp;
-      } else {
-#endif  // USE_SYMM_MULTI_LAYER
-        assign_frame_buffer(
-            pool->frame_bufs,
-            &cm->ref_frame_map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)],
-            cm->new_fb_idx);
-#if USE_SYMM_MULTI_LAYER
-      }
-#endif
-      memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
-             cpi->interp_filter_selected[0],
-             sizeof(cpi->interp_filter_selected[0]));
-    }
-
-    // === ALTREF2_FRAME ===
-    if (cpi->refresh_alt2_ref_frame) {
-      assign_frame_buffer(
-          pool->frame_bufs,
-          &cm->ref_frame_map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)],
-          cm->new_fb_idx);
-
-      memcpy(cpi->interp_filter_selected[ALTREF2_FRAME],
-             cpi->interp_filter_selected[0],
-             sizeof(cpi->interp_filter_selected[0]));
-    }
-  }
-
-  if (cpi->refresh_last_frame) {
-    // NOTE(zoeliu): We have two layers of mapping (1) from the per-frame
-    // reference to the reference frame buffer virtual index; and then (2) from
-    // the virtual index to the reference frame buffer physical index:
-    //
-    // LAST_FRAME,                       ...,     EXTREF_FRAME
-    //      |                                           |
-    //      v                                           v
-    // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1]
-    //      |                                           |
-    //      v                                           v
-    // ref_frame_map[],                  ...,    ref_frame_map[]
-    //
-    // When refresh_last_frame is set, it is intended to retire LAST3_FRAME,
-    // have the other 2 LAST reference frames shifted as follows:
-    // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
-    // , and then have LAST_FRAME refreshed by the newly coded frame.
-    //
-    // To fulfill it, the decoder will be notified to execute following 2 steps:
-    //
-    // (a) To change ref_frame_map[] and have the virtual index of LAST3_FRAME
-    //     to point to the newly coded frame, i.e.
-    //     ref_frame_map[lst_fb_idexes[2]] => new_fb_idx;
-    //
-    // (b) To change the 1st layer mapping to have LAST_FRAME mapped to the
-    //     original virtual index of LAST3_FRAME and have the other mappings
-    //     shifted as follows:
-    // LAST_FRAME,            LAST2_FRAME,             LAST3_FRAME
-    //      |                      |                        |
-    //      v                      v                        v
-    // remapped_ref_idx[2],   remapped_ref_idx[0],     remapped_ref_idx[1]
-    assign_frame_buffer(
-        pool->frame_bufs,
-        &cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST3_FRAME)],
-        cm->new_fb_idx);
-
-    int last3_remapped_idx = get_ref_frame_map_idx(cpi, LAST3_FRAME);
-
-    shift_last_ref_frames(cpi);
-    cpi->remapped_ref_idx[LAST_FRAME - 1] = last3_remapped_idx;
-
-    assert(!encode_show_existing_frame(cm));
-    memcpy(cpi->interp_filter_selected[LAST_FRAME],
-           cpi->interp_filter_selected[0],
-           sizeof(cpi->interp_filter_selected[0]));
-
-    // If the new structure is used, we will always have overlay frames coupled
-    // with bwdref frames. Therefore, we won't have to perform this update
-    // in advance (we do this update when the overlay frame shows up).
-#if USE_SYMM_MULTI_LAYER
-    if (cpi->new_bwdref_update_rule == 0 && cpi->rc.is_last_bipred_frame) {
-#else
-    if (cpi->rc.is_last_bipred_frame) {
-#endif
-      // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the
-      // LAST3_FRAME by updating the virtual indices.
-      //
-      // NOTE: The source frame for BWDREF does not have a holding position as
-      //       the OVERLAY frame for ALTREF's. Hence, to resolve the reference
-      //       virtual index reshuffling for BWDREF, the encoder always
-      //       specifies a LAST_BIPRED right before BWDREF and completes the
-      //       reshuffling job accordingly.
-      last3_remapped_idx = get_ref_frame_map_idx(cpi, LAST3_FRAME);
-
-      shift_last_ref_frames(cpi);
-      cpi->remapped_ref_idx[LAST_FRAME - 1] =
-          get_ref_frame_map_idx(cpi, BWDREF_FRAME);
-      cpi->remapped_ref_idx[BWDREF_FRAME - 1] = last3_remapped_idx;
-
-      memcpy(cpi->interp_filter_selected[LAST_FRAME],
-             cpi->interp_filter_selected[BWDREF_FRAME],
-             sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
-    }
-  }
-
-#if DUMP_REF_FRAME_IMAGES == 1
-  // Dump out all reference frame images.
-  dump_ref_frame_images(cpi);
-#endif  // DUMP_REF_FRAME_IMAGES
-}
-
-static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) {
-  assert(buffer_idx != INVALID_IDX);
-  RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
-  ensure_mv_buffer(new_fb_ptr, cm);
-  new_fb_ptr->width = cm->width;
-  new_fb_ptr->height = cm->height;
-}
-
 static void scale_references(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
@@ -3820,68 +3360,79 @@ static void scale_references(AV1_COMP *cpi) {
     if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
       BufferPool *const pool = cm->buffer_pool;
       const YV12_BUFFER_CONFIG *const ref =
-          get_ref_frame_buffer(cpi, ref_frame);
+          get_ref_frame_yv12_buf(cm, ref_frame);
 
       if (ref == NULL) {
-        cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+        cpi->scaled_ref_buf[ref_frame - 1] = NULL;
         continue;
       }
 
       if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
-        RefCntBuffer *new_fb_ptr = NULL;
+        // Replace the reference buffer with a copy having a thicker border,
+        // if the reference buffer is higher resolution than the current
+        // frame, and the border is thin.
+        if ((ref->y_crop_width > cm->width ||
+             ref->y_crop_height > cm->height) &&
+            ref->border < AOM_BORDER_IN_PIXELS) {
+          RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame);
+          if (aom_yv12_realloc_with_new_border(
+                  &ref_fb->buf, AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+                  num_planes) != 0) {
+            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate frame buffer");
+          }
+        }
         int force_scaling = 0;
-        int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
-        if (new_fb == INVALID_IDX) {
-          new_fb = get_free_fb(cm);
-          if (new_fb == INVALID_IDX)
+        RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1];
+        if (new_fb == NULL) {
+          const int new_fb_idx = get_free_fb(cm);
+          if (new_fb_idx == INVALID_IDX) {
             aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                                "Unable to find free frame buffer");
+          }
           force_scaling = 1;
+          new_fb = &pool->frame_bufs[new_fb_idx];
         }
-        new_fb_ptr = &pool->frame_bufs[new_fb];
-        if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
-            new_fb_ptr->buf.y_crop_height != cm->height) {
+
+        if (force_scaling || new_fb->buf.y_crop_width != cm->width ||
+            new_fb->buf.y_crop_height != cm->height) {
           if (aom_realloc_frame_buffer(
-                  &new_fb_ptr->buf, cm->width, cm->height,
+                  &new_fb->buf, cm->width, cm->height,
                   cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
                   cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
                   cm->byte_alignment, NULL, NULL, NULL)) {
             if (force_scaling) {
               // Release the reference acquired in the get_free_fb() call above.
-              --new_fb_ptr->ref_count;
+              --new_fb->ref_count;
             }
             aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
           }
           av1_resize_and_extend_frame(
-              ref, &new_fb_ptr->buf, (int)cm->seq_params.bit_depth, num_planes);
-          cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+              ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes);
+          cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
         }
       } else {
-        const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
-        RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
+        RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
         buf->buf.y_crop_width = ref->y_crop_width;
         buf->buf.y_crop_height = ref->y_crop_height;
-        cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+        cpi->scaled_ref_buf[ref_frame - 1] = buf;
         ++buf->ref_count;
       }
     } else {
-      if (cpi->oxcf.pass != 0) cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+      if (cpi->oxcf.pass != 0) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
     }
   }
 }
 
 static void release_scaled_references(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  int i;
   // TODO(isbs): only refresh the necessary frames, rather than all of them
-  for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-    const int idx = cpi->scaled_ref_idx[i];
-    if (idx != INVALID_IDX) {
-      RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx];
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+    if (buf != NULL) {
       --buf->ref_count;
-      cpi->scaled_ref_idx[i] = INVALID_IDX;
+      cpi->scaled_ref_buf[i] = NULL;
     }
   }
 }
@@ -3911,6 +3462,71 @@ static void set_mv_search_params(AV1_COMP *cpi) {
   }
 }
 
+static void set_screen_content_options(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+
+  if (cm->seq_params.force_screen_content_tools != 2) {
+    cm->allow_screen_content_tools = cm->allow_intrabc =
+        cm->seq_params.force_screen_content_tools;
+    return;
+  }
+
+  if (cpi->oxcf.content == AOM_CONTENT_SCREEN) {
+    cm->allow_screen_content_tools = cm->allow_intrabc = 1;
+    return;
+  }
+
+  // Estimate if the source frame is screen content, based on the portion of
+  // blocks that have few luma colors.
+  const uint8_t *src = cpi->source->y_buffer;
+  assert(src != NULL);
+  const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int stride = cpi->source->y_stride;
+  const int width = cpi->source->y_width;
+  const int height = cpi->source->y_height;
+  const int bd = cm->seq_params.bit_depth;
+  const int blk_w = 16;
+  const int blk_h = 16;
+  // These threshold values are selected experimentally.
+  const int color_thresh = 4;
+  const unsigned int var_thresh = 0;
+  // Counts of blocks with no more than color_thresh colors.
+  int counts_1 = 0;
+  // Counts of blocks with no more than color_thresh colors and variance larger
+  // than var_thresh.
+  int counts_2 = 0;
+
+  for (int r = 0; r + blk_h <= height; r += blk_h) {
+    for (int c = 0; c + blk_w <= width; c += blk_w) {
+      int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+      const uint8_t *const this_src = src + r * stride + c;
+      const int n_colors =
+          use_hbd ? av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd,
+                                            count_buf)
+                  : av1_count_colors(this_src, stride, blk_w, blk_h, count_buf);
+      if (n_colors > 1 && n_colors <= color_thresh) {
+        ++counts_1;
+        struct buf_2d buf;
+        buf.stride = stride;
+        buf.buf = (uint8_t *)this_src;
+        const unsigned int var =
+            use_hbd
+                ? av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16, bd)
+                : av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16);
+        if (var > var_thresh) ++counts_2;
+      }
+    }
+  }
+
+  // The threshold values are selected experimentally.
+  cm->allow_screen_content_tools =
+      counts_1 * blk_h * blk_w * 10 > width * height;
+  // IntraBC would force loop filters off, so we use more strict rules that also
+  // requires that the block has high variance.
+  cm->allow_intrabc = cm->allow_screen_content_tools &&
+                      counts_2 * blk_h * blk_w * 15 > width * height;
+}
+
 static void set_size_independent_vars(AV1_COMP *cpi) {
   int i;
   AV1_COMMON *cm = &cpi->common;
@@ -3918,25 +3534,14 @@ static void set_size_independent_vars(AV1_COMP *cpi) {
     cm->global_motion[i] = default_warp_params;
   }
   cpi->global_motion_search_done = 0;
-  av1_set_speed_features_framesize_independent(cpi);
+
+  if (frame_is_intra_only(cm)) set_screen_content_options(cpi);
+  cpi->is_screen_content_type = (cm->allow_screen_content_tools != 0);
+
+  av1_set_speed_features_framesize_independent(cpi, cpi->speed);
   av1_set_rd_speed_thresholds(cpi);
-  av1_set_rd_speed_thresholds_sub8x8(cpi);
   cm->interp_filter = SWITCHABLE;
   cm->switchable_motion_mode = 1;
-
-  if (frame_is_intra_only(cm)) {
-    if (cm->seq_params.force_screen_content_tools == 2) {
-      cm->allow_screen_content_tools =
-          cpi->oxcf.content == AOM_CONTENT_SCREEN ||
-          is_screen_content(cpi->source->y_buffer,
-                            cpi->source->flags & YV12_FLAG_HIGHBITDEPTH,
-                            cm->seq_params.bit_depth, cpi->source->y_stride,
-                            cpi->source->y_width, cpi->source->y_height);
-    } else {
-      cm->allow_screen_content_tools =
-          cm->seq_params.force_screen_content_tools;
-    }
-  }
 }
 
 static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
@@ -3945,7 +3550,7 @@ static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
 
   // Setup variables that depend on the dimensions of the frame.
-  av1_set_speed_features_framesize_dependent(cpi);
+  av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
 
   // Decide q and q bounds.
   *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, bottom_index,
@@ -3966,11 +3571,17 @@ static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
 
 static void init_motion_estimation(AV1_COMP *cpi) {
   int y_stride = cpi->scaled_source.y_stride;
+  int y_stride_src = (cpi->oxcf.resize_mode || cpi->oxcf.superres_mode)
+                         ? y_stride
+                         : cpi->lookahead->buf->img.y_stride;
 
   if (cpi->sf.mv.search_method == NSTEP) {
-    av1_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+    av1_init3smotion_compensation(&cpi->ss_cfg[SS_CFG_SRC], y_stride);
+    av1_init3smotion_compensation(&cpi->ss_cfg[SS_CFG_LOOKAHEAD], y_stride_src);
   } else if (cpi->sf.mv.search_method == DIAMOND) {
-    av1_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+    av1_init_dsmotion_compensation(&cpi->ss_cfg[SS_CFG_SRC], y_stride);
+    av1_init_dsmotion_compensation(&cpi->ss_cfg[SS_CFG_LOOKAHEAD],
+                                   y_stride_src);
   }
 }
 
@@ -3999,10 +3610,9 @@ static void init_ref_frame_bufs(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   int i;
   BufferPool *const pool = cm->buffer_pool;
-  cm->new_fb_idx = INVALID_IDX;
   cm->cur_frame = NULL;
   for (i = 0; i < REF_FRAMES; ++i) {
-    cm->ref_frame_map[i] = INVALID_IDX;
+    cm->ref_frame_map[i] = NULL;
   }
   for (i = 0; i < FRAME_BUFFERS; ++i) {
     pool->frame_bufs[i].ref_count = 0;
@@ -4064,7 +3674,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) {
   return 0;
 }
 
-static void set_frame_size(AV1_COMP *cpi, int width, int height) {
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *const cm = &cpi->common;
   const SequenceHeader *const seq_params = &cm->seq_params;
   const int num_planes = av1_num_planes(cm);
@@ -4083,7 +3693,7 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
     av1_set_target_rate(cpi, cm->width, cm->height);
   }
 
-  alloc_frame_mvs(cm, cm->new_fb_idx);
+  alloc_frame_mvs(cm, cm->cur_frame);
 
   // Allocate above context buffers
   if (cm->num_allocated_above_context_planes < av1_num_planes(cm) ||
@@ -4099,7 +3709,7 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
   if (aom_realloc_frame_buffer(
           &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+          cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
 
@@ -4116,20 +3726,13 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
   init_motion_estimation(cpi);
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    RefBuffer *const ref_buf =
-        &cm->current_frame.frame_refs[ref_frame - LAST_FRAME];
-    const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
-
-    if (buf_idx != INVALID_IDX) {
-      RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[buf_idx];
-      ref_buf->buf = buf;
-      av1_setup_scale_factors_for_frame(&ref_buf->sf, buf->buf.y_crop_width,
+    RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    if (buf != NULL) {
+      struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
+      av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width,
                                         buf->buf.y_crop_height, cm->width,
                                         cm->height);
-      if (av1_is_scaled(&ref_buf->sf))
-        aom_extend_frame_borders(&buf->buf, num_planes);
-    } else {
-      ref_buf->buf = NULL;
+      if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes);
     }
   }
 
@@ -4161,24 +3764,33 @@ static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
   return new_denom;
 }
 
-#define ENERGY_BY_Q2_THRESH 0.015
+#define ENERGY_BY_Q2_THRESH 0.01
+#define ENERGY_BY_AC_THRESH 0.2
 
 static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy,
-                                                     double thresh) {
+                                                     double threshq,
+                                                     double threshp) {
   const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8);
-  const double threshq2 = thresh * q * q;
+  const double tq = threshq * q * q;
+  const double tp = threshp * energy[1];
+  const double thresh = AOMMIN(tq, tp);
   int k;
-  for (k = 8; k > 0; --k) {
-    if (energy[k - 1] > threshq2) break;
+  for (k = 16; k > 8; --k) {
+    if (energy[k - 1] > thresh) break;
   }
-  return 2 * SCALE_NUMERATOR - k;
+  return 3 * SCALE_NUMERATOR - k;
 }
 
 static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex) {
-  double energy[8];
+  double energy[16];
   analyze_hor_freq(cpi, energy);
-  return get_superres_denom_from_qindex_energy(qindex, energy,
-                                               ENERGY_BY_Q2_THRESH);
+  /*
+  printf("\nenergy = [");
+  for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]);
+  printf("]\n");
+  */
+  return get_superres_denom_from_qindex_energy(
+      qindex, energy, ENERGY_BY_Q2_THRESH, ENERGY_BY_AC_THRESH);
 }
 
 static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
@@ -4216,25 +3828,31 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
       const int qthresh = (frame_is_intra_only(&cpi->common))
                               ? oxcf->superres_kf_qthresh
                               : oxcf->superres_qthresh;
-      if (q < qthresh) {
+      if (q <= qthresh) {
         new_denom = SCALE_NUMERATOR;
       } else {
-        // TODO(debargha): Experiment with the variant below.
-        // new_denom = get_superres_denom_for_qindex(cpi, q);
-        uint8_t max_denom = get_superres_denom_for_qindex(cpi, MAXQ);
-        if (max_denom == SCALE_NUMERATOR) {
-          new_denom = max_denom;
-          break;
-        } else {
-          const uint8_t q_denom_step =
-              max_denom - SCALE_NUMERATOR == 0
-                  ? 255
-                  : (MAXQ - qthresh + 1 + max_denom - SCALE_NUMERATOR - 1) /
-                        (max_denom - SCALE_NUMERATOR);
-          const uint8_t additional_denom =
-              (q - qthresh + 1 + q_denom_step - 1) / q_denom_step;
-          new_denom = AOMMIN(SCALE_NUMERATOR + additional_denom, max_denom);
-        }
+        new_denom = get_superres_denom_for_qindex(cpi, q);
+      }
+      break;
+    }
+    case SUPERRES_AUTO: {
+      // Don't use when screen content tools are used.
+      if (cpi->common.allow_screen_content_tools) break;
+      // Don't use for inter frames.
+      if (!frame_is_intra_only(&cpi->common)) break;
+      // Don't use for keyframes that can be used as references.
+      if (cpi->rc.frames_to_key != 1) break;
+
+      // Now decide the use of superres based on 'q'.
+      int bottom_index, top_index;
+      const int q = av1_rc_pick_q_and_bounds(
+          cpi, cpi->oxcf.width, cpi->oxcf.height, &bottom_index, &top_index);
+
+      const int qthresh = 128;
+      if (q <= qthresh) {
+        new_denom = SCALE_NUMERATOR;
+      } else {
+        new_denom = get_superres_denom_for_qindex(cpi, q);
       }
       break;
     }
@@ -4311,7 +3929,7 @@ static int validate_size_scales(RESIZE_MODE resize_mode,
 }
 
 // Calculates resize and superres params for next frame
-size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) {
+static size_params_type calculate_next_size_params(AV1_COMP *cpi) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
   size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR };
   int resize_denom;
@@ -4334,7 +3952,8 @@ size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) {
   return rsz;
 }
 
-static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) {
+static void setup_frame_size_from_params(AV1_COMP *cpi,
+                                         const size_params_type *rsz) {
   int encode_width = rsz->resize_width;
   int encode_height = rsz->resize_height;
 
@@ -4344,12 +3963,17 @@ static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) {
   cm->superres_scale_denominator = rsz->superres_denom;
   av1_calculate_scaled_superres_size(&encode_width, &encode_height,
                                      rsz->superres_denom);
-  set_frame_size(cpi, encode_width, encode_height);
+  av1_set_frame_size(cpi, encode_width, encode_height);
 }
 
-static void setup_frame_size(AV1_COMP *cpi) {
-  size_params_type rsz = av1_calculate_next_size_params(cpi);
+void av1_setup_frame_size(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  // Reset superres params from previous frame.
+  cm->superres_scale_denominator = SCALE_NUMERATOR;
+  const size_params_type rsz = calculate_next_size_params(cpi);
   setup_frame_size_from_params(cpi, &rsz);
+
+  assert(is_min_tile_width_satisfied(cm));
 }
 
 static void superres_post_encode(AV1_COMP *cpi) {
@@ -4398,237 +4022,431 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
   assert(IMPLIES(is_lossless_requested(&cpi->oxcf),
                  cm->coded_lossless && cm->all_lossless));
 
-  const int no_loopfilter = cm->coded_lossless || cm->large_scale_tile;
-  const int no_cdef =
-      !cm->seq_params.enable_cdef || cm->coded_lossless || cm->large_scale_tile;
-  const int no_restoration = !cm->seq_params.enable_restoration ||
-                             cm->all_lossless || cm->large_scale_tile;
+  const int use_loopfilter = !cm->coded_lossless && !cm->large_scale_tile;
+  const int use_cdef = cm->seq_params.enable_cdef && !cm->coded_lossless &&
+                       !cm->large_scale_tile;
+  const int use_restoration = cm->seq_params.enable_restoration &&
+                              !cm->all_lossless && !cm->large_scale_tile;
 
   struct loopfilter *lf = &cm->lf;
 
-  if (no_loopfilter) {
-    lf->filter_level[0] = 0;
-    lf->filter_level[1] = 0;
-  } else {
-    struct aom_usec_timer timer;
-
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, loop_filter_time);
+#endif
+  if (use_loopfilter) {
     aom_clear_system_state();
-
-    aom_usec_timer_start(&timer);
-
     av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_pick);
-
-    aom_usec_timer_mark(&timer);
-    cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer);
+  } else {
+    lf->filter_level[0] = 0;
+    lf->filter_level[1] = 0;
   }
 
   if (lf->filter_level[0] || lf->filter_level[1]) {
     if (cpi->num_workers > 1)
-      av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0,
+      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
 #if LOOP_FILTER_BITMASK
                                0,
 #endif
                                cpi->workers, cpi->num_workers,
                                &cpi->lf_row_sync);
     else
-      av1_loop_filter_frame(cm->frame_to_show, cm, xd,
+      av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd,
 #if LOOP_FILTER_BITMASK
                             0,
 #endif
                             0, num_planes, 0);
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, loop_filter_time);
+#endif
 
-  if (!no_restoration)
-    av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 0);
+  if (use_restoration)
+    av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0);
 
-  if (no_cdef) {
-    cm->cdef_info.cdef_bits = 0;
-    cm->cdef_info.cdef_strengths[0] = 0;
-    cm->cdef_info.nb_cdef_strengths = 1;
-    cm->cdef_info.cdef_uv_strengths[0] = 0;
-  } else {
+  if (use_cdef) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, cdef_time);
+#endif
     // Find CDEF parameters
-    av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd,
+    av1_cdef_search(&cm->cur_frame->buf, cpi->source, cm, xd,
                     cpi->sf.fast_cdef_search);
 
     // Apply the filter
-    av1_cdef_frame(cm->frame_to_show, cm, xd);
+    av1_cdef_frame(&cm->cur_frame->buf, cm, xd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, cdef_time);
+#endif
+  } else {
+    cm->cdef_info.cdef_bits = 0;
+    cm->cdef_info.cdef_strengths[0] = 0;
+    cm->cdef_info.nb_cdef_strengths = 1;
+    cm->cdef_info.cdef_uv_strengths[0] = 0;
   }
 
   superres_post_encode(cpi);
 
-  if (no_restoration) {
-    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
-    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
-    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
-  } else {
-    av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 1);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, loop_restoration_time);
+#endif
+  if (use_restoration) {
+    av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1);
     av1_pick_filter_restoration(cpi->source, cpi);
     if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
         cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
         cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
       if (cpi->num_workers > 1)
-        av1_loop_restoration_filter_frame_mt(cm->frame_to_show, cm, 0,
+        av1_loop_restoration_filter_frame_mt(&cm->cur_frame->buf, cm, 0,
                                              cpi->workers, cpi->num_workers,
                                              &cpi->lr_row_sync, &cpi->lr_ctxt);
       else
-        av1_loop_restoration_filter_frame(cm->frame_to_show, cm, 0,
+        av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0,
                                           &cpi->lr_ctxt);
     }
+  } else {
+    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, loop_restoration_time);
+#endif
 }
 
-static int encode_without_recode_loop(AV1_COMP *cpi) {
+static void fix_interp_filter(InterpFilter *const interp_filter,
+                              const FRAME_COUNTS *const counts) {
+  if (*interp_filter == SWITCHABLE) {
+    // Check to see if only one of the filters is actually used
+    int count[SWITCHABLE_FILTERS] = { 0 };
+    int num_filters_used = 0;
+    for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+      for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+        count[i] += counts->switchable_interp[j][i];
+      num_filters_used += (count[i] > 0);
+    }
+    if (num_filters_used == 1) {
+      // Only one filter is used. So set the filter at frame level
+      for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        if (count[i]) {
+          if (i == EIGHTTAP_REGULAR) *interp_filter = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+static void finalize_encoded_frame(AV1_COMP *const cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  int q = 0, bottom_index = 0, top_index = 0;  // Dummy variables.
+  CurrentFrame *const current_frame = &cm->current_frame;
 
-  aom_clear_system_state();
+  if (!cm->seq_params.reduced_still_picture_hdr &&
+      encode_show_existing_frame(cm)) {
+    RefCntBuffer *const frame_to_show =
+        cm->ref_frame_map[cpi->existing_fb_idx_to_show];
 
-  set_size_independent_vars(cpi);
+    if (frame_to_show == NULL) {
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Buffer does not contain a reconstructed frame");
+    }
+    assert(frame_to_show->ref_count > 0);
+    assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+  }
 
-  setup_frame_size(cpi);
+  if (!encode_show_existing_frame(cm) &&
+      cm->seq_params.film_grain_params_present &&
+      (cm->show_frame || cm->showable_frame)) {
+    // Copy the current frame's film grain params to the its corresponding
+    // RefCntBuffer slot.
+    cm->cur_frame->film_grain_params = cm->film_grain_params;
 
-  assert(cm->width == cpi->scaled_source.y_crop_width);
-  assert(cm->height == cpi->scaled_source.y_crop_height);
+    // We must update the parameters if this is not an INTER_FRAME
+    if (current_frame->frame_type != INTER_FRAME)
+      cm->cur_frame->film_grain_params.update_parameters = 1;
 
-  set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+    // Iterate the random seed for the next frame.
+    cm->film_grain_params.random_seed += 3381;
+    if (cm->film_grain_params.random_seed == 0)
+      cm->film_grain_params.random_seed = 7391;
+  }
 
-  cpi->source =
-      av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
-  if (cpi->unscaled_last_source != NULL)
-    cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
-                                             &cpi->scaled_last_source);
-  cpi->source->buf_8bit_valid = 0;
-  if (frame_is_intra_only(cm) == 0) {
-    scale_references(cpi);
+  // Initialise all tiles' contexts from the global frame context
+  for (int tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+    for (int tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+      const int tile_idx = tile_row * cm->tile_cols + tile_col;
+      cpi->tile_data[tile_idx].tctx = *cm->fc;
+    }
   }
 
-  av1_set_quantizer(cm, q);
-  setup_frame(cpi);
-  suppress_active_map(cpi);
+  fix_interp_filter(&cm->interp_filter, cpi->td.counts);
+}
+
+static int get_regulated_q_overshoot(AV1_COMP *const cpi, int q_low, int q_high,
+                                     int top_index, int bottom_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+
+  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+  int q_regulated =
+      av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                        AOMMAX(q_high, top_index), cm->width, cm->height);
 
-  // Variance adaptive and in frame q adjustment experiments are mutually
-  // exclusive.
-  if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-    av1_vaq_frame_setup(cpi);
-  } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
-    av1_setup_in_frame_q_adj(cpi);
-  } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-    av1_cyclic_refresh_setup(cpi);
+  int retries = 0;
+  while (q_regulated < q_low && retries < 10) {
+    av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+    q_regulated =
+        av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                          AOMMAX(q_high, top_index), cm->width, cm->height);
+    retries++;
   }
-  apply_active_map(cpi);
-  if (cm->seg.enabled) {
-    if (!cm->seg.update_data && cm->prev_frame) {
-      segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+  return q_regulated;
+}
+
+static int get_regulated_q_undershoot(AV1_COMP *const cpi, int q_high,
+                                      int top_index, int bottom_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+
+  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+  int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                                      top_index, cm->width, cm->height);
+
+  int retries = 0;
+  while (q_regulated > q_high && retries < 10) {
+    av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+    q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                                    top_index, cm->width, cm->height);
+    retries++;
+  }
+  return q_regulated;
+}
+
+// Called after encode_with_recode_loop() has just encoded a frame and packed
+// its bitstream.  This function works out whether we under- or over-shot
+// our bitrate target and adjusts q as appropriate.  Also decides whether
+// or not we should do another recode loop, indicated by *loop
+static void recode_loop_update_q(AV1_COMP *const cpi, int *const loop,
+                                 int *const q, int *const q_low,
+                                 int *const q_high, const int top_index,
+                                 const int bottom_index,
+                                 int *const undershoot_seen,
+                                 int *const overshoot_seen,
+                                 const int loop_at_this_size) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0;
+  av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+                                   &frame_under_shoot_limit,
+                                   &frame_over_shoot_limit);
+  if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+
+  if ((cm->current_frame.frame_type == KEY_FRAME) &&
+      rc->this_key_frame_forced &&
+      (rc->projected_frame_size < rc->max_frame_bandwidth)) {
+    int last_q = *q;
+    int64_t kf_err;
+
+    int64_t high_err_target = cpi->ambient_err;
+    int64_t low_err_target = cpi->ambient_err >> 1;
+
+    if (cm->seq_params.use_highbitdepth) {
+      kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
     } else {
-      calculate_segdata(&cm->seg);
+      kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+    }
+    // Prevent possible divide by zero error below for perfect KF
+    kf_err += !kf_err;
+
+    // The key frame is not good enough or we can afford
+    // to make it better without undue risk of popping.
+    if ((kf_err > high_err_target &&
+         rc->projected_frame_size <= frame_over_shoot_limit) ||
+        (kf_err > low_err_target &&
+         rc->projected_frame_size <= frame_under_shoot_limit)) {
+      // Lower q_high
+      *q_high = *q > *q_low ? *q - 1 : *q_low;
+
+      // Adjust Q
+      *q = (int)((*q * high_err_target) / kf_err);
+      *q = AOMMIN(*q, (*q_high + *q_low) >> 1);
+    } else if (kf_err < low_err_target &&
+               rc->projected_frame_size >= frame_under_shoot_limit) {
+      // The key frame is much better than the previous frame
+      // Raise q_low
+      *q_low = *q < *q_high ? *q + 1 : *q_high;
+
+      // Adjust Q
+      *q = (int)((*q * low_err_target) / kf_err);
+      *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1);
     }
-  } else {
-    memset(&cm->seg, 0, sizeof(cm->seg));
-  }
-  segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
 
-  // transform / motion compensation build reconstruction frame
-  av1_encode_frame(cpi);
+    // Clamp Q to upper and lower limits:
+    *q = clamp(*q, *q_low, *q_high);
+
+    *loop = *q != last_q;
+  } else if (recode_loop_test(cpi, frame_over_shoot_limit,
+                              frame_under_shoot_limit, *q,
+                              AOMMAX(*q_high, top_index), bottom_index)) {
+    // Is the projected frame size out of range and are we allowed
+    // to attempt to recode.
+    int last_q = *q;
+
+    // Frame size out of permitted range:
+    // Update correction factor & compute new Q to try...
+    // Frame is too large
+    if (rc->projected_frame_size > rc->this_frame_target) {
+      // Special case if the projected size is > the max allowed.
+      if (rc->projected_frame_size >= rc->max_frame_bandwidth)
+        *q_high = rc->worst_quality;
+
+      // Raise Qlow as to at least the current value
+      *q_low = *q < *q_high ? *q + 1 : *q_high;
+
+      if (*undershoot_seen || loop_at_this_size > 2 ||
+          (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+        *q = (*q_high + *q_low + 1) / 2;
+      } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low + 1) / 2;
+        const int q_regulated = get_regulated_q_overshoot(
+            cpi, *q_low, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
+        *q = (q_mid + q_regulated + 1) / 2;
+      } else {
+        *q = get_regulated_q_overshoot(cpi, *q_low, *q_high, top_index,
+                                       bottom_index);
+      }
 
-  // Update some stats from cyclic refresh, and check if we should not update
-  // golden reference, for 1 pass CBR.
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
-      cm->current_frame.frame_type != KEY_FRAME &&
-      (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR))
-    av1_cyclic_refresh_check_golden_update(cpi);
+      *overshoot_seen = 1;
+    } else {
+      // Frame is too small
+      *q_high = *q > *q_low ? *q - 1 : *q_low;
+
+      if (*overshoot_seen || loop_at_this_size > 2 ||
+          (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+        *q = (*q_high + *q_low) / 2;
+      } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low) / 2;
+        const int q_regulated =
+            get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
+        *q = (q_mid + q_regulated) / 2;
+
+        // Special case reset for qlow for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the user passsed in value.
+        if (cpi->oxcf.rc_mode == AOM_CQ && q_regulated < *q_low) {
+          *q_low = *q;
+        }
+      } else {
+        *q = get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+
+        // Special case reset for qlow for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the user passsed in value.
+        if (cpi->oxcf.rc_mode == AOM_CQ && *q < *q_low) {
+          *q_low = *q;
+        }
+      }
 
-  // Update the skip mb flag probabilities based on the distribution
-  // seen in the last encoder iteration.
-  // update_base_skip_probs(cpi);
-  aom_clear_system_state();
-  return AOM_CODEC_OK;
+      *undershoot_seen = 1;
+    }
+
+    // Clamp Q to upper and lower limits:
+    *q = clamp(*q, *q_low, *q_high);
+
+    *loop = (*q != last_q);
+  } else {
+    *loop = 0;
+  }
 }
 
 static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
-  int bottom_index, top_index;
-  int loop_count = 0;
-  int loop_at_this_size = 0;
-  int loop = 0;
-  int overshoot_seen = 0;
-  int undershoot_seen = 0;
-  int frame_over_shoot_limit;
-  int frame_under_shoot_limit;
-  int q = 0, q_low = 0, q_high = 0;
+  const int allow_recode = cpi->sf.recode_loop != DISALLOW_RECODE;
 
   set_size_independent_vars(cpi);
 
   cpi->source->buf_8bit_valid = 0;
 
-  aom_clear_system_state();
+  av1_setup_frame_size(cpi);
 
-  setup_frame_size(cpi);
+  int top_index = 0, bottom_index = 0;
+  int q = 0, q_low = 0, q_high = 0;
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+  q_low = bottom_index;
+  q_high = top_index;
 
+  // Loop variables
+  int loop_count = 0;
+  int loop_at_this_size = 0;
+  int loop = 0;
+  int overshoot_seen = 0;
+  int undershoot_seen = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  printf("\n Encoding a frame:");
+#endif
   do {
     aom_clear_system_state();
 
-    if (loop_count == 0) {
-      // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
-      set_mv_search_params(cpi);
-
-      // Reset the loop state for new frame size.
-      overshoot_seen = 0;
-      undershoot_seen = 0;
-
-      q_low = bottom_index;
-      q_high = top_index;
-
-      loop_at_this_size = 0;
-
-      // Decide frame size bounds first time through.
-      av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
-                                       &frame_under_shoot_limit,
-                                       &frame_over_shoot_limit);
-    }
-
     // if frame was scaled calculate global_motion_search again if already
     // done
-    if (loop_count > 0 && cpi->source && cpi->global_motion_search_done)
+    if (loop_count > 0 && cpi->source && cpi->global_motion_search_done) {
       if (cpi->source->y_crop_width != cm->width ||
-          cpi->source->y_crop_height != cm->height)
+          cpi->source->y_crop_height != cm->height) {
         cpi->global_motion_search_done = 0;
+      }
+    }
     cpi->source =
         av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
-    if (cpi->unscaled_last_source != NULL)
+    if (cpi->unscaled_last_source != NULL) {
       cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
                                                &cpi->scaled_last_source);
+    }
 
-    if (frame_is_intra_only(cm) == 0) {
+    if (!frame_is_intra_only(cm)) {
       if (loop_count > 0) {
         release_scaled_references(cpi);
       }
       scale_references(cpi);
     }
     av1_set_quantizer(cm, q);
+    av1_init_quantizer(cpi);
+
+    av1_set_variance_partition_thresholds(cpi, q, 0);
+
     // printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n",
     //        cm->current_frame.frame_number, cm->show_frame, q,
     //        cm->current_frame.frame_type, cm->superres_scale_denominator);
 
-    if (loop_count == 0) setup_frame(cpi);
-
-    // Base q-index may have changed, so we need to assign proper default coef
-    // probs before every iteration.
-    if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
-        cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) {
+    if (loop_count == 0) {
+      setup_frame(cpi);
+    } else if (get_primary_ref_frame_buf(cm) == NULL) {
+      // Base q-index may have changed, so we need to assign proper default coef
+      // probs before every iteration.
       av1_default_coef_probs(cm);
       av1_setup_frame_contexts(cm);
     }
 
-    // Variance adaptive and in frame q adjustment experiments are mutually
-    // exclusive.
     if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
       av1_vaq_frame_setup(cpi);
     } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
       av1_setup_in_frame_q_adj(cpi);
+    } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !allow_recode) {
+      suppress_active_map(cpi);
+      av1_cyclic_refresh_setup(cpi);
+      apply_active_map(cpi);
     }
+
     if (cm->seg.enabled) {
       if (!cm->seg.update_data && cm->prev_frame) {
         segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
@@ -4640,13 +4458,15 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
     }
     segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
 
+    if (allow_recode) save_coding_context(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_encode_frame_time);
+#endif
     // transform / motion compensation build reconstruction frame
-    save_coding_context(cpi);
     av1_encode_frame(cpi);
-
-    // Update the skip mb flag probabilities based on the distribution
-    // seen in the last encoder iteration.
-    // update_base_skip_probs(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_encode_frame_time);
+#endif
 
     aom_clear_system_state();
 
@@ -4656,141 +4476,20 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
       restore_coding_context(cpi);
 
-      if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+      finalize_encoded_frame(cpi);
+      int largest_tile_id = 0;  // Output from bitstream: unused here
+      if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
         return AOM_CODEC_ERROR;
 
       rc->projected_frame_size = (int)(*size) << 3;
       restore_coding_context(cpi);
-
-      if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
     }
 
-    if (cpi->oxcf.rc_mode == AOM_Q) {
-      loop = 0;
-    } else {
-      if ((cm->current_frame.frame_type == KEY_FRAME) &&
-          rc->this_key_frame_forced &&
-          (rc->projected_frame_size < rc->max_frame_bandwidth)) {
-        int last_q = q;
-        int64_t kf_err;
-
-        int64_t high_err_target = cpi->ambient_err;
-        int64_t low_err_target = cpi->ambient_err >> 1;
-
-        if (cm->seq_params.use_highbitdepth) {
-          kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
-        } else {
-          kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
-        }
-        // Prevent possible divide by zero error below for perfect KF
-        kf_err += !kf_err;
-
-        // The key frame is not good enough or we can afford
-        // to make it better without undue risk of popping.
-        if ((kf_err > high_err_target &&
-             rc->projected_frame_size <= frame_over_shoot_limit) ||
-            (kf_err > low_err_target &&
-             rc->projected_frame_size <= frame_under_shoot_limit)) {
-          // Lower q_high
-          q_high = q > q_low ? q - 1 : q_low;
-
-          // Adjust Q
-          q = (int)((q * high_err_target) / kf_err);
-          q = AOMMIN(q, (q_high + q_low) >> 1);
-        } else if (kf_err < low_err_target &&
-                   rc->projected_frame_size >= frame_under_shoot_limit) {
-          // The key frame is much better than the previous frame
-          // Raise q_low
-          q_low = q < q_high ? q + 1 : q_high;
-
-          // Adjust Q
-          q = (int)((q * low_err_target) / kf_err);
-          q = AOMMIN(q, (q_high + q_low + 1) >> 1);
-        }
-
-        // Clamp Q to upper and lower limits:
-        q = clamp(q, q_low, q_high);
-
-        loop = q != last_q;
-      } else if (recode_loop_test(cpi, frame_over_shoot_limit,
-                                  frame_under_shoot_limit, q,
-                                  AOMMAX(q_high, top_index), bottom_index)) {
-        // Is the projected frame size out of range and are we allowed
-        // to attempt to recode.
-        int last_q = q;
-        int retries = 0;
-
-        // Frame size out of permitted range:
-        // Update correction factor & compute new Q to try...
-        // Frame is too large
-        if (rc->projected_frame_size > rc->this_frame_target) {
-          // Special case if the projected size is > the max allowed.
-          if (rc->projected_frame_size >= rc->max_frame_bandwidth)
-            q_high = rc->worst_quality;
-
-          // Raise Qlow as to at least the current value
-          q_low = q < q_high ? q + 1 : q_high;
-
-          if (undershoot_seen || loop_at_this_size > 1) {
-            // Update rate_correction_factor unless
-            av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-
-            q = (q_high + q_low + 1) / 2;
-          } else {
-            // Update rate_correction_factor unless
-            av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-
-            q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                  AOMMAX(q_high, top_index), cm->width,
-                                  cm->height);
-
-            while (q < q_low && retries < 10) {
-              av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-              q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                    AOMMAX(q_high, top_index), cm->width,
-                                    cm->height);
-              retries++;
-            }
-          }
-
-          overshoot_seen = 1;
-        } else {
-          // Frame is too small
-          q_high = q > q_low ? q - 1 : q_low;
-
-          if (overshoot_seen || loop_at_this_size > 1) {
-            av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-            q = (q_high + q_low) / 2;
-          } else {
-            av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-            q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                  top_index, cm->width, cm->height);
-            // Special case reset for qlow for constrained quality.
-            // This should only trigger where there is very substantial
-            // undershoot on a frame and the auto cq level is above
-            // the user passsed in value.
-            if (cpi->oxcf.rc_mode == AOM_CQ && q < q_low) {
-              q_low = q;
-            }
-
-            while (q > q_high && retries < 10) {
-              av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-              q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                    top_index, cm->width, cm->height);
-              retries++;
-            }
-          }
-
-          undershoot_seen = 1;
-        }
-
-        // Clamp Q to upper and lower limits:
-        q = clamp(q, q_low, q_high);
-
-        loop = (q != last_q);
-      } else {
-        loop = 0;
-      }
+    if (allow_recode && cpi->oxcf.rc_mode != AOM_Q) {
+      // Update q and decide whether to do a recode loop
+      recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index,
+                           bottom_index, &undershoot_seen, &overshoot_seen,
+                           loop_at_this_size);
     }
 
     // Special case for overlay frame.
@@ -4798,8 +4497,9 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
         rc->projected_frame_size < rc->max_frame_bandwidth)
       loop = 0;
 
-    if (!cpi->sf.gm_disable_recode) {
-      if (recode_loop_test_global_motion(cpi)) loop = 1;
+    if (allow_recode && !cpi->sf.gm_disable_recode &&
+        recode_loop_test_global_motion(cpi)) {
+      loop = 1;
     }
 
     if (loop) {
@@ -4810,127 +4510,14 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
       ++cpi->tot_recode_hits;
 #endif
     }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    if (loop) printf("\n Recoding:");
+#endif
   } while (loop);
 
   return AOM_CODEC_OK;
 }
 
-static int get_ref_frame_flags(const AV1_COMP *cpi) {
-  const int *const map = cpi->common.ref_frame_map;
-
-  // No.1 Priority: LAST_FRAME
-  const int last2_is_last =
-      map[cpi->remapped_ref_idx[1]] == map[cpi->remapped_ref_idx[0]];
-  const int last3_is_last =
-      map[cpi->remapped_ref_idx[2]] == map[cpi->remapped_ref_idx[0]];
-  const int gld_is_last = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] ==
-                          map[cpi->remapped_ref_idx[0]];
-  const int bwd_is_last = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] ==
-                          map[cpi->remapped_ref_idx[0]];
-  const int alt2_is_last = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
-                           map[cpi->remapped_ref_idx[0]];
-  const int alt_is_last = map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)] ==
-                          map[cpi->remapped_ref_idx[0]];
-
-  // No.2 Priority: ALTREF_FRAME
-  const int last2_is_alt = map[cpi->remapped_ref_idx[1]] ==
-                           map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)];
-  const int last3_is_alt = map[cpi->remapped_ref_idx[2]] ==
-                           map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)];
-  const int gld_is_alt = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] ==
-                         map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)];
-  const int bwd_is_alt = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] ==
-                         map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)];
-  const int alt2_is_alt = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
-                          map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)];
-
-  // No.3 Priority: LAST2_FRAME
-  const int last3_is_last2 =
-      map[cpi->remapped_ref_idx[2]] == map[cpi->remapped_ref_idx[1]];
-  const int gld_is_last2 = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] ==
-                           map[cpi->remapped_ref_idx[1]];
-  const int bwd_is_last2 = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] ==
-                           map[cpi->remapped_ref_idx[1]];
-  const int alt2_is_last2 = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
-                            map[cpi->remapped_ref_idx[1]];
-
-  // No.4 Priority: LAST3_FRAME
-  const int gld_is_last3 = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] ==
-                           map[cpi->remapped_ref_idx[2]];
-  const int bwd_is_last3 = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] ==
-                           map[cpi->remapped_ref_idx[2]];
-  const int alt2_is_last3 = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
-                            map[cpi->remapped_ref_idx[2]];
-
-  // No.5 Priority: GOLDEN_FRAME
-  const int bwd_is_gld = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] ==
-                         map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)];
-  const int alt2_is_gld = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
-                          map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)];
-
-  // No.6 Priority: BWDREF_FRAME
-  const int alt2_is_bwd = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
-                          map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)];
-
-  // No.7 Priority: ALTREF2_FRAME
-
-  // After av1_apply_encoding_flags() is called, cpi->ref_frame_flags might be
-  // adjusted according to external encoder flags.
-  int flags = cpi->ext_ref_frame_flags;
-
-  if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG;
-
-  if (alt_is_last) flags &= ~AOM_ALT_FLAG;
-
-  if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG;
-
-  if (last3_is_last || last3_is_alt || last3_is_last2) flags &= ~AOM_LAST3_FLAG;
-
-  if (gld_is_last || gld_is_alt || gld_is_last2 || gld_is_last3)
-    flags &= ~AOM_GOLD_FLAG;
-
-  if ((bwd_is_last || bwd_is_alt || bwd_is_last2 || bwd_is_last3 ||
-       bwd_is_gld) &&
-      (flags & AOM_BWD_FLAG))
-    flags &= ~AOM_BWD_FLAG;
-
-  if ((alt2_is_last || alt2_is_alt || alt2_is_last2 || alt2_is_last3 ||
-       alt2_is_gld || alt2_is_bwd) &&
-      (flags & AOM_ALT2_FLAG))
-    flags &= ~AOM_ALT2_FLAG;
-
-  return flags;
-}
-
-static void set_ext_overrides(AV1_COMP *cpi) {
-  // Overrides the defaults with the externally supplied values with
-  // av1_update_reference() and av1_update_entropy() calls
-  // Note: The overrides are valid only for the next frame passed
-  // to encode_frame_to_data_rate() function
-  if (cpi->ext_use_s_frame) cpi->common.current_frame.frame_type = S_FRAME;
-  cpi->common.force_primary_ref_none = cpi->ext_use_primary_ref_none;
-
-  if (cpi->ext_refresh_frame_context_pending) {
-    cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context;
-    cpi->ext_refresh_frame_context_pending = 0;
-  }
-  if (cpi->ext_refresh_frame_flags_pending) {
-    cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
-    cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
-    cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
-    cpi->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame;
-    cpi->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame;
-    cpi->ext_refresh_frame_flags_pending = 0;
-  }
-  cpi->common.allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs;
-  // A keyframe is already error resilient and keyframes with
-  // error_resilient_mode interferes with the use of show_existing_frame
-  // when forward reference keyframes are enabled.
-  cpi->common.error_resilient_mode =
-      cpi->ext_use_error_resilient &&
-      cpi->common.current_frame.frame_type != KEY_FRAME;
-}
-
 #define DUMP_RECON_FRAMES 0
 
 #if DUMP_RECON_FRAMES == 1
@@ -4938,7 +4525,7 @@ static void set_ext_overrides(AV1_COMP *cpi) {
 static void dump_filtered_recon_frames(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const CurrentFrame *const current_frame = &cm->current_frame;
-  const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show;
+  const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf;
 
   if (recon_buf == NULL) {
     printf("Frame %d is not ready.\n", current_frame->frame_number);
@@ -4960,12 +4547,10 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
       current_frame->frame_number, current_frame->order_hint, cm->show_frame,
       cm->show_existing_frame);
   for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    RefBuffer *buf = &cm->current_frame.frame_refs[ref_frame - LAST_FRAME];
-    const int ref_offset = (buf->buf) ? (int)buf->buf->order_hint : -1;
-    printf(" %d(%c-%d-%4.2f)", ref_offset,
-           (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N',
-           (buf->buf) ? (int)buf->buf->frame_rf_level : -1,
-           (buf->buf) ? rate_factor_deltas[buf->buf->frame_rf_level] : -1);
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    const int ref_offset = buf != NULL ? (int)buf->order_hint : -1;
+    printf(" %d(%c)", ref_offset,
+           (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N');
   }
   printf(" ]\n");
 
@@ -4993,25 +4578,18 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
   printf(
       "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
       "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
-      "refresh_alt_ref_frame=%d, rf_level=%d, "
+      "refresh_alt_ref_frame=%d, "
       "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
       current_frame->frame_number, cpi->twopass.gf_group.index,
       cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
       current_frame->order_hint, cm->show_frame, cm->show_existing_frame,
       cpi->rc.source_alt_ref_active, cpi->refresh_alt_ref_frame,
-      cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index],
       recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
 #if 0
   int ref_frame;
   printf("get_ref_frame_map_idx: [");
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
-    printf(" %d", get_ref_frame_map_idx(cpi, ref_frame));
-  printf(" ]\n");
-  printf("cm->new_fb_idx = %d\n", cm->new_fb_idx);
-  printf("cm->ref_frame_map = [");
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    printf(" %d", cm->ref_frame_map[ref_frame - LAST_FRAME]);
-  }
+    printf(" %d", get_ref_frame_map_idx(cm, ref_frame));
   printf(" ]\n");
 #endif  // 0
 
@@ -5035,31 +4613,209 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
 }
 #endif  // DUMP_RECON_FRAMES
 
-static INLINE int is_frame_droppable(AV1_COMP *cpi) {
-  return !(cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
-           cpi->refresh_bwd_ref_frame || cpi->refresh_golden_frame ||
-           cpi->refresh_last_frame);
+static int get_interp_filter_selected(const AV1_COMMON *const cm,
+                                      MV_REFERENCE_FRAME ref,
+                                      InterpFilters ifilter) {
+  const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
+  if (buf == NULL) return 0;
+  return buf->interp_filter_selected[ifilter];
+}
+
+static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int ref_total[REF_FRAMES] = { 0 };
+
+  if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
+    return 0;
+
+  for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    for (InterpFilters ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+         ++ifilter) {
+      ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter);
+    }
+  }
+  int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] +
+                         ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] +
+                         ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]);
+
+  int mask = 0;
+  for (InterpFilters ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+       ++ifilter) {
+    int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30;
+    if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) {
+      int filter_score =
+          get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 +
+          get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 +
+          get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10;
+      if (filter_score < ref_total_total) mask |= 1 << ifilter;
+    }
+  }
+  return mask;
+}
+
+static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
+                         const YV12_BUFFER_CONFIG *last_picture,
+                         hash_table *last_hash_table) {
+  aom_clear_system_state();
+  // check use hash ME
+  int k;
+  uint32_t hash_value_1;
+  uint32_t hash_value_2;
+
+  const int block_size = 8;
+  const double threshold_current = 0.8;
+  const double threshold_average = 0.95;
+  const int max_history_size = 32;
+  int T = 0;  // total block
+  int C = 0;  // match with collocated block
+  int S = 0;  // smooth region but not match with collocated block
+  int M = 0;  // match with other block
+
+  const int pic_width = cur_picture->y_width;
+  const int pic_height = cur_picture->y_height;
+  for (int i = 0; i + block_size <= pic_height; i += block_size) {
+    for (int j = 0; j + block_size <= pic_width; j += block_size) {
+      const int x_pos = j;
+      const int y_pos = i;
+      int match = 1;
+      T++;
+
+      // check whether collocated block match with current
+      uint8_t *p_cur = cur_picture->y_buffer;
+      uint8_t *p_ref = last_picture->y_buffer;
+      int stride_cur = cur_picture->y_stride;
+      int stride_ref = last_picture->y_stride;
+      p_cur += (y_pos * stride_cur + x_pos);
+      p_ref += (y_pos * stride_ref + x_pos);
+
+      if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+        uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
+        uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
+        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+            if (p16_cur[tmpX] != p16_ref[tmpX]) {
+              match = 0;
+            }
+          }
+          p16_cur += stride_cur;
+          p16_ref += stride_ref;
+        }
+      } else {
+        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+            if (p_cur[tmpX] != p_ref[tmpX]) {
+              match = 0;
+            }
+          }
+          p_cur += stride_cur;
+          p_ref += stride_ref;
+        }
+      }
+
+      if (match) {
+        C++;
+        continue;
+      }
+
+      if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
+                                         y_pos) ||
+          av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
+        S++;
+        continue;
+      }
+
+      av1_get_block_hash_value(
+          cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
+          block_size, &hash_value_1, &hash_value_2,
+          (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb);
+      // Hashing does not work for highbitdepth currently.
+      // TODO(Roger): Make it work for highbitdepth.
+      if (av1_use_hash_me(&cpi->common)) {
+        if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) {
+          M++;
+        }
+      }
+    }
+  }
+
+  assert(T > 0);
+  double csm_rate = ((double)(C + S + M)) / ((double)(T));
+  double m_rate = ((double)(M)) / ((double)(T));
+
+  cpi->csm_rate_array[cpi->rate_index] = csm_rate;
+  cpi->m_rate_array[cpi->rate_index] = m_rate;
+
+  cpi->rate_index = (cpi->rate_index + 1) % max_history_size;
+  cpi->rate_size++;
+  cpi->rate_size = AOMMIN(cpi->rate_size, max_history_size);
+
+  if (csm_rate < threshold_current) {
+    return 0;
+  }
+
+  if (C == T) {
+    return 1;
+  }
+
+  double csm_average = 0.0;
+  double m_average = 0.0;
+
+  for (k = 0; k < cpi->rate_size; k++) {
+    csm_average += cpi->csm_rate_array[k];
+    m_average += cpi->m_rate_array[k];
+  }
+  csm_average /= cpi->rate_size;
+  m_average /= cpi->rate_size;
+
+  if (csm_average < threshold_average) {
+    return 0;
+  }
+
+  if (M > (T - C - S) / 3) {
+    return 1;
+  }
+
+  if (csm_rate > 0.99 && m_rate > 0.01) {
+    return 1;
+  }
+
+  if (csm_average + m_average > 1.01) {
+    return 1;
+  }
+
+  return 0;
 }
 
-static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
-                                     int skip_adapt,
-                                     unsigned int *frame_flags) {
+// Refresh reference frame buffers according to refresh_frame_flags.
+static void refresh_reference_frames(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  // All buffers are refreshed for shown keyframes and S-frames.
+
+  for (int ref_frame = 0; ref_frame < REF_FRAMES; ref_frame++) {
+    if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) {
+      assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame);
+    }
+  }
+}
+
+static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
+                                     uint8_t *dest) {
   AV1_COMMON *const cm = &cpi->common;
   SequenceHeader *const seq_params = &cm->seq_params;
   CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
 
-  set_ext_overrides(cpi);
-  aom_clear_system_state();
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, encode_frame_to_data_rate_time);
+#endif
 
   // frame type has been decided outside of this function call
-  cm->cur_frame->intra_only = frame_is_intra_only(cm);
   cm->cur_frame->frame_type = current_frame->frame_type;
 
-  // S_FRAMEs are always error resilient
-  cm->error_resilient_mode |= frame_is_sframe(cm);
-
   cm->large_scale_tile = cpi->oxcf.large_scale_tile;
   cm->single_tile_decoding = cpi->oxcf.single_tile_decoding;
 
@@ -5072,34 +4828,20 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   cm->allow_warped_motion =
       cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm);
 
-  // Reset the frame packet stamp index.
-  if (current_frame->frame_type == KEY_FRAME && cm->show_frame)
-    current_frame->frame_number = 0;
+  cm->last_frame_type = current_frame->frame_type;
+  if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search)
+    cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi);
 
-  // NOTE:
-  // (1) Move the setup of the ref_frame_flags upfront as it would be
-  //     determined by the current frame properties;
-  // (2) The setup of the ref_frame_flags applies to both
-  // show_existing_frame's
-  //     and the other cases.
-  if (current_frame->frame_number > 0)
-    cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+  cpi->two_pass_partition_search = cpi->sf.two_pass_partition_search &&
+                                   !cpi->partition_search_skippable_frame;
 
   if (encode_show_existing_frame(cm)) {
-    // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
-    //               BWDREF_FRAME in the reference frame buffer.
-    if (current_frame->frame_type == KEY_FRAME) {
-      cm->reset_decoder_state = 1;
-    } else {
-      current_frame->frame_type = INTER_FRAME;
-    }
-    cm->show_frame = 1;
-    cpi->frame_flags = *frame_flags;
-
     restore_coding_context(cpi);
 
+    finalize_encoded_frame(cpi);
     // Build the bitstream
-    if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+    int largest_tile_id = 0;  // Output from bitstream: unused here
+    if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
       return AOM_CODEC_ERROR;
 
     if (seq_params->frame_id_numbers_present_flag &&
@@ -5112,40 +4854,16 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
 
     cpi->seq_params_locked = 1;
 
-    // Set up frame to show to get ready for stats collection.
-    cm->frame_to_show = &cm->cur_frame->buf;
-
-    // Update current frame offset.
-    current_frame->order_hint = cm->cur_frame->order_hint;
-
 #if DUMP_RECON_FRAMES == 1
     // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
     dump_filtered_recon_frames(cpi);
 #endif  // DUMP_RECON_FRAMES
 
-    // Update the LAST_FRAME in the reference frame buffer.
-    // NOTE:
-    // (1) For BWDREF_FRAME as the show_existing_frame, the reference frame
-    //     update has been done previously when handling the LAST_BIPRED_FRAME
-    //     right before BWDREF_FRAME (in the display order);
-    // (2) For INTNL_OVERLAY as the show_existing_frame, the reference frame
-    //     update will be done when the following is called, which will
-    //     exchange
-    //     the virtual indexes between LAST_FRAME and ALTREF2_FRAME, so that
-    //     LAST3 will get retired, LAST2 becomes LAST3, LAST becomes LAST2,
-    //     and
-    //     ALTREF2_FRAME will serve as the new LAST_FRAME.
-    update_reference_frames(cpi);
-
-    // Update frame flags
-    cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
-    cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
-    cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
-
-    *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
-
-    // Update the frame type
-    cm->last_frame_type = current_frame->frame_type;
+    // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+    //       for the purpose to verify no mismatch between encoder and decoder.
+    if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
+
+    refresh_reference_frames(cpi);
 
     // Since we allocate a spot for the OVERLAY frame in the gf group, we need
     // to do post-encoding update accordingly.
@@ -5159,6 +4877,26 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
     return AOM_CODEC_OK;
   }
 
+  // Work out whether to force_integer_mv this frame
+  if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools &&
+      !frame_is_intra_only(cm)) {
+    if (cpi->common.seq_params.force_integer_mv == 2) {
+      // Adaptive mode: see what previous frame encoded did
+      if (cpi->unscaled_last_source != NULL) {
+        cm->cur_frame_force_integer_mv =
+            is_integer_mv(cpi, cpi->source, cpi->unscaled_last_source,
+                          cpi->previous_hash_table);
+      } else {
+        cpi->common.cur_frame_force_integer_mv = 0;
+      }
+    } else {
+      cpi->common.cur_frame_force_integer_mv =
+          cpi->common.seq_params.force_integer_mv;
+    }
+  } else {
+    cpi->common.cur_frame_force_integer_mv = 0;
+  }
+
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
 
@@ -5190,6 +4928,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
       current_frame->frame_type != KEY_FRAME) {
     if (av1_rc_drop_frame(cpi)) {
       av1_rc_postencode_update_drop_frame(cpi);
+      release_scaled_references(cpi);
       return AOM_CODEC_OK;
     }
   }
@@ -5204,7 +4943,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   if (seq_params->frame_id_numbers_present_flag) {
     /* Non-normative definition of current_frame_id ("frame counter" with
      * wraparound) */
-    const int frame_id_length = FRAME_ID_LENGTH;
     if (cm->current_frame_id == -1) {
       int lsb, msb;
       /* quasi-random initialization of current_frame_id for a key frame */
@@ -5215,7 +4953,8 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
         lsb = cpi->source->y_buffer[0] & 0xff;
         msb = cpi->source->y_buffer[1] & 0xff;
       }
-      cm->current_frame_id = ((msb << 8) + lsb) % (1 << frame_id_length);
+      cm->current_frame_id =
+          ((msb << 8) + lsb) % (1 << seq_params->frame_id_length);
 
       // S_frame is meant for stitching different streams of different
       // resolutions together, so current_frame_id must be the
@@ -5225,8 +4964,8 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
       if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37;
     } else {
       cm->current_frame_id =
-          (cm->current_frame_id + 1 + (1 << frame_id_length)) %
-          (1 << frame_id_length);
+          (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) %
+          (1 << seq_params->frame_id_length);
     }
   }
 
@@ -5249,15 +4988,14 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   }
   cm->timing_info_present &= !seq_params->reduced_still_picture_hdr;
 
-  if (cpi->sf.recode_loop == DISALLOW_RECODE) {
-    if (encode_without_recode_loop(cpi) != AOM_CODEC_OK) return AOM_CODEC_ERROR;
-  } else {
-    if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK)
-      return AOM_CODEC_ERROR;
-  }
-
-  cm->last_tile_cols = cm->tile_cols;
-  cm->last_tile_rows = cm->tile_rows;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, encode_with_recode_loop_time);
+#endif
+  if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK)
+    return AOM_CODEC_ERROR;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_with_recode_loop_time);
+#endif
 
 #ifdef OUTPUT_YUV_SKINMAP
   if (cpi->common.current_frame.frame_number > 1) {
@@ -5276,23 +5014,16 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
     }
   }
 
-  // If the encoder forced a KEY_FRAME decision or if frame is an S_FRAME
-  if ((current_frame->frame_type == KEY_FRAME && cm->show_frame) ||
-      frame_is_sframe(cm)) {
-    cpi->refresh_last_frame = 1;
-  }
-
-  cm->frame_to_show = &cm->cur_frame->buf;
-  cm->frame_to_show->color_primaries = seq_params->color_primaries;
-  cm->frame_to_show->transfer_characteristics =
+  cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
+  cm->cur_frame->buf.transfer_characteristics =
       seq_params->transfer_characteristics;
-  cm->frame_to_show->matrix_coefficients = seq_params->matrix_coefficients;
-  cm->frame_to_show->monochrome = seq_params->monochrome;
-  cm->frame_to_show->chroma_sample_position =
+  cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
+  cm->cur_frame->buf.monochrome = seq_params->monochrome;
+  cm->cur_frame->buf.chroma_sample_position =
       seq_params->chroma_sample_position;
-  cm->frame_to_show->color_range = seq_params->color_range;
-  cm->frame_to_show->render_width = cm->render_width;
-  cm->frame_to_show->render_height = cm->render_height;
+  cm->cur_frame->buf.color_range = seq_params->color_range;
+  cm->cur_frame->buf.render_width = cm->render_width;
+  cm->cur_frame->buf.render_height = cm->render_height;
 
   // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
   // off.
@@ -5313,26 +5044,31 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   }
 
   // TODO(debargha): Fix mv search range on encoder side
-  // aom_extend_frame_inner_borders(cm->frame_to_show, av1_num_planes(cm));
-  aom_extend_frame_borders(cm->frame_to_show, av1_num_planes(cm));
+  // aom_extend_frame_inner_borders(&cm->cur_frame->buf, av1_num_planes(cm));
+  aom_extend_frame_borders(&cm->cur_frame->buf, av1_num_planes(cm));
 
 #ifdef OUTPUT_YUV_REC
-  aom_write_one_yuv_frame(cm, cm->frame_to_show);
+  aom_write_one_yuv_frame(cm, &cm->cur_frame->buf);
 #endif
 
+  finalize_encoded_frame(cpi);
   // Build the bitstream
-  if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+  int largest_tile_id = 0;  // Output from pack_bitstream
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+  if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
     return AOM_CODEC_ERROR;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_pack_bitstream_final_time);
+#endif
 
   cpi->seq_params_locked = 1;
 
-  if (skip_adapt) return AOM_CODEC_OK;
-
+  // Update reference frame ids for reference frames this frame will overwrite
   if (seq_params->frame_id_numbers_present_flag) {
-    int i;
-    // Update reference frame id values based on the value of refresh_frame_mask
-    for (i = 0; i < REF_FRAMES; i++) {
-      if ((cpi->refresh_frame_mask >> i) & 1) {
+    for (int i = 0; i < REF_FRAMES; i++) {
+      if ((current_frame->refresh_frame_flags >> i) & 1) {
         cm->ref_frame_id[i] = cm->current_frame_id;
       }
     }
@@ -5347,7 +5083,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
     if (cm->seg.update_map) {
       update_reference_segmentation_map(cpi);
     } else if (cm->last_frame_seg_map) {
-      memcpy(cm->current_frame_seg_map, cm->last_frame_seg_map,
+      memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map,
              cm->mi_cols * cm->mi_rows * sizeof(uint8_t));
     }
   }
@@ -5356,41 +5092,60 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
     release_scaled_references(cpi);
   }
 
-  update_reference_frames(cpi);
+  // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+  //       for the purpose to verify no mismatch between encoder and decoder.
+  if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
+
+  refresh_reference_frames(cpi);
 
 #if CONFIG_ENTROPY_STATS
   av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts);
 #endif  // CONFIG_ENTROPY_STATS
 
   if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
-    *cm->fc = cpi->tile_data[cm->largest_tile_id].tctx;
+    *cm->fc = cpi->tile_data[largest_tile_id].tctx;
     av1_reset_cdf_symbol_counters(cm->fc);
   }
+  if (!cm->large_scale_tile) {
+    cm->cur_frame->frame_context = *cm->fc;
+  }
+#define EXT_TILE_DEBUG 0
+#if EXT_TILE_DEBUG
+  if (cm->large_scale_tile && oxcf->pass == 2) {
+    char fn[20] = "./fc";
+    fn[4] = current_frame->frame_number / 100 + '0';
+    fn[5] = (current_frame->frame_number % 100) / 10 + '0';
+    fn[6] = (current_frame->frame_number % 10) + '0';
+    fn[7] = '\0';
+    av1_print_frame_contexts(cm->fc, fn);
+  }
+#endif  // EXT_TILE_DEBUG
+#undef EXT_TILE_DEBUG
 
-  if (cpi->refresh_golden_frame == 1)
-    cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
-  else
-    cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
-
-  if (cpi->refresh_alt_ref_frame == 1)
-    cpi->frame_flags |= FRAMEFLAGS_ALTREF;
-  else
-    cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_frame_to_data_rate_time);
 
-  if (cpi->refresh_bwd_ref_frame == 1)
-    cpi->frame_flags |= FRAMEFLAGS_BWDREF;
-  else
-    cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+  // Print out timing information.
+  int i;
+  fprintf(stderr, "\n Frame number: %d, Frame type: %s, Show Frame: %d\n",
+          cm->current_frame.frame_number,
+          get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame);
+  for (i = 0; i < kTimingComponents; i++) {
+    cpi->component_time[i] += cpi->frame_component_time[i];
+    fprintf(stderr, " %s:  %" PRId64 " us (total: %" PRId64 " us)\n",
+            get_component_name(i), cpi->frame_component_time[i],
+            cpi->component_time[i]);
+    cpi->frame_component_time[i] = 0;
+  }
+#endif
 
   cm->last_frame_type = current_frame->frame_type;
 
   av1_rc_postencode_update(cpi, *size);
 
-  if (current_frame->frame_type == KEY_FRAME) {
-    // Tell the caller that the frame was coded as a key frame
-    *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
-  } else {
-    *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+  // Store encoded frame's hash table for is_integer_mv() next time
+  if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
+    cpi->previous_hash_table = &cm->cur_frame->hash_table;
   }
 
   // Clear the one shot update flags for segmentation map and mode/ref loop
@@ -5414,114 +5169,62 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
     ++current_frame->frame_number;
   }
 
-  // NOTE: Shall not refer to any frame not used as reference.
-  if (cm->is_reference_frame) {
-    // keep track of the last coded dimensions
-    cm->last_width = cm->width;
-    cm->last_height = cm->height;
-  }
-
   return AOM_CODEC_OK;
 }
 
-static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
-  // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
-  //               differently here for rc->avg_frame_bandwidth.
-  if (cpi->common.show_frame || cpi->rc.is_bwd_ref_frame) {
-    if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
-        cpi->common.current_frame.frame_type == KEY_FRAME) {
-      // If this is a show_existing_frame with a source other than altref,
-      // or if it is not a displayed forward keyframe, the keyframe update
-      // counters were incremented when it was originally encoded.
-      cpi->rc.frames_since_key++;
-      cpi->rc.frames_to_key--;
-    }
-  }
-}
-
-static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
-  // TODO(weitinglin): Updating this counter for is_frame_droppable
-  // is a work-around to handle the condition when a frame is drop.
-  // We should fix the cpi->common.show_frame flag
-  // instead of checking the other condition to update the counter properly.
-  if (cpi->common.show_frame || is_frame_droppable(cpi)) {
-    // Decrement count down till next gf
-    if (cpi->rc.frames_till_gf_update_due > 0)
-      cpi->rc.frames_till_gf_update_due--;
-  }
-}
-
-static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) {
-  // Increment the gf group index ready for the next frame. If this is
-  // a show_existing_frame with a source other than altref, or if it is not
-  // a displayed forward keyframe, the index was incremented when it was
-  // originally encoded.
-  if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
-      cpi->common.current_frame.frame_type == KEY_FRAME) {
-    ++cpi->twopass.gf_group.index;
-  }
-}
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+               const EncodeFrameInput *const frame_input,
+               const EncodeFrameParams *const frame_params,
+               EncodeFrameResults *const frame_results) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
 
-static void update_rc_counts(AV1_COMP *cpi) {
-  update_keyframe_counters(cpi);
-  update_frames_till_gf_update(cpi);
-  if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi);
-}
+  cpi->unscaled_source = frame_input->source;
+  cpi->source = frame_input->source;
+  cpi->unscaled_last_source = frame_input->last_source;
+
+  current_frame->refresh_frame_flags = frame_params->refresh_frame_flags;
+  cm->error_resilient_mode = frame_params->error_resilient_mode;
+  cm->primary_ref_frame = frame_params->primary_ref_frame;
+  cm->current_frame.frame_type = frame_params->frame_type;
+  cm->show_frame = frame_params->show_frame;
+  cpi->ref_frame_flags = frame_params->ref_frame_flags;
+  cpi->speed = frame_params->speed;
+  cm->show_existing_frame = frame_params->show_existing_frame;
+  cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show;
+
+  memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx,
+         REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+  cpi->refresh_last_frame = frame_params->refresh_last_frame;
+  cpi->refresh_golden_frame = frame_params->refresh_golden_frame;
+  cpi->refresh_bwd_ref_frame = frame_params->refresh_bwd_ref_frame;
+  cpi->refresh_alt2_ref_frame = frame_params->refresh_alt2_ref_frame;
+  cpi->refresh_alt_ref_frame = frame_params->refresh_alt_ref_frame;
 
-static void set_additional_frame_flags(AV1_COMMON *const cm,
-                                       unsigned int *frame_flags) {
-  if (frame_is_intra_only(cm)) *frame_flags |= FRAMEFLAGS_INTRAONLY;
-  if (frame_is_sframe(cm)) *frame_flags |= FRAMEFLAGS_SWITCH;
-  if (cm->error_resilient_mode) *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
-}
+  if (current_frame->frame_type == KEY_FRAME && cm->show_frame)
+    current_frame->frame_number = 0;
 
-static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
-                       int skip_adapt, unsigned int *frame_flags) {
-  if (cpi->oxcf.rc_mode == AOM_CBR) {
-    av1_rc_get_one_pass_cbr_params(cpi);
+  if (cm->show_existing_frame) {
+    current_frame->order_hint = cm->cur_frame->order_hint;
   } else {
-    av1_rc_get_one_pass_vbr_params(cpi);
-  }
-  if (encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags) !=
-      AOM_CODEC_OK) {
-    return AOM_CODEC_ERROR;
+    current_frame->order_hint =
+        current_frame->frame_number + frame_params->order_offset;
+    current_frame->order_hint %=
+        (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
   }
-  set_additional_frame_flags(&cpi->common, frame_flags);
-
-  update_rc_counts(cpi);
-  check_show_existing_frame(cpi);
-  return AOM_CODEC_OK;
-}
-
-static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
-                       unsigned int *frame_flags) {
-#if CONFIG_MISMATCH_DEBUG
-  mismatch_move_frame_idx_w();
-#endif
-#if TXCOEFF_COST_TIMER
-  AV1_COMMON *cm = &cpi->common;
-  cm->txcoeff_cost_timer = 0;
-  cm->txcoeff_cost_count = 0;
-#endif
 
-  if (encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags) !=
-      AOM_CODEC_OK) {
+  if (cpi->oxcf.pass == 1) {
+    av1_first_pass(cpi, frame_input->ts_duration);
+  } else if (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) {
+    if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
     return AOM_CODEC_ERROR;
   }
-  set_additional_frame_flags(&cpi->common, frame_flags);
 
-#if TXCOEFF_COST_TIMER
-  cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
-  fprintf(stderr,
-          "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
-          "in us\n",
-          cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
-          cm->cum_txcoeff_cost_timer);
-#endif
-
-  av1_twopass_postencode_update(cpi);
-  update_rc_counts(cpi);
-  check_show_existing_frame(cpi);
   return AOM_CODEC_OK;
 }
 
@@ -5564,7 +5267,6 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
                           int64_t end_time) {
   AV1_COMMON *const cm = &cpi->common;
   const SequenceHeader *const seq_params = &cm->seq_params;
-  struct aom_usec_timer timer;
   int res = 0;
   const int subsampling_x = sd->subsampling_x;
   const int subsampling_y = sd->subsampling_y;
@@ -5572,8 +5274,10 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
 
   check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
 
+#if CONFIG_INTERNAL_STATS
+  struct aom_usec_timer timer;
   aom_usec_timer_start(&timer);
-
+#endif
 #if CONFIG_DENOISE
   if (cpi->oxcf.noise_level > 0)
     if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size,
@@ -5584,9 +5288,10 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
   if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
                          use_highbitdepth, frame_flags))
     res = -1;
+#if CONFIG_INTERNAL_STATS
   aom_usec_timer_mark(&timer);
   cpi->time_receive_data += aom_usec_timer_elapsed(&timer);
-
+#endif
   if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
       (subsampling_x != 1 || subsampling_y != 1)) {
     aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
@@ -5610,133 +5315,6 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
   return res;
 }
 
-static void adjust_frame_rate(AV1_COMP *cpi,
-                              const struct lookahead_entry *source) {
-  int64_t this_duration;
-  int step = 0;
-
-  if (source->ts_start == cpi->first_time_stamp_ever) {
-    this_duration = source->ts_end - source->ts_start;
-    step = 1;
-  } else {
-    int64_t last_duration =
-        cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
-
-    this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
-
-    // do a step update if the duration changes by 10%
-    if (last_duration)
-      step = (int)((this_duration - last_duration) * 10 / last_duration);
-  }
-
-  if (this_duration) {
-    if (step) {
-      av1_new_framerate(cpi, 10000000.0 / this_duration);
-    } else {
-      // Average this frame's rate into the last second's average
-      // frame rate. If we haven't seen 1 second yet, then average
-      // over the whole interval seen.
-      const double interval = AOMMIN(
-          (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
-      double avg_duration = 10000000.0 / cpi->framerate;
-      avg_duration *= (interval - avg_duration + this_duration);
-      avg_duration /= interval;
-
-      av1_new_framerate(cpi, 10000000.0 / avg_duration);
-    }
-  }
-  cpi->last_time_stamp_seen = source->ts_start;
-  cpi->last_end_time_stamp_seen = source->ts_end;
-}
-
-// Returns 0 if this is not an alt ref else the offset of the source frame
-// used as the arf midpoint.
-static int get_arf_src_index(AV1_COMP *cpi) {
-  RATE_CONTROL *const rc = &cpi->rc;
-  int arf_src_index = 0;
-  if (is_altref_enabled(cpi)) {
-    if (cpi->oxcf.pass == 2) {
-      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-      if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
-        arf_src_index = gf_group->arf_src_offset[gf_group->index];
-      }
-    } else if (rc->source_alt_ref_pending) {
-      arf_src_index = rc->frames_till_gf_update_due;
-    }
-  }
-  return arf_src_index;
-}
-
-static int get_brf_src_index(AV1_COMP *cpi) {
-  int brf_src_index = 0;
-  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-
-  // TODO(zoeliu): We need to add the check on the -bwd_ref command line setup
-  //               flag.
-  if (gf_group->bidir_pred_enabled[gf_group->index]) {
-    if (cpi->oxcf.pass == 2) {
-      if (gf_group->update_type[gf_group->index] == BRF_UPDATE)
-        brf_src_index = gf_group->brf_src_offset[gf_group->index];
-    } else {
-      // TODO(zoeliu): To re-visit the setup for this scenario
-      brf_src_index = cpi->rc.bipred_group_interval - 1;
-    }
-  }
-
-  return brf_src_index;
-}
-
-// Returns 0 if this is not an alt ref else the offset of the source frame
-// used as the arf midpoint.
-static int get_arf2_src_index(AV1_COMP *cpi) {
-  int arf2_src_index = 0;
-  if (is_altref_enabled(cpi) && cpi->num_extra_arfs) {
-    if (cpi->oxcf.pass == 2) {
-      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-      if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
-        arf2_src_index = gf_group->arf_src_offset[gf_group->index];
-      }
-    }
-  }
-  return arf2_src_index;
-}
-
-static void check_src_altref(AV1_COMP *cpi,
-                             const struct lookahead_entry *source) {
-  RATE_CONTROL *const rc = &cpi->rc;
-
-  // If pass == 2, the parameters set here will be reset in
-  // av1_rc_get_second_pass_params()
-
-  if (cpi->oxcf.pass == 2) {
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    rc->is_src_frame_alt_ref =
-        (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) ||
-        (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
-    rc->is_src_frame_ext_arf =
-        gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE;
-  } else {
-    rc->is_src_frame_alt_ref =
-        cpi->alt_ref_source && (source == cpi->alt_ref_source);
-  }
-
-  if (rc->is_src_frame_alt_ref) {
-    // Current frame is an ARF overlay frame.
-    cpi->alt_ref_source = NULL;
-
-    if (rc->is_src_frame_ext_arf && !cpi->common.show_existing_frame) {
-      // For INTNL_OVERLAY, when show_existing_frame == 0, they do need to
-      // refresh the LAST_FRAME, i.e. LAST3 gets retired, LAST2 becomes LAST3,
-      // LAST becomes LAST2, and INTNL_OVERLAY becomes LAST.
-      cpi->refresh_last_frame = 1;
-    } else {
-      // Don't refresh the last buffer for an ARF overlay frame. It will
-      // become the GF so preserve last as an alternative prediction option.
-      cpi->refresh_last_frame = 0;
-    }
-  }
-}
-
 #if CONFIG_INTERNAL_STATS
 extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
                                  const unsigned char *img2, int img2_pitch,
@@ -5768,7 +5346,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
   }
   if (cm->show_frame) {
     const YV12_BUFFER_CONFIG *orig = cpi->source;
-    const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+    const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
     double y, u, v, frame_all;
 
     cpi->count++;
@@ -5843,738 +5421,31 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
   }
 }
 #endif  // CONFIG_INTERNAL_STATS
-
-static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
-                         const YV12_BUFFER_CONFIG *last_picture,
-                         hash_table *last_hash_table) {
-  aom_clear_system_state();
-  // check use hash ME
-  int k;
-  uint32_t hash_value_1;
-  uint32_t hash_value_2;
-
-  const int block_size = 8;
-  const double threshold_current = 0.8;
-  const double threshold_average = 0.95;
-  const int max_history_size = 32;
-  int T = 0;  // total block
-  int C = 0;  // match with collocated block
-  int S = 0;  // smooth region but not match with collocated block
-  int M = 0;  // match with other block
-
-  const int pic_width = cur_picture->y_width;
-  const int pic_height = cur_picture->y_height;
-  for (int i = 0; i + block_size <= pic_height; i += block_size) {
-    for (int j = 0; j + block_size <= pic_width; j += block_size) {
-      const int x_pos = j;
-      const int y_pos = i;
-      int match = 1;
-      T++;
-
-      // check whether collocated block match with current
-      uint8_t *p_cur = cur_picture->y_buffer;
-      uint8_t *p_ref = last_picture->y_buffer;
-      int stride_cur = cur_picture->y_stride;
-      int stride_ref = last_picture->y_stride;
-      p_cur += (y_pos * stride_cur + x_pos);
-      p_ref += (y_pos * stride_ref + x_pos);
-
-      if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
-        uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
-        uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
-        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
-          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
-            if (p16_cur[tmpX] != p16_ref[tmpX]) {
-              match = 0;
-            }
-          }
-          p16_cur += stride_cur;
-          p16_ref += stride_ref;
-        }
-      } else {
-        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
-          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
-            if (p_cur[tmpX] != p_ref[tmpX]) {
-              match = 0;
-            }
-          }
-          p_cur += stride_cur;
-          p_ref += stride_ref;
-        }
-      }
-
-      if (match) {
-        C++;
-        continue;
-      }
-
-      if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
-                                         y_pos) ||
-          av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
-        S++;
-        continue;
-      }
-
-      av1_get_block_hash_value(
-          cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
-          block_size, &hash_value_1, &hash_value_2,
-          (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb);
-      // Hashing does not work for highbitdepth currently.
-      // TODO(Roger): Make it work for highbitdepth.
-      if (av1_use_hash_me(&cpi->common)) {
-        if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) {
-          M++;
-        }
-      }
-    }
-  }
-
-  assert(T > 0);
-  double csm_rate = ((double)(C + S + M)) / ((double)(T));
-  double m_rate = ((double)(M)) / ((double)(T));
-
-  cpi->csm_rate_array[cpi->rate_index] = csm_rate;
-  cpi->m_rate_array[cpi->rate_index] = m_rate;
-
-  cpi->rate_index = (cpi->rate_index + 1) % max_history_size;
-  cpi->rate_size++;
-  cpi->rate_size = AOMMIN(cpi->rate_size, max_history_size);
-
-  if (csm_rate < threshold_current) {
-    return 0;
-  }
-
-  if (C == T) {
-    return 1;
-  }
-
-  double csm_average = 0.0;
-  double m_average = 0.0;
-
-  for (k = 0; k < cpi->rate_size; k++) {
-    csm_average += cpi->csm_rate_array[k];
-    m_average += cpi->m_rate_array[k];
-  }
-  csm_average /= cpi->rate_size;
-  m_average /= cpi->rate_size;
-
-  if (csm_average < threshold_average) {
-    return 0;
-  }
-
-  if (M > (T - C - S) / 3) {
-    return 1;
-  }
-
-  if (csm_rate > 0.99 && m_rate > 0.01) {
-    return 1;
-  }
-
-  if (csm_average + m_average > 1.01) {
-    return 1;
-  }
-
-  return 0;
-}
-
-// Code for temporal dependency model
-typedef struct GF_PICTURE {
-  YV12_BUFFER_CONFIG *frame;
-  int ref_frame[7];
-} GF_PICTURE;
-
-static void init_gop_frames(AV1_COMP *cpi, GF_PICTURE *gf_picture,
-                            const GF_GROUP *gf_group, int *tpl_group_frames) {
-  AV1_COMMON *cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
-  int frame_idx = 0;
-  int i;
-  int gld_index = -1;
-  int alt_index = -1;
-  int lst_index = -1;
-  int extend_frame_count = 0;
-  int pframe_qindex = cpi->tpl_stats[2].base_qindex;
-
-  RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
-  int recon_frame_index[INTER_REFS_PER_FRAME + 1] = { -1, -1, -1, -1,
-                                                      -1, -1, -1, -1 };
-
-  // TODO(jingning): To be used later for gf frame type parsing.
-  (void)gf_group;
-
-  for (i = 0; i < FRAME_BUFFERS && frame_idx < INTER_REFS_PER_FRAME + 1; ++i) {
-    if (frame_bufs[i].ref_count == 0) {
-      alloc_frame_mvs(cm, i);
-      if (aom_realloc_frame_buffer(
-              &frame_bufs[i].buf, cm->width, cm->height,
-              seq_params->subsampling_x, seq_params->subsampling_y,
-              seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
-              cm->byte_alignment, NULL, NULL, NULL))
-        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                           "Failed to allocate frame buffer");
-
-      recon_frame_index[frame_idx] = i;
-      ++frame_idx;
-    }
-  }
-
-  for (i = 0; i < INTER_REFS_PER_FRAME + 1; ++i) {
-    assert(recon_frame_index[i] >= 0);
-    cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
-  }
-
-  *tpl_group_frames = 0;
-
-  // Initialize Golden reference frame.
-  gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
-  for (i = 0; i < 7; ++i) gf_picture[0].ref_frame[i] = -1;
-  gld_index = 0;
-  ++*tpl_group_frames;
-
-  // Initialize ARF frame
-  gf_picture[1].frame = cpi->source;
-  gf_picture[1].ref_frame[0] = gld_index;
-  gf_picture[1].ref_frame[1] = lst_index;
-  gf_picture[1].ref_frame[2] = alt_index;
-  // TODO(yuec) Need o  figure out full AV1 reference model
-  for (i = 3; i < 7; ++i) gf_picture[1].ref_frame[i] = -1;
-  alt_index = 1;
-  ++*tpl_group_frames;
-
-  // Initialize P frames
-  for (frame_idx = 2; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
-    struct lookahead_entry *buf =
-        av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
-
-    if (buf == NULL) break;
-
-    gf_picture[frame_idx].frame = &buf->img;
-    gf_picture[frame_idx].ref_frame[0] = gld_index;
-    gf_picture[frame_idx].ref_frame[1] = lst_index;
-    gf_picture[frame_idx].ref_frame[2] = alt_index;
-    for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
-
-    ++*tpl_group_frames;
-    lst_index = frame_idx;
-
-    if (frame_idx == cpi->rc.baseline_gf_interval + 1) break;
-  }
-
-  gld_index = frame_idx;
-  lst_index = AOMMAX(0, frame_idx - 1);
-  alt_index = -1;
-  ++frame_idx;
-
-  // Extend two frames outside the current gf group.
-  for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
-    struct lookahead_entry *buf =
-        av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
-
-    if (buf == NULL) break;
-
-    cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
-
-    gf_picture[frame_idx].frame = &buf->img;
-    gf_picture[frame_idx].ref_frame[0] = gld_index;
-    gf_picture[frame_idx].ref_frame[1] = lst_index;
-    gf_picture[frame_idx].ref_frame[2] = alt_index;
-    for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
-    lst_index = frame_idx;
-    ++*tpl_group_frames;
-    ++extend_frame_count;
-  }
-}
-
-static void init_tpl_stats(AV1_COMP *cpi) {
-  int frame_idx;
-  for (frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
-    TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-    memset(tpl_frame->tpl_stats_ptr, 0,
-           tpl_frame->height * tpl_frame->width *
-               sizeof(*tpl_frame->tpl_stats_ptr));
-    tpl_frame->is_valid = 0;
-  }
-}
-
-static uint32_t motion_compensated_prediction(AV1_COMP *cpi, ThreadData *td,
-                                              uint8_t *cur_frame_buf,
-                                              uint8_t *ref_frame_buf,
-                                              int stride, BLOCK_SIZE bsize,
-                                              int mi_row, int mi_col) {
-  AV1_COMMON *cm = &cpi->common;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  const SEARCH_METHODS search_method = NSTEP;
-  int step_param;
-  int sadpb = x->sadperbit16;
-  uint32_t bestsme = UINT_MAX;
-  int distortion;
-  uint32_t sse;
-  int cost_list[5];
-  const MvLimits tmp_mv_limits = x->mv_limits;
-
-  MV best_ref_mv1 = { 0, 0 };
-  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
-  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
-  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
-
-  // Setup frame pointers
-  x->plane[0].src.buf = cur_frame_buf;
-  x->plane[0].src.stride = stride;
-  xd->plane[0].pre[0].buf = ref_frame_buf;
-  xd->plane[0].pre[0].stride = stride;
-
-  step_param = mv_sf->reduce_first_step_size;
-  step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
-
-  av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
-
-  av1_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
-                        search_method, 0, sadpb, cond_cost_list(cpi, cost_list),
-                        &best_ref_mv1, INT_MAX, 0, (MI_SIZE * mi_col),
-                        (MI_SIZE * mi_row), 0);
-
-  /* restore UMV window */
-  x->mv_limits = tmp_mv_limits;
-
-  const int pw = block_size_wide[bsize];
-  const int ph = block_size_high[bsize];
-  bestsme = cpi->find_fractional_mv_step(
-      x, cm, mi_row, mi_col, &best_ref_mv1, cpi->common.allow_high_precision_mv,
-      x->errorperbit, &cpi->fn_ptr[bsize], 0, mv_sf->subpel_iters_per_step,
-      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL,
-      0, 0, pw, ph, 1, 1);
-
-  return bestsme;
-}
-
-static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
-                            int ref_pos_col, int block, BLOCK_SIZE bsize) {
-  int width = 0, height = 0;
-  int bw = 4 << mi_size_wide_log2[bsize];
-  int bh = 4 << mi_size_high_log2[bsize];
-
-  switch (block) {
-    case 0:
-      width = grid_pos_col + bw - ref_pos_col;
-      height = grid_pos_row + bh - ref_pos_row;
-      break;
-    case 1:
-      width = ref_pos_col + bw - grid_pos_col;
-      height = grid_pos_row + bh - ref_pos_row;
-      break;
-    case 2:
-      width = grid_pos_col + bw - ref_pos_col;
-      height = ref_pos_row + bh - grid_pos_row;
-      break;
-    case 3:
-      width = ref_pos_col + bw - grid_pos_col;
-      height = ref_pos_row + bh - grid_pos_row;
-      break;
-    default: assert(0);
-  }
-
-  return width * height;
-}
-
-static int round_floor(int ref_pos, int bsize_pix) {
-  int round;
-  if (ref_pos < 0)
-    round = -(1 + (-ref_pos - 1) / bsize_pix);
-  else
-    round = ref_pos / bsize_pix;
-
-  return round;
-}
-
-static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
-                            BLOCK_SIZE bsize, int stride,
-                            const TplDepStats *src_stats) {
-  const int mi_height = mi_size_high[bsize];
-  const int mi_width = mi_size_wide[bsize];
-  int idx, idy;
-
-  int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
-  int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
-
-  TplDepStats *tpl_ptr;
-
-  intra_cost = AOMMAX(1, intra_cost);
-  inter_cost = AOMMAX(1, inter_cost);
-
-  for (idy = 0; idy < mi_height; ++idy) {
-    tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col];
-    for (idx = 0; idx < mi_width; ++idx) {
-      tpl_ptr->intra_cost = intra_cost;
-      tpl_ptr->inter_cost = inter_cost;
-      tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
-      tpl_ptr->ref_frame_index = src_stats->ref_frame_index;
-      tpl_ptr->mv.as_int = src_stats->mv.as_int;
-      ++tpl_ptr;
-    }
-  }
-}
-
-static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
-                               int mi_row, int mi_col, const BLOCK_SIZE bsize) {
-  TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
-  TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
-  MV mv = tpl_stats->mv.as_mv;
-  int mv_row = mv.row >> 3;
-  int mv_col = mv.col >> 3;
-
-  int ref_pos_row = mi_row * MI_SIZE + mv_row;
-  int ref_pos_col = mi_col * MI_SIZE + mv_col;
-
-  const int bw = 4 << mi_size_wide_log2[bsize];
-  const int bh = 4 << mi_size_high_log2[bsize];
-  const int mi_height = mi_size_high[bsize];
-  const int mi_width = mi_size_wide[bsize];
-  const int pix_num = bw * bh;
-
-  // top-left on grid block location in pixel
-  int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
-  int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
-  int block;
-
-  for (block = 0; block < 4; ++block) {
-    int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
-    int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
-
-    if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
-        grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
-      int overlap_area = get_overlap_area(
-          grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
-      int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
-      int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
-
-      int64_t mc_flow = tpl_stats->mc_dep_cost -
-                        (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
-                            tpl_stats->intra_cost;
-
-      int idx, idy;
-
-      for (idy = 0; idy < mi_height; ++idy) {
-        for (idx = 0; idx < mi_width; ++idx) {
-          TplDepStats *des_stats =
-              &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
-                         (ref_mi_col + idx)];
-
-          des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
-          des_stats->mc_ref_cost +=
-              ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) /
-              pix_num;
-          assert(overlap_area >= 0);
-        }
-      }
-    }
-  }
-}
-
-static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
-                             int mi_row, int mi_col, const BLOCK_SIZE bsize) {
-  int idx, idy;
-  const int mi_height = mi_size_high[bsize];
-  const int mi_width = mi_size_wide[bsize];
-
-  for (idy = 0; idy < mi_height; ++idy) {
-    for (idx = 0; idx < mi_width; ++idx) {
-      TplDepStats *tpl_ptr =
-          &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
-      tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
-                         BLOCK_4X4);
-    }
-  }
-}
-
-static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
-                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
-                               TX_SIZE tx_size, int64_t *recon_error,
-                               int64_t *sse) {
-  const struct macroblock_plane *const p = &x->plane[plane];
-  const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
-  uint16_t eob;
-  int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
-  const int shift = tx_size == TX_32X32 ? 0 : 2;
-
-  av1_quantize_fp_32x32(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX,
-                        p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, dqcoeff,
-                        p->dequant_QTX, &eob, scan_order->scan,
-                        scan_order->iscan);
-
-  *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
-  *recon_error = AOMMAX(*recon_error, 1);
-
-  *sse = (*sse) >> shift;
-  *sse = AOMMAX(*sse, 1);
-}
-
-static void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
-                         TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_8X8: aom_hadamard_8x8(src_diff, bw, coeff); break;
-    case TX_16X16: aom_hadamard_16x16(src_diff, bw, coeff); break;
-    case TX_32X32: aom_hadamard_32x32(src_diff, bw, coeff); break;
-    default: assert(0);
-  }
-}
-
-static void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
-                            struct scale_factors *sf, GF_PICTURE *gf_picture,
-                            int frame_idx, int16_t *src_diff, tran_low_t *coeff,
-                            tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
-                            int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
-                            YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
-                            int64_t *recon_error, int64_t *sse,
-                            TplDepStats *tpl_stats) {
-  AV1_COMMON *cm = &cpi->common;
-  ThreadData *td = &cpi->td;
-
-  const int bw = 4 << mi_size_wide_log2[bsize];
-  const int bh = 4 << mi_size_high_log2[bsize];
-  const int pix_num = bw * bh;
-  int best_rf_idx = -1;
-  int_mv best_mv;
-  int64_t best_inter_cost = INT64_MAX;
-  int64_t inter_cost;
-  int rf_idx;
-  const InterpFilters kernel =
-      av1_make_interp_filters(EIGHTTAP_REGULAR, EIGHTTAP_REGULAR);
-
-  int64_t best_intra_cost = INT64_MAX;
-  int64_t intra_cost;
-  PREDICTION_MODE mode;
-  int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
-  MB_MODE_INFO mi_above, mi_left;
-
-  memset(tpl_stats, 0, sizeof(*tpl_stats));
-
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
-  xd->above_mbmi = (mi_row > 0) ? &mi_above : NULL;
-  xd->left_mbmi = (mi_col > 0) ? &mi_left : NULL;
-
-  // Intra prediction search
-  for (mode = DC_PRED; mode <= PAETH_PRED; ++mode) {
-    uint8_t *src, *dst;
-    int src_stride, dst_stride;
-
-    src = xd->cur_buf->y_buffer + mb_y_offset;
-    src_stride = xd->cur_buf->y_stride;
-
-    dst = &predictor[0];
-    dst_stride = bw;
-
-    xd->mi[0]->sb_type = bsize;
-    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-
-    av1_predict_intra_block(
-        cm, xd, block_size_wide[bsize], block_size_high[bsize], tx_size, mode,
-        0, 0, FILTER_INTRA_MODES, src, src_stride, dst, dst_stride, 0, 0, 0);
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      aom_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
-                                dst_stride, xd->bd);
-    } else {
-      aom_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
-                         dst_stride);
-    }
-
-    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-
-    intra_cost = aom_satd(coeff, pix_num);
-
-    if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
-  }
-
-  // Motion compensated prediction
-  best_mv.as_int = 0;
-
-  (void)mb_y_offset;
-  // Motion estimation column boundary
-  x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
-  x->mv_limits.col_max =
-      ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND);
-
-  for (rf_idx = 0; rf_idx < 7; ++rf_idx) {
-    if (ref_frame[rf_idx] == NULL) continue;
-
-    motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
-                                  ref_frame[rf_idx]->y_buffer + mb_y_offset,
-                                  xd->cur_buf->y_stride, bsize, mi_row, mi_col);
-
-    // TODO(jingning): Not yet support high bit-depth in the next three
-    // steps.
-    ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
-    WarpTypesAllowed warp_types;
-    memset(&warp_types, 0, sizeof(WarpTypesAllowed));
-
-    av1_build_inter_predictor(
-        ref_frame[rf_idx]->y_buffer + mb_y_offset, ref_frame[rf_idx]->y_stride,
-        &predictor[0], bw, &x->best_mv.as_mv, sf, bw, bh, &conv_params, kernel,
-        &warp_types, mi_col * MI_SIZE, mi_row * MI_SIZE, 0, 0, MV_PRECISION_Q3,
-        mi_col * MI_SIZE, mi_row * MI_SIZE, xd, 0);
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      aom_highbd_subtract_block(
-          bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
-          xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
-    } else {
-      aom_subtract_block(bh, bw, src_diff, bw,
-                         xd->cur_buf->y_buffer + mb_y_offset,
-                         xd->cur_buf->y_stride, &predictor[0], bw);
-    }
-    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-
-    inter_cost = aom_satd(coeff, pix_num);
-    if (inter_cost < best_inter_cost) {
-      best_rf_idx = rf_idx;
-      best_inter_cost = inter_cost;
-      best_mv.as_int = x->best_mv.as_int;
-      get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error,
-                         sse);
-    }
-  }
-  best_intra_cost = AOMMAX(best_intra_cost, 1);
-  best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
-  tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
-  tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
-  tpl_stats->mc_dep_cost = tpl_stats->intra_cost + tpl_stats->mc_flow;
-
-  tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
-  tpl_stats->mv.as_int = best_mv.as_int;
-}
-
-static void mc_flow_dispenser(AV1_COMP *cpi, GF_PICTURE *gf_picture,
-                              int frame_idx) {
-  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-  YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
-  YV12_BUFFER_CONFIG *ref_frame[7] = {
-    NULL, NULL, NULL, NULL, NULL, NULL, NULL
-  };
-
-  AV1_COMMON *cm = &cpi->common;
-  struct scale_factors sf;
-  int rdmult, idx;
-  ThreadData *td = &cpi->td;
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int mi_row, mi_col;
-
-  DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
-  DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]);
-  uint8_t *predictor;
-  DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
-  DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
-  DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
-
-  const BLOCK_SIZE bsize = BLOCK_32X32;
-  const TX_SIZE tx_size = max_txsize_lookup[bsize];
-  const int mi_height = mi_size_high[bsize];
-  const int mi_width = mi_size_wide[bsize];
-  int64_t recon_error, sse;
-
-  // Setup scaling factor
-  av1_setup_scale_factors_for_frame(
-      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
-      this_frame->y_crop_width, this_frame->y_crop_height);
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    predictor = CONVERT_TO_BYTEPTR(predictor16);
-  else
-    predictor = predictor8;
-
-  // Prepare reference frame pointers. If any reference frame slot is
-  // unavailable, the pointer will be set to Null.
-  for (idx = 0; idx < 7; ++idx) {
-    int rf_idx = gf_picture[frame_idx].ref_frame[idx];
-    if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
-  }
-
-  xd->mi = cm->mi_grid_visible;
-  xd->mi[0] = cm->mi;
-  xd->cur_buf = this_frame;
-
-  // Get rd multiplier set up.
-  rdmult = (int)av1_compute_rd_mult(cpi, tpl_frame->base_qindex);
-  if (rdmult < 1) rdmult = 1;
-  set_error_per_bit(&cpi->td.mb, rdmult);
-  av1_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex);
-
-  tpl_frame->is_valid = 1;
-
-  cm->base_qindex = tpl_frame->base_qindex;
-  av1_frame_init_quantizer(cpi);
-
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
-    // Motion estimation row boundary
-    x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
-    x->mv_limits.row_max =
-        (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * AOM_INTERP_EXTEND);
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
-      TplDepStats tpl_stats;
-      mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, src_diff, coeff,
-                      qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size,
-                      ref_frame, predictor, &recon_error, &sse, &tpl_stats);
-
-      // Motion flow dependency dispenser.
-      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
-                      tpl_frame->stride, &tpl_stats);
-
-      tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
-                       bsize);
-    }
-  }
-}
-
-static void setup_tpl_stats(AV1_COMP *cpi) {
-  GF_PICTURE gf_picture[MAX_LAG_BUFFERS];
-  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-  int tpl_group_frames = 0;
-  int frame_idx;
-
-  init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames);
-
-  init_tpl_stats(cpi);
-
-  // Backward propagation from tpl_group_frames to 1.
-  for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx)
-    mc_flow_dispenser(cpi, gf_picture, frame_idx);
-}
-
 int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest, int64_t *time_stamp,
                             int64_t *time_end, int flush,
                             const aom_rational_t *timebase) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
-  CurrentFrame *const current_frame = &cm->current_frame;
-  const int num_planes = av1_num_planes(cm);
-  BufferPool *const pool = cm->buffer_pool;
-  RATE_CONTROL *const rc = &cpi->rc;
-  struct aom_usec_timer cmptimer;
-  YV12_BUFFER_CONFIG *force_src_buffer = NULL;
-  struct lookahead_entry *last_source = NULL;
-  struct lookahead_entry *source = NULL;
-  int arf_src_index;
-  int brf_src_index;
-  int i;
 
 #if CONFIG_BITSTREAM_DEBUG
   assert(cpi->oxcf.max_threads == 0 &&
          "bitstream debug tool does not support multithreading");
   bitstream_queue_record_write();
-  bitstream_queue_set_frame_write(current_frame->frame_number * 2 +
+  bitstream_queue_set_frame_write(cm->current_frame.frame_number * 2 +
                                   cm->show_frame);
 #endif
 
+  // Indicates whether or not to use an adaptive quantize b rather than
+  // the traditional version
+  cm->use_quant_b_adapt = cpi->oxcf.quant_b_adapt;
+
   cm->showable_frame = 0;
+  *size = 0;
+#if CONFIG_INTERNAL_STATS
+  struct aom_usec_timer cmptimer;
   aom_usec_timer_start(&cmptimer);
-
+#endif
   set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0);
 
   // Normal defaults
@@ -6584,387 +5455,42 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   if (oxcf->large_scale_tile)
     cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
-  // default reference buffers update config
-  av1_configure_buffer_updates_firstpass(cpi, LF_UPDATE);
-
   // Initialize fields related to forward keyframes
   cpi->no_show_kf = 0;
-  cm->reset_decoder_state = 0;
-
-  // Don't allow a show_existing_frame to coincide with an error resilient or
-  // S-Frame. An exception can be made in the case of a keyframe, since it
-  // does not depend on any previous frames. We must make this exception here
-  // because of the use of show_existing_frame with forward coded keyframes.
-  struct lookahead_entry *lookahead_src = NULL;
-  if (current_frame->frame_number > 0)
-    lookahead_src = av1_lookahead_peek(cpi->lookahead, 0);
-
-  int use_show_existing = 1;
-  if (lookahead_src != NULL) {
-    const int is_error_resilient =
-        cpi->oxcf.error_resilient_mode ||
-        (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
-    const int is_s_frame = cpi->oxcf.s_frame_mode ||
-                           (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
-    const int is_key_frame =
-        (rc->frames_to_key == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY);
-    use_show_existing = !(is_error_resilient || is_s_frame) || is_key_frame;
-  }
-
-  if (oxcf->pass == 2 && cm->show_existing_frame && use_show_existing) {
-    // Manage the source buffer and flush out the source frame that has been
-    // coded already; Also get prepared for PSNR calculation if needed.
-    if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) {
-      *size = 0;
-      return -1;
-    }
-    av1_apply_encoding_flags(cpi, source->flags);
-    cpi->source = &source->img;
-    // TODO(zoeliu): To track down to determine whether it's needed to adjust
-    // the frame rate.
-    *time_stamp = source->ts_start;
-    *time_end = source->ts_end;
-
-    // We need to adjust frame rate for an overlay frame
-    if (cpi->rc.is_src_frame_alt_ref) adjust_frame_rate(cpi, source);
-
-    // Find a free buffer for the new frame, releasing the reference
-    // previously held.
-    if (cm->new_fb_idx != INVALID_IDX) {
-      --pool->frame_bufs[cm->new_fb_idx].ref_count;
-    }
-
-    cm->cur_frame = NULL;
-    cm->new_fb_idx = get_free_fb(cm);
-    if (cm->new_fb_idx == INVALID_IDX) return -1;
-    cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
-
-    // Clear down mmx registers
-    aom_clear_system_state();
-
-    // Start with a 0 size frame.
-    *size = 0;
-
-    // We need to update the gf_group for show_existing overlay frame
-    if (cpi->rc.is_src_frame_alt_ref) av1_rc_get_second_pass_params(cpi);
-
-    if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK)
-      return AOM_CODEC_ERROR;
-
-    if (cpi->b_calculate_psnr) generate_psnr_packet(cpi);
-
-#if CONFIG_INTERNAL_STATS
-    compute_internal_stats(cpi, (int)(*size));
-#endif  // CONFIG_INTERNAL_STATS
-
-    // Clear down mmx registers
-    aom_clear_system_state();
-
-    cm->show_existing_frame = 0;
-    return 0;
-  }
-
-  // Should we encode an arf frame.
-  arf_src_index = get_arf_src_index(cpi);
-  if (arf_src_index) {
-    for (i = 0; i <= arf_src_index; ++i) {
-      struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
-      // Avoid creating an alt-ref if there's a forced keyframe pending.
-      if (e == NULL) {
-        break;
-      } else if (e->flags == AOM_EFLAG_FORCE_KF) {
-        arf_src_index = 0;
-        flush = 1;
-        break;
-      }
-    }
-  }
-
-  if (arf_src_index) {
-    assert(arf_src_index <= rc->frames_to_key);
-
-    if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
-      cm->showable_frame = 1;
-      cpi->alt_ref_source = source;
-      // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
-      if (arf_src_index == rc->frames_to_key) {
-        // Skip temporal filtering and mark as intra_only if we have a fwd_kf
-        const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-        int which_arf = gf_group->arf_update_idx[gf_group->index];
-        cpi->is_arf_filter_off[which_arf] = 1;
-        cpi->no_show_kf = 1;
-      } else {
-        if (oxcf->arnr_max_frames > 0) {
-          // Produce the filtered ARF frame.
-          av1_temporal_filter(cpi, arf_src_index);
-          aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
-          force_src_buffer = &cpi->alt_ref_buffer;
-        }
-      }
-      cm->show_frame = 0;
-      current_frame->intra_only = 0;
-
-      if (oxcf->pass < 2) {
-        // In second pass, the buffer updates configure will be set
-        // in the function av1_rc_get_second_pass_params
-        av1_configure_buffer_updates_firstpass(cpi, ARF_UPDATE);
-      }
-    }
-    rc->source_alt_ref_pending = 0;
-  }
-
-  // Should we encode an arf2 frame.
-  arf_src_index = get_arf2_src_index(cpi);
-  if (arf_src_index) {
-    for (i = 0; i <= arf_src_index; ++i) {
-      struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
-      // Avoid creating an alt-ref if there's a forced keyframe pending.
-      if (e == NULL) {
-        break;
-      } else if (e->flags == AOM_EFLAG_FORCE_KF) {
-        arf_src_index = 0;
-        flush = 1;
-        break;
-      }
-    }
-  }
-
-  if (arf_src_index) {
-    assert(arf_src_index <= rc->frames_to_key);
-
-    if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
-      cm->showable_frame = 1;
-      cpi->alt_ref_source = source;
-
-      if (oxcf->arnr_max_frames > 0) {
-        // Produce the filtered ARF frame.
-        av1_temporal_filter(cpi, arf_src_index);
-        aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
-        force_src_buffer = &cpi->alt_ref_buffer;
-      }
-
-      cm->show_frame = 0;
-      current_frame->intra_only = 0;
-
-      if (oxcf->pass < 2) {
-        // In second pass, the buffer updates configure will be set
-        // in the function av1_rc_get_second_pass_params
-        av1_configure_buffer_updates_firstpass(cpi, INTNL_ARF_UPDATE);
-      }
-    }
-    rc->source_alt_ref_pending = 0;
-  }
-
-  rc->is_bwd_ref_frame = 0;
-  brf_src_index = get_brf_src_index(cpi);
-  if (brf_src_index) {
-    assert(brf_src_index <= rc->frames_to_key);
-    if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) {
-      cm->showable_frame = 1;
-      cm->show_frame = 0;
-      current_frame->intra_only = 0;
-
-      if (oxcf->pass < 2) {
-        // In second pass, the buffer updates configure will be set
-        // in the function av1_rc_get_second_pass_params
-        av1_configure_buffer_updates_firstpass(cpi, BIPRED_UPDATE);
-      }
-    }
-  }
 
-  if (!source) {
-    // Get last frame source.
-    if (current_frame->frame_number > 0) {
-      if ((last_source = av1_lookahead_peek(cpi->lookahead, -1)) == NULL)
-        return -1;
-    }
-    if (current_frame->frame_number > 0) assert(last_source != NULL);
-    // Read in the source frame.
-    source = av1_lookahead_pop(cpi->lookahead, flush);
-
-    if (source != NULL) {
-      cm->show_frame = 1;
-      current_frame->intra_only = 0;
-
-      // Check to see if the frame should be encoded as an arf overlay.
-      check_src_altref(cpi, source);
-    }
-  }
-  if (source) {
-    cpi->unscaled_source = cpi->source =
-        force_src_buffer ? force_src_buffer : &source->img;
-    cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
+  if (assign_cur_frame_new_fb(cm) == NULL) return AOM_CODEC_ERROR;
 
-    *time_stamp = source->ts_start;
-    *time_end = source->ts_end;
-    av1_apply_encoding_flags(cpi, source->flags);
-    *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
-
-  } else {
-    *size = 0;
-    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
-      av1_end_first_pass(cpi); /* get last stats packet */
-      cpi->twopass.first_pass_done = 1;
-    }
+  const int result = av1_encode_strategy(cpi, size, dest, frame_flags,
+                                         time_stamp, time_end, timebase, flush);
+  if (result != AOM_CODEC_OK && result != -1) {
+    return AOM_CODEC_ERROR;
+  } else if (result == -1) {
+    // Returning -1 indicates no frame encoded; more input is required
     return -1;
   }
-
-  if (source->ts_start < cpi->first_time_stamp_ever) {
-    cpi->first_time_stamp_ever = source->ts_start;
-    cpi->last_end_time_stamp_seen = source->ts_start;
-  }
-
-  // Clear down mmx registers
-  aom_clear_system_state();
-
-  // adjust frame rates based on timestamps given
-  if (cm->show_frame) adjust_frame_rate(cpi, source);
-
-  // Find a free buffer for the new frame, releasing the reference previously
-  // held.
-  if (cm->new_fb_idx != INVALID_IDX) {
-    --pool->frame_bufs[cm->new_fb_idx].ref_count;
-  }
-  cm->new_fb_idx = get_free_fb(cm);
-
-  if (cm->new_fb_idx == INVALID_IDX) return -1;
-
-  cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
-  // Retain the RF_LEVEL for the current newly coded frame.
-  cm->cur_frame->frame_rf_level =
-      cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
-
-  cm->cur_frame->buf.buf_8bit_valid = 0;
-
-  if (cpi->film_grain_table) {
-    cm->seq_params.film_grain_params_present = aom_film_grain_table_lookup(
-        cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
-        &cm->film_grain_params);
-  }
-  cm->cur_frame->film_grain_params_present =
-      cm->seq_params.film_grain_params_present;
-
-  // only one operating point supported now
-  const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp);
-  if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
-  cpi->common.frame_presentation_time = (uint32_t)pts64;
-
-  // Start with a 0 size frame.
-  *size = 0;
-
-  cpi->frame_flags = *frame_flags;
-
-  if (oxcf->pass == 2) {
-    av1_rc_get_second_pass_params(cpi);
-  } else if (oxcf->pass == 1) {
-    setup_frame_size(cpi);
-  }
-
-  if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) {
-    for (i = 0; i < INTER_REFS_PER_FRAME; ++i)
-      cpi->scaled_ref_idx[i] = INVALID_IDX;
-  }
-
-  cm->using_qmatrix = cpi->oxcf.using_qm;
-  cm->min_qmlevel = cpi->oxcf.qm_minlevel;
-  cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
-
-  if (cm->seq_params.frame_id_numbers_present_flag && *time_stamp == 0) {
-    cpi->common.current_frame_id = -1;
-  }
-
-  cpi->cur_poc++;
-  if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools &&
-      !frame_is_intra_only(cm)) {
-    if (cpi->common.seq_params.force_integer_mv == 2) {
-      struct lookahead_entry *previous_entry =
-          av1_lookahead_peek(cpi->lookahead, cpi->previous_index);
-      if (!previous_entry)
-        cpi->common.cur_frame_force_integer_mv = 0;
-      else
-        cpi->common.cur_frame_force_integer_mv = is_integer_mv(
-            cpi, cpi->source, &previous_entry->img, cpi->previous_hash_table);
-    } else {
-      cpi->common.cur_frame_force_integer_mv =
-          cpi->common.seq_params.force_integer_mv;
-    }
-  } else {
-    cpi->common.cur_frame_force_integer_mv = 0;
-  }
-
-  if (cpi->twopass.gf_group.index == 1 && cpi->oxcf.enable_tpl_model) {
-    set_frame_size(cpi, cm->width, cm->height);
-    setup_tpl_stats(cpi);
-  }
-
-  if (oxcf->pass == 1) {
-    cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf);
-    av1_first_pass(cpi, source);
-  } else if (oxcf->pass == 2) {
-    if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK)
-      return AOM_CODEC_ERROR;
-  } else {
-    // One pass encode
-    if (Pass0Encode(cpi, size, dest, 0, frame_flags) != AOM_CODEC_OK)
-      return AOM_CODEC_ERROR;
-  }
-  if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
-    cpi->previous_hash_table = &cm->cur_frame->hash_table;
-    {
-      int l;
-      for (l = -MAX_PRE_FRAMES; l < cpi->lookahead->max_sz; l++) {
-        if ((cpi->lookahead->buf + l) == source) {
-          cpi->previous_index = l;
-          break;
-        }
-      }
-
-      if (l == cpi->lookahead->max_sz) {
-        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                           "Failed to find last frame original buffer");
-      }
-    }
-  }
-
-  if (!cm->large_scale_tile) {
-    cm->cur_frame->frame_context = *cm->fc;
-  }
-
-#define EXT_TILE_DEBUG 0
-#if EXT_TILE_DEBUG
-  if (cm->large_scale_tile && oxcf->pass == 2) {
-    char fn[20] = "./fc";
-    fn[4] = current_frame->frame_number / 100 + '0';
-    fn[5] = (current_frame->frame_number % 100) / 10 + '0';
-    fn[6] = (current_frame->frame_number % 10) + '0';
-    fn[7] = '\0';
-    av1_print_frame_contexts(cm->fc, fn);
-  }
-#endif  // EXT_TILE_DEBUG
-#undef EXT_TILE_DEBUG
-
-  cm->showable_frame = !cm->show_frame && cm->showable_frame;
-
-  // No frame encoded, or frame was dropped, release scaled references.
-  if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
-    release_scaled_references(cpi);
-  }
-
-  if (*size > 0) {
-    cpi->droppable = is_frame_droppable(cpi);
-  }
-
+#if CONFIG_INTERNAL_STATS
   aom_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
-
-  if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
-    generate_psnr_packet(cpi);
+#endif
+  if (cpi->b_calculate_psnr) {
+    if (cm->show_existing_frame || (oxcf->pass != 1 && cm->show_frame)) {
+      generate_psnr_packet(cpi);
+    }
+  }
+  if (cpi->keep_level_stats && oxcf->pass != 1)
+    av1_update_level_info(cpi, *size, *time_stamp, *time_end);
 
 #if CONFIG_INTERNAL_STATS
   if (oxcf->pass != 1) {
     compute_internal_stats(cpi, (int)(*size));
   }
 #endif  // CONFIG_INTERNAL_STATS
+#if CONFIG_SPEED_STATS
+  if (cpi->oxcf.pass != 1 && !cm->show_existing_frame) {
+    cpi->tx_search_count += cpi->td.mb.tx_search_count;
+    cpi->td.mb.tx_search_count = 0;
+  }
+#endif  // CONFIG_SPEED_STATS
 
   aom_clear_system_state();
 
@@ -6977,8 +5503,8 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
     return -1;
   } else {
     int ret;
-    if (cm->frame_to_show) {
-      *dest = *cm->frame_to_show;
+    if (cm->cur_frame != NULL) {
+      *dest = cm->cur_frame->buf;
       dest->y_width = cm->width;
       dest->y_height = cm->height;
       dest->uv_width = cm->width >> cm->seq_params.subsampling_x;
@@ -6993,10 +5519,9 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
 }
 
 int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
-  if (cpi->last_show_frame_buf_idx == INVALID_IDX) return -1;
+  if (cpi->last_show_frame_buf == NULL) return -1;
 
-  *frame =
-      cpi->common.buffer_pool->frame_bufs[cpi->last_show_frame_buf_idx].buf;
+  *frame = cpi->last_show_frame_buf->buf;
   return 0;
 }
 
@@ -7148,7 +5673,14 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
       upd ^= AOM_ALT2_FLAG;
     }
 
-    av1_update_reference(cpi, upd);
+    cpi->ext_refresh_last_frame = (upd & AOM_LAST_FLAG) != 0;
+    cpi->ext_refresh_golden_frame = (upd & AOM_GOLD_FLAG) != 0;
+    cpi->ext_refresh_alt_ref_frame = (upd & AOM_ALT_FLAG) != 0;
+    cpi->ext_refresh_bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0;
+    cpi->ext_refresh_alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
+    cpi->ext_refresh_frame_flags_pending = 1;
+  } else {
+    cpi->ext_refresh_frame_flags_pending = 0;
   }
 
   cpi->ext_use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs &
@@ -7164,15 +5696,6 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
   }
 }
 
-int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n) {
-  return n * TICKS_PER_SEC * timebase->num / timebase->den;
-}
-
-int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) {
-  const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
-  return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
-}
-
 aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
   if (!cpi) return NULL;
 
@@ -7189,7 +5712,7 @@ aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
   if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL;
   memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size);
 
-  if (write_obu_header(OBU_SEQUENCE_HEADER, 0, &header_buf[0]) !=
+  if (av1_write_obu_header(cpi, OBU_SEQUENCE_HEADER, 0, &header_buf[0]) !=
       obu_header_size) {
     return NULL;
   }
diff --git a/libaom/av1/encoder/encoder.h b/libaom/av1/encoder/encoder.h
index 1ff2ef7..bf02394 100644
--- a/libaom/av1/encoder/encoder.h
+++ b/libaom/av1/encoder/encoder.h
@@ -12,6 +12,7 @@
 #ifndef AOM_AV1_ENCODER_ENCODER_H_
 #define AOM_AV1_ENCODER_ENCODER_H_
 
+#include <stdbool.h>
 #include <stdio.h>
 
 #include "config/aom_config.h"
@@ -24,11 +25,14 @@
 #include "av1/common/onyxc_int.h"
 #include "av1/common/resize.h"
 #include "av1/common/timing.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/firstpass.h"
+#include "av1/encoder/level.h"
 #include "av1/encoder/lookahead.h"
 #include "av1/encoder/mbgraph.h"
 #include "av1/encoder/mcomp.h"
@@ -36,6 +40,7 @@
 #include "av1/encoder/rd.h"
 #include "av1/encoder/speed_features.h"
 #include "av1/encoder/tokenize.h"
+#include "av1/encoder/block.h"
 
 #if CONFIG_INTERNAL_STATS
 #include "aom_dsp/ssim.h"
@@ -59,36 +64,33 @@ typedef struct {
   FRAME_CONTEXT fc;
 } CODING_CONTEXT;
 
-typedef enum {
-  // regular inter frame
-  REGULAR_FRAME = 0,
-  // alternate reference frame
-  ARF_FRAME = 1,
-  // overlay frame
-  OVERLAY_FRAME = 2,
-  // golden frame
-  GLD_FRAME = 3,
-  // backward reference frame
-  BRF_FRAME = 4,
-  // extra alternate reference frame
-  EXT_ARF_FRAME = 5,
+enum {
+  REGULAR_FRAME,       // regular inter frame
+  ARF_FRAME,           // alternate reference frame
+  OVERLAY_FRAME,       // overlay frame
+  GLD_FRAME,           // golden frame
+  BRF_FRAME,           // backward reference frame
+  INTERNAL_ARF_FRAME,  // internal alternate reference frame
   FRAME_CONTEXT_INDEXES
-} FRAME_CONTEXT_INDEX;
+} UENUM1BYTE(FRAME_CONTEXT_INDEX);
 
-typedef enum {
+enum {
   NORMAL = 0,
   FOURFIVE = 1,
   THREEFIVE = 2,
   ONETWO = 3
-} AOM_SCALING;
+} UENUM1BYTE(AOM_SCALING);
 
-typedef enum {
+enum {
   // Good Quality Fast Encoding. The encoder balances quality with the amount of
   // time it takes to encode the output. Speed setting controls how fast.
-  GOOD
-} MODE;
+  GOOD,
+  // Realtime Fast Encoding. Will force some restrictions on bitrate
+  // constraints.
+  REALTIME
+} UENUM1BYTE(MODE);
 
-typedef enum {
+enum {
   FRAMEFLAGS_KEY = 1 << 0,
   FRAMEFLAGS_GOLDEN = 1 << 1,
   FRAMEFLAGS_BWDREF = 1 << 2,
@@ -97,46 +99,62 @@ typedef enum {
   FRAMEFLAGS_INTRAONLY = 1 << 4,
   FRAMEFLAGS_SWITCH = 1 << 5,
   FRAMEFLAGS_ERROR_RESILIENT = 1 << 6,
-} FRAMETYPE_FLAGS;
+} UENUM1BYTE(FRAMETYPE_FLAGS);
 
-typedef enum {
+enum {
   NO_AQ = 0,
   VARIANCE_AQ = 1,
   COMPLEXITY_AQ = 2,
   CYCLIC_REFRESH_AQ = 3,
   AQ_MODE_COUNT  // This should always be the last member of the enum
-} AQ_MODE;
-typedef enum {
+} UENUM1BYTE(AQ_MODE);
+enum {
   NO_DELTA_Q = 0,
   DELTA_Q_ONLY = 1,
   DELTA_Q_LF = 2,
   DELTAQ_MODE_COUNT  // This should always be the last member of the enum
-} DELTAQ_MODE;
+} UENUM1BYTE(DELTAQ_MODE);
 
-typedef enum {
+enum {
   RESIZE_NONE = 0,    // No frame resizing allowed.
   RESIZE_FIXED = 1,   // All frames are coded at the specified scale.
   RESIZE_RANDOM = 2,  // All frames are coded at a random scale.
   RESIZE_MODES
-} RESIZE_MODE;
+} UENUM1BYTE(RESIZE_MODE);
+
+enum {
+  SUPERRES_NONE,     // No frame superres allowed.
+  SUPERRES_FIXED,    // All frames are coded at the specified scale,
+                     // and super-resolved.
+  SUPERRES_RANDOM,   // All frames are coded at a random scale,
+                     // and super-resolved.
+  SUPERRES_QTHRESH,  // Superres scale for a frame is determined based on
+                     // q_index.
+  SUPERRES_AUTO,     // Automatically select superres for appropriate frames.
+  SUPERRES_MODES
+} UENUM1BYTE(SUPERRES_MODE);
 
 typedef enum {
-  SUPERRES_NONE = 0,     // No frame superres allowed
-  SUPERRES_FIXED = 1,    // All frames are coded at the specified scale,
-                         // and super-resolved.
-  SUPERRES_RANDOM = 2,   // All frames are coded at a random scale,
-                         // and super-resolved.
-  SUPERRES_QTHRESH = 3,  // Superres scale for a frame is determined based on
-                         // q_index
-  SUPERRES_MODES
-} SUPERRES_MODE;
+  kInvalid = 0,
+  kLowSadLowSumdiff = 1,
+  kLowSadHighSumdiff = 2,
+  kHighSadLowSumdiff = 3,
+  kHighSadHighSumdiff = 4,
+  kLowVarHighSumdiff = 5,
+  kVeryHighSad = 6,
+} CONTENT_STATE_SB;
+
+enum {
+  SS_CFG_SRC = 0,
+  SS_CFG_LOOKAHEAD = 1,
+  SS_CFG_TOTAL = 2
+} UENUM1BYTE(SS_CFG_OFFSET);
 
 typedef struct TplDepStats {
   int64_t intra_cost;
   int64_t inter_cost;
   int64_t mc_flow;
   int64_t mc_dep_cost;
-  int64_t mc_ref_cost;
 
   int ref_frame_index;
   int_mv mv;
@@ -153,6 +171,12 @@ typedef struct TplDepFrame {
   int base_qindex;
 } TplDepFrame;
 
+typedef enum {
+  COST_UPD_SB,
+  COST_UPD_SBROW,
+  COST_UPD_TILE,
+} COST_UPDATE_TYPE;
+
 #define TPL_DEP_COST_SCALE_LOG2 4
 
 typedef struct AV1EncoderConfig {
@@ -215,6 +239,7 @@ typedef struct AV1EncoderConfig {
   DELTAQ_MODE deltaq_mode;
   int enable_cdef;
   int enable_restoration;
+  int enable_obmc;
   int disable_trellis_quant;
   int using_qm;
   int qm_y;
@@ -274,6 +299,7 @@ typedef struct AV1EncoderConfig {
 
   int min_gf_interval;
   int max_gf_interval;
+  int gf_max_pyr_height;
 
   int row_mt;
   int tile_columns;
@@ -288,11 +314,6 @@ typedef struct AV1EncoderConfig {
   int max_threads;
 
   aom_fixed_buf_t two_pass_stats_in;
-  struct aom_codec_pkt_list *output_pkt_list;
-
-#if CONFIG_FP_MB_STATS
-  aom_fixed_buf_t firstpass_mb_stats_in;
-#endif
 
   aom_tune_metric tuning;
   aom_tune_content content;
@@ -304,15 +325,12 @@ typedef struct AV1EncoderConfig {
   int color_range;
   int render_width;
   int render_height;
-  aom_timing_info_type_t timing_info_type;
   int timing_info_present;
   aom_timing_info_t timing_info;
   int decoder_model_info_present_flag;
   int display_model_info_present_flag;
   int buffer_removal_time_present;
   aom_dec_model_info_t buffer_model;
-  aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
-  aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
   int film_grain_test_vector;
   const char *film_grain_table_filename;
 
@@ -320,18 +338,44 @@ typedef struct AV1EncoderConfig {
   aom_superblock_size_t superblock_size;
   unsigned int large_scale_tile;
   unsigned int single_tile_decoding;
-  int monochrome;
+  uint8_t monochrome;
   unsigned int full_still_picture_hdr;
   int enable_dual_filter;
   unsigned int motion_vector_unit_test;
   const cfg_options_t *cfg;
+  int enable_rect_partitions;
+  int enable_ab_partitions;
+  int enable_1to4_partitions;
+  int min_partition_size;
+  int max_partition_size;
+  int enable_intra_edge_filter;
+  int enable_tx64;
+  int tx_size_search_method;
+  int enable_flip_idtx;
   int enable_order_hint;
-  int enable_jnt_comp;
+  int enable_dist_wtd_comp;
   int enable_ref_frame_mvs;
+  unsigned int max_reference_frames;
+  int enable_reduced_reference_set;
   unsigned int allow_ref_frame_mvs;
+  int enable_masked_comp;
+  int enable_onesided_comp;
+  int enable_interintra_comp;
+  int enable_smooth_interintra;
+  int enable_diff_wtd_comp;
+  int enable_interinter_wedge;
+  int enable_interintra_wedge;
+  int enable_global_motion;
   int enable_warped_motion;
   int allow_warped_motion;
+  int enable_filter_intra;
+  int enable_smooth_intra;
+  int enable_paeth_intra;
+  int enable_cfl_intra;
   int enable_superres;
+  int enable_palette;
+  int enable_intrabc;
+  int enable_angle_delta;
   unsigned int save_as_annexb;
 
 #if CONFIG_DENOISE
@@ -341,6 +385,18 @@ typedef struct AV1EncoderConfig {
 
   unsigned int chroma_subsampling_x;
   unsigned int chroma_subsampling_y;
+  int reduced_tx_type_set;
+  int use_intra_dct_only;
+  int use_inter_dct_only;
+  int use_intra_default_tx_only;
+  int quant_b_adapt;
+  COST_UPDATE_TYPE coeff_cost_upd_freq;
+  COST_UPDATE_TYPE mode_cost_upd_freq;
+  int border_in_pixels;
+  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+  // Bit mask to specify which tier each of the 32 possible operating points
+  // conforms to.
+  unsigned int tier_mask;
 } AV1EncoderConfig;
 
 static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
@@ -397,7 +453,7 @@ typedef struct FRAME_COUNTS {
   unsigned int interintra[BLOCK_SIZE_GROUPS][2];
   unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
   unsigned int wedge_interintra[BLOCK_SIZES_ALL][2];
-  unsigned int compound_type[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1];
+  unsigned int compound_type[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
   unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES];
   unsigned int obmc[BLOCK_SIZES_ALL][2];
   unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
@@ -433,7 +489,6 @@ typedef struct FRAME_COUNTS {
                                 [SWITCHABLE_FILTERS];
 } FRAME_COUNTS;
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
 #define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
 
 typedef struct {
@@ -467,8 +522,12 @@ typedef struct inter_modes_info {
   int64_t sse_arr[MAX_INTER_MODES];
   int64_t est_rd_arr[MAX_INTER_MODES];
   RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+  bool true_rd_arr[MAX_INTER_MODES];
+  uint8_t blk_skip_arr[MAX_INTER_MODES][MAX_MIB_SIZE * MAX_MIB_SIZE];
+  RD_STATS rd_cost_arr[MAX_INTER_MODES];
+  RD_STATS rd_cost_y_arr[MAX_INTER_MODES];
+  RD_STATS rd_cost_uv_arr[MAX_INTER_MODES];
 } InterModesInfo;
-#endif
 
 // Encoder row synchronization
 typedef struct AV1RowMTSyncData {
@@ -491,16 +550,13 @@ typedef struct AV1RowMTInfo {
 typedef struct TileDataEnc {
   TileInfo tile_info;
   int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
-  int mode_map[BLOCK_SIZES_ALL][MAX_MODES];
   int m_search_count;
   int ex_search_count;
   CFL_CTX cfl;
   DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
-  DECLARE_ALIGNED(16, FRAME_CONTEXT, backup_tctx);
+  FRAME_CONTEXT *row_ctx;
   uint8_t allow_update_cdf;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
   InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
-#endif
   AV1RowMTSync row_mt_sync;
   AV1RowMTInfo row_mt_info;
 } TileDataEnc;
@@ -535,9 +591,7 @@ typedef struct ThreadData {
   tran_low_t *tree_coeff_buf[MAX_MB_PLANE];
   tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE];
   tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE];
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
   InterModesInfo *inter_modes_info;
-#endif
   uint32_t *hash_value_buffer[2][2];
   int32_t *wsrc_buf;
   int32_t *mask_buf;
@@ -560,13 +614,13 @@ typedef struct ActiveMap {
 
 #if CONFIG_INTERNAL_STATS
 // types of stats
-typedef enum {
+enum {
   STAT_Y,
   STAT_U,
   STAT_V,
   STAT_ALL,
   NUM_STAT_TYPES  // This should always be the last member of the enum
-} StatType;
+} UENUM1BYTE(StatType);
 
 typedef struct IMAGE_STAT {
   double stat[NUM_STAT_TYPES];
@@ -579,10 +633,83 @@ typedef struct {
   YV12_BUFFER_CONFIG buf;
 } EncRefCntBuffer;
 
-typedef struct TileBufferEnc {
-  uint8_t *data;
-  size_t size;
-} TileBufferEnc;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+typedef struct PartitionStats {
+  int partition_decisions[6][EXT_PARTITION_TYPES];
+  int partition_attempts[6][EXT_PARTITION_TYPES];
+  int64_t partition_times[6][EXT_PARTITION_TYPES];
+
+  int partition_redo;
+} PartitionStats;
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+#include "aom_ports/aom_timer.h"
+// Adjust the following to add new components.
+enum {
+  encode_frame_to_data_rate_time,
+  encode_with_recode_loop_time,
+  loop_filter_time,
+  cdef_time,
+  loop_restoration_time,
+  av1_pack_bitstream_final_time,
+  av1_encode_frame_time,
+  av1_compute_global_motion_time,
+  av1_setup_motion_field_time,
+  encode_sb_time,
+  first_partition_search_pass_time,
+  rd_pick_partition_time,
+  rd_pick_sb_modes_time,
+  av1_rd_pick_intra_mode_sb_time,
+  av1_rd_pick_inter_mode_sb_time,
+  handle_intra_mode_time,
+  handle_inter_mode_time,
+  do_tx_search_time,
+  handle_newmv_time,
+  compound_type_rd_time,
+  interpolation_filter_search_time,
+  motion_mode_rd_time,
+  kTimingComponents,
+} UENUM1BYTE(TIMING_COMPONENT);
+
+static INLINE char const *get_component_name(int index) {
+  switch (index) {
+    case encode_frame_to_data_rate_time:
+      return "encode_frame_to_data_rate_time";
+    case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
+    case loop_filter_time: return "loop_filter_time";
+    case cdef_time: return "cdef_time";
+    case loop_restoration_time: return "loop_restoration_time";
+    case av1_pack_bitstream_final_time: return "av1_pack_bitstream_final_time";
+    case av1_encode_frame_time: return "av1_encode_frame_time";
+    case av1_compute_global_motion_time:
+      return "av1_compute_global_motion_time";
+    case av1_setup_motion_field_time: return "av1_setup_motion_field_time";
+    case encode_sb_time: return "encode_sb_time";
+    case first_partition_search_pass_time:
+      return "first_partition_search_pass_time";
+    case rd_pick_partition_time: return "rd_pick_partition_time";
+    case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
+    case av1_rd_pick_intra_mode_sb_time:
+      return "av1_rd_pick_intra_mode_sb_time";
+    case av1_rd_pick_inter_mode_sb_time:
+      return "av1_rd_pick_inter_mode_sb_time";
+    case handle_intra_mode_time: return "handle_intra_mode_time";
+    case handle_inter_mode_time: return "handle_inter_mode_time";
+    case do_tx_search_time: return "do_tx_search_time";
+    case handle_newmv_time: return "handle_newmv_time";
+    case compound_type_rd_time: return "compound_type_rd_time";
+    case interpolation_filter_search_time:
+      return "interpolation_filter_search_time";
+    case motion_mode_rd_time: return "motion_mode_rd_time";
+    default: assert(0);
+  }
+  return "error";
+}
+#endif
+
+// The maximum number of internal ARFs except ALTREF_FRAME
+#define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
 
 typedef struct AV1_COMP {
   QUANTS quants;
@@ -597,7 +724,6 @@ typedef struct AV1_COMP {
   struct lookahead_entry *alt_ref_source;
   int no_show_kf;
 
-  int optimize_speed_feature;
   int optimize_seg_arr[MAX_SEGMENTS];
 
   YV12_BUFFER_CONFIG *source;
@@ -612,37 +738,20 @@ typedef struct AV1_COMP {
 
   // For a still frame, this flag is set to 1 to skip partition search.
   int partition_search_skippable_frame;
+  // The following item corresponds to two_pass_partition_search speed features.
+  int two_pass_partition_search;
+
   double csm_rate_array[32];
   double m_rate_array[32];
   int rate_size;
   int rate_index;
   hash_table *previous_hash_table;
   int previous_index;
-  int cur_poc;  // DebugInfo
 
   unsigned int row_mt;
-  int scaled_ref_idx[INTER_REFS_PER_FRAME];
-
-  // For encoder, we have a two-level mapping from reference frame type to the
-  // corresponding buffer in the buffer pool:
-  // * 'remapped_ref_idx[i - 1]' maps reference type ‘i’ (range: LAST_FRAME ...
-  // EXTREF_FRAME) to a remapped index ‘j’ (in range: 0 ... REF_FRAMES - 1)
-  // * Later, 'cm->ref_frame_map[j]' maps the remapped index ‘j’ to actual index
-  //   of the buffer in the buffer pool ‘cm->buffer_pool.frame_bufs’.
-  //
-  // LAST_FRAME,                        ...,      EXTREF_FRAME
-  //      |                                           |
-  //      v                                           v
-  // remapped_ref_idx[LAST_FRAME - 1],  ...,  remapped_ref_idx[EXTREF_FRAME - 1]
-  //      |                                           |
-  //      v                                           v
-  // ref_frame_map[],                   ...,     ref_frame_map[]
-  //
-  // Note: INTRA_FRAME always refers to the current frame, so there's no need to
-  // have a remapped index for the same.
-  int remapped_ref_idx[REF_FRAMES];
+  RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME];
 
-  int last_show_frame_buf_idx;  // last show frame buffer index
+  RefCntBuffer *last_show_frame_buf;  // last show frame buffer
 
   // refresh_*_frame are boolean flags. If 'refresh_xyz_frame' is true, then
   // after the current frame is encoded, the XYZ reference frame gets refreshed
@@ -661,14 +770,11 @@ typedef struct AV1_COMP {
   int refresh_alt2_ref_frame;
   int refresh_alt_ref_frame;
 
-#if USE_SYMM_MULTI_LAYER
-  // When true, a new rule for backward (future) reference frames is in effect:
-  // - BWDREF_FRAME is always the closest future frame available
-  // - ALTREF2_FRAME is always the 2nd closest future frame available
-  // - 'refresh_bwd_ref_frame' flag is used for updating both the BWDREF_FRAME
-  // and ALTREF2_FRAME. ('refresh_alt2_ref_frame' flag is irrelevant).
-  int new_bwdref_update_rule;
-#endif
+  // For each type of reference frame, this contains the index of a reference
+  // frame buffer for a reference frame of the same type.  We use this to
+  // choose our primary reference frame (which is the most recent reference
+  // frame of the same type as the current frame).
+  int fb_of_context_type[REF_FRAMES];
 
   int ext_refresh_frame_flags_pending;
   int ext_refresh_last_frame;
@@ -707,12 +813,6 @@ typedef struct AV1_COMP {
   RATE_CONTROL rc;
   double framerate;
 
-  // Relevant for an inter frame.
-  // - Index '0' corresponds to the values for the currently coded frame.
-  // - Indices LAST_FRAME ... EXTREF_FRAMES are used to store values for all the
-  // possible inter reference frames.
-  int interp_filter_selected[REF_FRAMES + 1][SWITCHABLE];
-
   struct aom_codec_pkt_list *output_pkt_list;
 
   MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
@@ -721,12 +821,14 @@ typedef struct AV1_COMP {
   int ref_frame_flags;
   int ext_ref_frame_flags;
 
+  // speed is passed as a per-frame parameter into the encoder
+  int speed;
+  // sf contains fine-grained config set internally based on speed
   SPEED_FEATURES sf;
 
   unsigned int max_mv_magnitude;
   int mv_step_param;
 
-  int allow_comp_inter_inter;
   int all_one_sided_refs;
 
   uint8_t *segmentation_map;
@@ -737,13 +839,10 @@ typedef struct AV1_COMP {
   fractional_mv_step_fp *find_fractional_mv_step;
   av1_diamond_search_fn_t diamond_search_sad;
   aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+
+#if CONFIG_INTERNAL_STATS
   uint64_t time_receive_data;
   uint64_t time_compress_data;
-  uint64_t time_pick_lpf;
-  uint64_t time_encode_sb_row;
-
-#if CONFIG_FP_MB_STATS
-  int use_fp_mb_stats;
 #endif
 
   TWO_PASS twopass;
@@ -779,6 +878,9 @@ typedef struct AV1_COMP {
   Metrics metrics;
 #endif
   int b_calculate_psnr;
+#if CONFIG_SPEED_STATS
+  unsigned int tx_search_count;
+#endif  // CONFIG_SPEED_STATS
 
   int droppable;
 
@@ -796,23 +898,21 @@ typedef struct AV1_COMP {
   int resize_pending_width;
   int resize_pending_height;
 
-  int frame_flags;
-
-  search_site_config ss_cfg;
+  // ss_cfg[SS_CFG_LOOKAHEAD] : used in following cases
+  //                           -> temporal filtering
+  //                           -> intrabc
+  // ss_cfg[SS_CFG_SRC] : used everywhere except above mentioned cases
+  search_site_config ss_cfg[SS_CFG_TOTAL];
 
   TileDataEnc *tile_data;
   int allocated_tiles;  // Keep track of memory allocated for tiles.
 
   TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
-  unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS];
   TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
 
-  TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
-
   int resize_state;
   int resize_avg_qp;
   int resize_buffer_underflow;
-  int resize_count;
 
   // Sequence parameters have been transmitted already and locked
   // or not. Once locked av1_change_config cannot change the seq
@@ -822,19 +922,24 @@ typedef struct AV1_COMP {
   // VARIANCE_AQ segment map refresh
   int vaq_refresh;
 
+  // VAR_BASED_PARTITION thresholds
+  // 0 - threshold_128x128; 1 - threshold_64x64;
+  // 2 - threshold_32x32; 3 - threshold_16x16;
+  // 4 - vbp_threshold_8x8;
+  int64_t vbp_thresholds[5];
+  int64_t vbp_threshold_minmax;
+  int64_t vbp_threshold_sad;
+  int64_t vbp_threshold_copy;
+  BLOCK_SIZE vbp_bsize_min;
+
   // Multi-threading
   int num_workers;
   AVxWorker *workers;
   struct EncWorkerData *tile_thr_data;
-  int refresh_frame_mask;
   int existing_fb_idx_to_show;
-  int is_arf_filter_off[MAX_EXT_ARFS + 1];
-  int num_extra_arfs;
-  int arf_pos_in_gf[MAX_EXT_ARFS + 1];
-  int arf_pos_for_ovrly[MAX_EXT_ARFS + 1];
+  int is_arf_filter_off[MAX_INTERNAL_ARFS + 1];
   int global_motion_search_done;
-  tran_low_t *tcoeff_buf[MAX_MB_PLANE];
-  int extra_arf_allowed;
+  int internal_altref_allowed;
   // A flag to indicate if intrabc is ever used in current frame.
   int intrabc_used;
   int dv_cost[2][MV_VALS];
@@ -842,10 +947,16 @@ typedef struct AV1_COMP {
   int dv_joint_cost[MV_JOINTS];
   int has_lossless_segment;
 
-  // For frame refs short signaling:
-  //   A mapping of each reference frame from its encoder side value to the
-  //   decoder side value obtained following the short signaling procedure.
-  int ref_conv[REF_FRAMES];
+  // Factors to control gating of compound type selection based on best
+  // approximate rd so far
+  int max_comp_type_rd_threshold_mul;
+  int max_comp_type_rd_threshold_div;
+
+  unsigned int tx_domain_dist_threshold;
+
+  // Factor to control R-D optimization of coeffs based on block
+  // mse.
+  unsigned int coeff_opt_dist_threshold;
 
   AV1LfSync lf_row_sync;
   AV1LrSync lr_row_sync;
@@ -865,8 +976,72 @@ typedef struct AV1_COMP {
 #if CONFIG_MULTITHREAD
   pthread_mutex_t *row_mt_mutex_;
 #endif
+  // Set if screen content is set or relevant tools are enabled
+  int is_screen_content_type;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+  PartitionStats partition_stats;
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  // component_time[] are initialized to zero while encoder starts.
+  uint64_t component_time[kTimingComponents];
+  struct aom_usec_timer component_timer[kTimingComponents];
+  // frame_component_time[] are initialized to zero at beginning of each frame.
+  uint64_t frame_component_time[kTimingComponents];
+#endif
+
+  // The following data are for AV1 bitstream levels.
+  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+  int keep_level_stats;
+  AV1LevelInfo level_info[MAX_NUM_OPERATING_POINTS];
+  // Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
+  int frame_header_count;
+  FrameWindowBuffer frame_window_buffer;
 } AV1_COMP;
 
+typedef struct {
+  YV12_BUFFER_CONFIG *source;
+  YV12_BUFFER_CONFIG *last_source;
+  int64_t ts_duration;
+} EncodeFrameInput;
+
+// EncodeFrameParams contains per-frame encoding parameters decided upon by
+// av1_encode_strategy() and passed down to av1_encode()
+struct EncodeFrameParams {
+  int error_resilient_mode;
+  FRAME_TYPE frame_type;
+  int primary_ref_frame;
+  int order_offset;
+  int show_frame;
+  int refresh_frame_flags;
+
+  int show_existing_frame;
+  int existing_fb_idx_to_show;
+
+  // Bitmask of which reference buffers may be referenced by this frame
+  int ref_frame_flags;
+
+  // Reference buffer assignment for this frame.
+  int remapped_ref_idx[REF_FRAMES];
+
+  // Flags which determine which reference buffers are refreshed by this frame
+  int refresh_last_frame;
+  int refresh_golden_frame;
+  int refresh_bwd_ref_frame;
+  int refresh_alt2_ref_frame;
+  int refresh_alt_ref_frame;
+
+  // Speed level to use for this frame: Bigger number means faster.
+  int speed;
+};
+typedef struct EncodeFrameParams EncodeFrameParams;
+
+// EncodeFrameResults contains information about the result of encoding a
+// single frame
+typedef struct {
+  size_t size;  // Size of resulting bitstream
+} EncodeFrameResults;
+
 // Must not be called more than once.
 void av1_initialize_enc(void);
 
@@ -887,6 +1062,11 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
                             int64_t *time_end, int flush,
                             const aom_rational_t *timebase);
 
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+               const EncodeFrameInput *const frame_input,
+               const EncodeFrameParams *const frame_params,
+               EncodeFrameResults *const frame_results);
+
 int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
 
 int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
@@ -897,12 +1077,12 @@ aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
 
 int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags);
 
-void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags);
-
 int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
 
 int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
 
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
+
 int av1_update_entropy(AV1_COMP *cpi, int update);
 
 int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
@@ -916,26 +1096,23 @@ int av1_get_quantizer(struct AV1_COMP *cpi);
 
 int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
 
-int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n);
-int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n);
+// av1 uses 10,000,000 ticks/second as time stamp
+#define TICKS_PER_SEC 10000000LL
 
-static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
-  return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
-         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+static INLINE int64_t timebase_units_to_ticks(const aom_rational_t *timebase,
+                                              int64_t n) {
+  return n * TICKS_PER_SEC * timebase->num / timebase->den;
 }
 
-static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi,
-                                        MV_REFERENCE_FRAME ref_frame) {
-  return (ref_frame >= LAST_FRAME)
-             ? cpi->remapped_ref_idx[ref_frame - LAST_FRAME]
-             : INVALID_IDX;
+static INLINE int64_t ticks_to_timebase_units(const aom_rational_t *timebase,
+                                              int64_t n) {
+  const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
+  return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
 }
 
-static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi,
-                                        MV_REFERENCE_FRAME ref_frame) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
-  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+  return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
+         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
 }
 
 // TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
@@ -944,33 +1121,37 @@ static INLINE int av1_use_hash_me(const AV1_COMMON *const cm) {
 }
 
 static INLINE hash_table *av1_get_ref_frame_hash_map(
-    const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
-  return buf_idx != INVALID_IDX
-             ? &cm->buffer_pool->frame_bufs[buf_idx].hash_table
-             : NULL;
+    const AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame) {
+  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
+  RefCntBuffer *buf =
+      (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
+  return buf ? &buf->hash_table : NULL;
 }
 
-static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
-    const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
-  return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf
-                                : NULL;
+static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf(
+    const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) {
+  const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+  return buf != NULL ? &buf->buf : NULL;
 }
 
-static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) {
-  AV1_COMMON *const cm = &cpi->common;
+static INLINE int enc_is_ref_frame_buf(const AV1_COMMON *const cm,
+                                       const RefCntBuffer *const frame_buf) {
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
-    if (buf_idx == INVALID_IDX) continue;
-    if (frame_buf == &cm->buffer_pool->frame_bufs[buf_idx]) break;
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    if (buf == NULL) continue;
+    if (frame_buf == buf) break;
   }
   return (ref_frame <= ALTREF_FRAME);
 }
 
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
+  assert(buf != NULL);
+  ensure_mv_buffer(buf, cm);
+  buf->width = cm->width;
+  buf->height = cm->height;
+}
+
 // Token buffer is only used for palette tokens.
 static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
                                            int sb_size_log2,
@@ -1026,10 +1207,10 @@ static INLINE int is_altref_enabled(const AV1_COMP *const cpi) {
 static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                 MV_REFERENCE_FRAME ref0,
                                 MV_REFERENCE_FRAME ref1) {
-  xd->block_refs[0] =
-      &cm->current_frame.frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0];
-  xd->block_refs[1] =
-      &cm->current_frame.frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME : 0];
+  xd->block_ref_scale_factors[0] =
+      get_ref_scale_factors_const(cm, ref0 >= LAST_FRAME ? ref0 : 1);
+  xd->block_ref_scale_factors[1] =
+      get_ref_scale_factors_const(cm, ref1 >= LAST_FRAME ? ref1 : 1);
 }
 
 static INLINE int get_chessboard_index(int frame_index) {
@@ -1042,6 +1223,8 @@ static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
 
 void av1_new_framerate(AV1_COMP *cpi, double framerate);
 
+void av1_setup_frame_size(AV1_COMP *cpi);
+
 #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
 
 // Returns 1 if a frame is scaled and 0 otherwise.
@@ -1062,6 +1245,48 @@ static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
                                      cm->current_frame.frame_type == KEY_FRAME);
 }
 
+// Lighter version of set_offsets that only sets the mode info
+// pointers.
+static INLINE void set_mode_info_offsets(const AV1_COMP *const cpi,
+                                         MACROBLOCK *const x,
+                                         MACROBLOCKD *const xd, int mi_row,
+                                         int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int idx_str = xd->mi_stride * mi_row + mi_col;
+  xd->mi = cm->mi_grid_visible + idx_str;
+  xd->mi[0] = cm->mi + idx_str;
+  x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+}
+
+// Check to see if the given partition size is allowed for a specified number
+// of mi block rows and columns remaining in the image.
+// If not then return the largest allowed partition size
+static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+                                             int cols_left, int *bh, int *bw) {
+  int int_size = (int)bsize;
+  if (rows_left <= 0 || cols_left <= 0) {
+    return AOMMIN(bsize, BLOCK_8X8);
+  } else {
+    for (; int_size > 0; int_size -= 3) {
+      *bh = mi_size_high[int_size];
+      *bw = mi_size_wide[int_size];
+      if ((*bh <= rows_left) && (*bw <= cols_left)) {
+        break;
+      }
+    }
+  }
+  return (BLOCK_SIZE)int_size;
+}
+
+static const uint8_t av1_ref_frame_flag_list[REF_FRAMES] = { 0,
+                                                             AOM_LAST_FLAG,
+                                                             AOM_LAST2_FLAG,
+                                                             AOM_LAST3_FLAG,
+                                                             AOM_GOLD_FLAG,
+                                                             AOM_BWD_FLAG,
+                                                             AOM_ALT2_FLAG,
+                                                             AOM_ALT_FLAG };
+
 // Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
 // failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
 // function, the memory must be freed by the caller. Both the buf member of the
@@ -1073,6 +1298,80 @@ static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
 // field.
 aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi);
 
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+static INLINE void av1_print_partition_stats(PartitionStats *part_stats) {
+  FILE *f = fopen("partition_stats.csv", "w");
+  if (!f) {
+    return;
+  }
+
+  fprintf(f, "bsize,redo,");
+  for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+    fprintf(f, "decision_%d,", part);
+  }
+  for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+    fprintf(f, "attempt_%d,", part);
+  }
+  for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+    fprintf(f, "time_%d,", part);
+  }
+  fprintf(f, "\n");
+
+  const int bsizes[6] = { 128, 64, 32, 16, 8, 4 };
+
+  for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) {
+    fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo);
+    for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+      fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]);
+    }
+    for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+      fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]);
+    }
+    for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+      fprintf(f, "%ld,", part_stats->partition_times[bsize_idx][part]);
+    }
+    fprintf(f, "\n");
+  }
+  fclose(f);
+}
+
+static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) {
+  assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 ||
+         bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 ||
+         bsize == BLOCK_4X4);
+  switch (bsize) {
+    case BLOCK_128X128: return 0;
+    case BLOCK_64X64: return 1;
+    case BLOCK_32X32: return 2;
+    case BLOCK_16X16: return 3;
+    case BLOCK_8X8: return 4;
+    case BLOCK_4X4: return 5;
+    default: assert(0 && "Invalid bsize for partition_stats."); return -1;
+  }
+}
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+static INLINE void start_timing(AV1_COMP *cpi, int component) {
+  aom_usec_timer_start(&cpi->component_timer[component]);
+}
+static INLINE void end_timing(AV1_COMP *cpi, int component) {
+  aom_usec_timer_mark(&cpi->component_timer[component]);
+  cpi->frame_component_time[component] +=
+      aom_usec_timer_elapsed(&cpi->component_timer[component]);
+}
+static INLINE char const *get_frame_type_enum(int type) {
+  switch (type) {
+    case 0: return "KEY_FRAME";
+    case 1: return "INTER_FRAME";
+    case 2: return "INTRA_ONLY_FRAME";
+    case 3: return "S_FRAME";
+    default: assert(0);
+  }
+  return "error";
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/encodetxb.c b/libaom/av1/encoder/encodetxb.c
index a0c6ec1..37f4bb9 100644
--- a/libaom/av1/encoder/encodetxb.c
+++ b/libaom/av1/encoder/encodetxb.c
@@ -76,21 +76,12 @@ void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); }
 void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
                           int mi_row, int mi_col) {
   const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
   int mib_size_log2 = cm->seq_params.mib_size_log2;
   int stride = (cm->mi_cols >> mib_size_log2) + 1;
   int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
-  CB_COEFF_BUFFER *coeff_buf = &cpi->coeff_buffer_base[offset];
-  const int txb_offset = x->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+  x->mbmi_ext->cb_coef_buff = &cpi->coeff_buffer_base[offset];
+  x->mbmi_ext->cb_offset = x->cb_offset;
   assert(x->cb_offset < (1 << num_pels_log2_lookup[cm->seq_params.sb_size]));
-  for (int plane = 0; plane < num_planes; ++plane) {
-    x->mbmi_ext->tcoeff[plane] = coeff_buf->tcoeff[plane] + x->cb_offset;
-    x->mbmi_ext->eobs[plane] = coeff_buf->eobs[plane] + txb_offset;
-    x->mbmi_ext->txb_skip_ctx[plane] =
-        coeff_buf->txb_skip_ctx[plane] + txb_offset;
-    x->mbmi_ext->dc_sign_ctx[plane] =
-        coeff_buf->dc_sign_ctx[plane] + txb_offset;
-  }
 }
 
 static void write_golomb(aom_writer *w, int level) {
@@ -284,20 +275,16 @@ static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
   return av1_cost_literal(1);
 }
 
-static INLINE int get_br_cost(tran_low_t abs_qc, int ctx,
-                              const int *coeff_lps) {
-  const tran_low_t min_level = 1 + NUM_BASE_LEVELS;
-  const tran_low_t max_level = 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE;
-  (void)ctx;
-  if (abs_qc >= min_level) {
-    if (abs_qc >= max_level) {
-      return coeff_lps[COEFF_BASE_RANGE];  // COEFF_BASE_RANGE * cost0;
-    } else {
-      return coeff_lps[(abs_qc - min_level)];  //  * cost0 + cost1;
-    }
-  }
-  return 0;
-}
+static const int golomb_bits_cost[32] = {
+  0,       512,     512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5,
+  512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7,
+  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9,
+  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9
+};
+static const int golomb_cost_diff[32] = {
+  0,       512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0,
+  512 * 2, 0,   0,       0, 0,       0, 0, 0, 0,       0, 0, 0, 0, 0, 0, 0
+};
 
 static INLINE int get_golomb_cost(int abs_qc) {
   if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
@@ -308,6 +295,32 @@ static INLINE int get_golomb_cost(int abs_qc) {
   return 0;
 }
 
+static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps,
+                                        int *diff) {
+  const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+  int golomb_bits = 0;
+  if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS)
+    *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1];
+
+  if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) {
+    int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+    if (r < 32) {
+      golomb_bits = golomb_bits_cost[r];
+      *diff += golomb_cost_diff[r];
+    } else {
+      golomb_bits = get_golomb_cost(level);
+      *diff += (r & (r - 1)) == 0 ? 1024 : 0;
+    }
+  }
+
+  return coeff_lps[base_range] + golomb_bits;
+}
+
+static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) {
+  const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+  return coeff_lps[base_range] + get_golomb_cost(level);
+}
+
 static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
                           const int is_eob, const TxbInfo *const txb_info,
                           const LV_MAP_COEFF_COST *const txb_costs,
@@ -331,8 +344,7 @@ static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
     if (abs_qc > NUM_BASE_LEVELS) {
       const int ctx =
           get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class);
-      cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]);
-      cost += get_golomb_cost(abs_qc);
+      cost += get_br_cost(abs_qc, txb_costs->lps_cost[ctx]);
     }
   }
   return cost;
@@ -464,8 +476,6 @@ void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
   const int stride = width + TX_PAD_HOR;
   uint8_t *ls = levels;
 
-  memset(levels - TX_PAD_TOP * stride, 0,
-         sizeof(*levels) * TX_PAD_TOP * stride);
   memset(levels + stride * height, 0,
          sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
 
@@ -554,14 +564,15 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
       break;
   }
 
-  if (k_eob_offset_bits[eob_pt] > 0) {
+  const int eob_offset_bits = k_eob_offset_bits[eob_pt];
+  if (eob_offset_bits > 0) {
     const int eob_ctx = eob_pt - 3;
-    int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+    int eob_shift = eob_offset_bits - 1;
     int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
     aom_write_symbol(w, bit,
                      ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2);
-    for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) {
-      eob_shift = k_eob_offset_bits[eob_pt] - 1 - i;
+    for (int i = 1; i < eob_offset_bits; i++) {
+      eob_shift = eob_offset_bits - 1 - i;
       bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
       aom_write_bit(w, bit);
     }
@@ -588,12 +599,11 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
       // level is above 1.
       const int base_range = level - 1 - NUM_BASE_LEVELS;
       const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+      aom_cdf_prob *cdf =
+          ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
       for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
         const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
-        aom_write_symbol(
-            w, k,
-            ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx],
-            BR_CDF_SIZE);
+        aom_write_symbol(w, k, cdf, BR_CDF_SIZE);
         if (k < BR_CDF_SIZE - 1) break;
       }
     }
@@ -628,10 +638,18 @@ static void write_coeffs_txb_wrap(const AV1_COMMON *cm, MACROBLOCK *x,
                                   aom_writer *w, int plane, int block,
                                   int blk_row, int blk_col, TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
-  tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
-  uint16_t eob = x->mbmi_ext->eobs[plane][block];
-  TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
-                      x->mbmi_ext->dc_sign_ctx[plane][block] };
+  const int txb_offset =
+      x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+  tran_low_t *tcoeff_txb =
+      x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset;
+  uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset;
+  uint8_t *txb_skip_ctx_txb =
+      x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset;
+  int *dc_sign_ctx_txb =
+      x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset;
+  tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block);
+  uint16_t eob = eob_txb[block];
+  TXB_CTX txb_ctx = { txb_skip_ctx_txb[block], dc_sign_ctx_txb[block] };
   av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, eob,
                        &txb_ctx);
 }
@@ -745,7 +763,8 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
 
   av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
 
-  const int(*lps_cost)[COEFF_BASE_RANGE + 1] = coeff_costs->lps_cost;
+  const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] =
+      coeff_costs->lps_cost;
   int c = eob - 1;
   {
     const int pos = scan[c];
@@ -758,11 +777,8 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
     if (v) {
       // sign bit cost
       if (level > NUM_BASE_LEVELS) {
-        const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
-        const int base_range =
-            AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
-        cost += lps_cost[ctx][base_range];
-        cost += get_golomb_cost(level);
+        const int ctx = get_br_ctx_eob(pos, bwl, tx_class);
+        cost += get_br_cost(level, lps_cost[ctx]);
       }
       if (c) {
         cost += av1_cost_literal(1);
@@ -774,7 +790,7 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
       }
     }
   }
-  const int(*base_cost)[4] = coeff_costs->base_cost;
+  const int(*base_cost)[8] = coeff_costs->base_cost;
   for (c = eob - 2; c >= 1; --c) {
     const int pos = scan[c];
     const int coeff_ctx = coeff_contexts[pos];
@@ -786,10 +802,7 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
       cost += av1_cost_literal(1);
       if (level > NUM_BASE_LEVELS) {
         const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
-        const int base_range =
-            AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
-        cost += lps_cost[ctx][base_range];
-        cost += get_golomb_cost(level);
+        cost += get_br_cost(level, lps_cost[ctx]);
       }
     }
     cost += cost0;
@@ -809,10 +822,7 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
       cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
       if (level > NUM_BASE_LEVELS) {
         const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
-        const int base_range =
-            AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
-        cost += lps_cost[ctx][base_range];
-        cost += get_golomb_cost(level);
+        cost += get_br_cost(level, lps_cost[ctx]);
       }
     }
   }
@@ -1284,20 +1294,47 @@ static int hbt_create_hashes(TxbInfo *txb_info,
                           txb_eob_costs, p, block, fast_mode, rate_cost);
 }
 
-static AOM_FORCE_INLINE int get_coeff_cost_simple(
+static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
     int ci, tran_low_t abs_qc, int coeff_ctx,
     const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class,
-    const uint8_t *levels) {
+    const uint8_t *levels, int *cost_low) {
   // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
   // and not the last (scan_idx != eob - 1)
   assert(ci > 0);
   int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+  int diff = 0;
+  if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4];
   if (abs_qc) {
     cost += av1_cost_literal(1);
     if (abs_qc > NUM_BASE_LEVELS) {
       const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
-      cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
-      cost += get_golomb_cost(abs_qc);
+      int brcost_diff = 0;
+      cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx],
+                                    &brcost_diff);
+      diff += brcost_diff;
+    }
+  }
+  *cost_low = cost - diff;
+
+  return cost;
+}
+
+static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign,
+                                     int coeff_ctx, int dc_sign_ctx,
+                                     const LV_MAP_COEFF_COST *txb_costs,
+                                     int bwl, TX_CLASS tx_class) {
+  int cost = 0;
+  cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+  if (abs_qc != 0) {
+    if (ci == 0) {
+      cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+    } else {
+      cost += av1_cost_literal(1);
+    }
+    if (abs_qc > NUM_BASE_LEVELS) {
+      int br_ctx;
+      br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
+      cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
     }
   }
   return cost;
@@ -1322,9 +1359,12 @@ static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
       cost += av1_cost_literal(1);
     }
     if (abs_qc > NUM_BASE_LEVELS) {
-      const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
-      cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
-      cost += get_golomb_cost(abs_qc);
+      int br_ctx;
+      if (is_last)
+        br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
+      else
+        br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
+      cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
     }
   }
   return cost;
@@ -1368,13 +1408,23 @@ static INLINE void update_coeff_general(
     const int64_t rd = RDCOST(rdmult, rate, dist);
 
     tran_low_t qc_low, dqc_low;
-    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
-    const tran_low_t abs_qc_low = abs_qc - 1;
-    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
-    const int rate_low =
-        get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
-                               dc_sign_ctx, txb_costs, bwl, tx_class, levels);
-    const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+    tran_low_t abs_qc_low;
+    int64_t dist_low, rd_low;
+    int rate_low;
+    if (abs_qc == 1) {
+      abs_qc_low = qc_low = dqc_low = 0;
+      dist_low = dist0;
+      rate_low = txb_costs->base_cost[coeff_ctx][0];
+    } else {
+      get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+      abs_qc_low = abs_qc - 1;
+      dist_low = get_coeff_dist(tqc, dqc_low, shift);
+      rate_low =
+          get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
+                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+    }
+
+    rd_low = RDCOST(rdmult, rate_low, dist_low);
     if (rd_low < rd) {
       qcoeff[ci] = qc_low;
       dqcoeff[ci] = dqc_low;
@@ -1408,28 +1458,28 @@ static AOM_FORCE_INLINE void update_coeff_simple(
     *accu_rate += txb_costs->base_cost[coeff_ctx][0];
   } else {
     const tran_low_t abs_qc = abs(qc);
-    const tran_low_t tqc = tcoeff[ci];
-    const tran_low_t dqc = dqcoeff[ci];
-    const int rate = get_coeff_cost_simple(ci, abs_qc, coeff_ctx, txb_costs,
-                                           bwl, tx_class, levels);
-    if (abs(dqc) < abs(tqc)) {
+    const tran_low_t abs_tqc = abs(tcoeff[ci]);
+    const tran_low_t abs_dqc = abs(dqcoeff[ci]);
+    int rate_low = 0;
+    const int rate = get_two_coeff_cost_simple(
+        ci, abs_qc, coeff_ctx, txb_costs, bwl, tx_class, levels, &rate_low);
+    if (abs_dqc < abs_tqc) {
       *accu_rate += rate;
       return;
     }
-    const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+
+    const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift);
     const int64_t rd = RDCOST(rdmult, rate, dist);
 
-    const int sign = (qc < 0) ? 1 : 0;
-    tran_low_t qc_low, dqc_low;
-    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
     const tran_low_t abs_qc_low = abs_qc - 1;
-    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
-    const int rate_low = get_coeff_cost_simple(
-        ci, abs_qc_low, coeff_ctx, txb_costs, bwl, tx_class, levels);
+    const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+    const int64_t dist_low = get_coeff_dist(abs_tqc, abs_dqc_low, shift);
     const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+
     if (rd_low < rd) {
-      qcoeff[ci] = qc_low;
-      dqcoeff[ci] = dqc_low;
+      const int sign = (qc < 0) ? 1 : 0;
+      qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
+      dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
       levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
       *accu_rate += rate_low;
     } else {
@@ -1438,6 +1488,36 @@ static AOM_FORCE_INLINE void update_coeff_simple(
   }
 }
 
+static INLINE void update_coeff_eob_fast(int *eob, int shift,
+                                         const int16_t *dequant_ptr,
+                                         const int16_t *scan,
+                                         const tran_low_t *coeff_ptr,
+                                         tran_low_t *qcoeff_ptr,
+                                         tran_low_t *dqcoeff_ptr) {
+  // TODO(sarahparker) make this work for aomqm
+  int eob_out = *eob;
+  int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),
+                  dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };
+
+  for (int i = *eob - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+    if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {
+      eob_out--;
+      qcoeff_ptr[rc] = 0;
+      dqcoeff_ptr[rc] = 0;
+    } else {
+      break;
+    }
+  }
+
+  *eob = eob_out;
+}
+
 static AOM_FORCE_INLINE void update_coeff_eob(
     int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
     int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height,
@@ -1467,40 +1547,42 @@ static AOM_FORCE_INLINE void update_coeff_eob(
     int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
 
     tran_low_t qc_low, dqc_low;
-    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
-    const tran_low_t abs_qc_low = abs_qc - 1;
-    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
-    const int rate_low =
-        get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx,
-                               txb_costs, bwl, tx_class, levels);
-    const int64_t rd_low =
-        RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+    tran_low_t abs_qc_low;
+    int64_t dist_low, rd_low;
+    int rate_low;
+    if (abs_qc == 1) {
+      abs_qc_low = 0;
+      dqc_low = qc_low = 0;
+      dist_low = 0;
+      rate_low = txb_costs->base_cost[coeff_ctx][0];
+      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
+    } else {
+      get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+      abs_qc_low = abs_qc - 1;
+      dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
+      rate_low =
+          get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
+                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+    }
 
     int lower_level_new_eob = 0;
     const int new_eob = si + 1;
-    uint8_t tmp_levels[3];
-    for (int ni = 0; ni < *nz_num; ++ni) {
-      const int last_ci = nz_ci[ni];
-      tmp_levels[ni] = levels[get_padded_idx(last_ci, bwl)];
-      levels[get_padded_idx(last_ci, bwl)] = 0;
-    }
-
-    const int coeff_ctx_new_eob = get_lower_levels_ctx_general(
-        1, si, bwl, height, levels, ci, tx_size, tx_class);
+    const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bwl, height, si);
     const int new_eob_cost =
         get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
     int rate_coeff_eob =
-        new_eob_cost + get_coeff_cost_general(1, ci, abs_qc, sign,
-                                              coeff_ctx_new_eob, dc_sign_ctx,
-                                              txb_costs, bwl, tx_class, levels);
+        new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob,
+                                          dc_sign_ctx, txb_costs, bwl,
+                                          tx_class);
     int64_t dist_new_eob = dist;
     int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);
 
     if (abs_qc_low > 0) {
       const int rate_coeff_eob_low =
-          new_eob_cost +
-          get_coeff_cost_general(1, ci, abs_qc_low, sign, coeff_ctx_new_eob,
-                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+          new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign,
+                                            coeff_ctx_new_eob, dc_sign_ctx,
+                                            txb_costs, bwl, tx_class);
       const int64_t dist_new_eob_low = dist_low;
       const int64_t rd_new_eob_low =
           RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
@@ -1522,7 +1604,7 @@ static AOM_FORCE_INLINE void update_coeff_eob(
     if (sharpness == 0 && rd_new_eob < rd) {
       for (int ni = 0; ni < *nz_num; ++ni) {
         int last_ci = nz_ci[ni];
-        // levels[get_padded_idx(last_ci, bwl)] = 0;
+        levels[get_padded_idx(last_ci, bwl)] = 0;
         qcoeff[last_ci] = 0;
         dqcoeff[last_ci] = 0;
       }
@@ -1532,10 +1614,6 @@ static AOM_FORCE_INLINE void update_coeff_eob(
       *accu_dist = dist_new_eob;
       lower_level = lower_level_new_eob;
     } else {
-      for (int ni = 0; ni < *nz_num; ++ni) {
-        const int last_ci = nz_ci[ni];
-        levels[get_padded_idx(last_ci, bwl)] = tmp_levels[ni];
-      }
       *accu_rate += rate;
       *accu_dist += dist;
     }
@@ -1575,35 +1653,44 @@ static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
 int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                          int block, TX_SIZE tx_size, TX_TYPE tx_type,
                          const TXB_CTX *const txb_ctx, int *rate_cost,
-                         int sharpness) {
-  const AV1_COMMON *cm = &cpi->common;
+                         int sharpness, int fast_mode) {
   MACROBLOCKD *xd = &x->e_mbd;
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const TX_CLASS tx_class = tx_type_to_class[tx_type];
-  const MB_MODE_INFO *mbmi = xd->mi[0];
-  const struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
+  const struct macroblock_plane *p = &x->plane[plane];
+  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  const int shift = av1_get_tx_scale(tx_size);
+  int eob = p->eobs[block];
+  const int16_t *dequant = p->dequant_QTX;
   tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
-  const int16_t *dequant = p->dequant_QTX;
+
+  if (fast_mode) {
+    update_coeff_eob_fast(&eob, shift, dequant, scan, tcoeff, qcoeff, dqcoeff);
+    p->eobs[block] = eob;
+    if (eob == 0) {
+      *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size);
+      return eob;
+    }
+  }
+
+  const AV1_COMMON *cm = &cpi->common;
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+  const MB_MODE_INFO *mbmi = xd->mi[0];
   const int bwl = get_txb_bwl(tx_size);
   const int width = get_txb_wide(tx_size);
   const int height = get_txb_high(tx_size);
   assert(width == (1 << bwl));
   const int is_inter = is_inter_block(mbmi);
-  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
-  const int16_t *scan = scan_order->scan;
   const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
   const int eob_multi_size = txsize_log2_minus4[tx_size];
   const LV_MAP_EOB_COST *txb_eob_costs =
       &x->eob_costs[eob_multi_size][plane_type];
 
-  const int shift = av1_get_tx_scale(tx_size);
-  const int64_t rdmult =
-      ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
-       2) >>
+  const int rshift =
       (sharpness +
        (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
             ? 7 - mbmi->segment_id
@@ -1612,17 +1699,21 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                 cpi->oxcf.deltaq_mode > NO_DELTA_Q && x->sb_energy_level < 0
             ? (3 - x->sb_energy_level)
             : 0));
+  const int64_t rdmult =
+      (((int64_t)x->rdmult *
+        (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
+       2) >>
+      rshift;
 
   uint8_t levels_buf[TX_PAD_2D];
   uint8_t *const levels = set_levels(levels_buf, width);
 
-  av1_txb_init_levels(qcoeff, width, height, levels);
+  if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels);
 
   // TODO(angirbird): check iqmatrix
 
   const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
   const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
-  int eob = p->eobs[block];
   const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
   int accu_rate = eob_cost;
   int64_t accu_dist = 0;
@@ -1642,11 +1733,10 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
     --si;
   } else {
     assert(abs_qc == 1);
-    const int coeff_ctx = get_lower_levels_ctx_general(
-        1, si, bwl, height, levels, ci, tx_size, tx_class);
-    accu_rate += get_coeff_cost_general(1, ci, abs_qc, sign, coeff_ctx,
-                                        txb_ctx->dc_sign_ctx, txb_costs, bwl,
-                                        tx_class, levels);
+    const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, si);
+    accu_rate +=
+        get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx,
+                           txb_costs, bwl, tx_class);
     const tran_low_t tqc = tcoeff[ci];
     const tran_low_t dqc = dqcoeff[ci];
     const int64_t dist = get_coeff_dist(tqc, dqc, shift);
@@ -1657,7 +1747,7 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
 
 #define UPDATE_COEFF_EOB_CASE(tx_class_literal)                            \
   case tx_class_literal:                                                   \
-    for (; si >= 0 && nz_num <= max_nz_num; --si) {                        \
+    for (; si >= 0 && nz_num <= max_nz_num && !fast_mode; --si) {          \
       update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si,   \
                        tx_size, tx_class_literal, bwl, height,             \
                        txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
@@ -1750,7 +1840,8 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
 
   const int shift = av1_get_tx_scale(tx_size);
   const int64_t rdmult =
-      ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
+      (((int64_t)x->rdmult * plane_rd_mult[is_inter][plane_type]
+        << (2 * (xd->bd - 8))) +
        2) >>
       2;
   uint8_t levels_buf[TX_PAD_2D];
@@ -1763,10 +1854,9 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
   assert(width == (1 << bwl));
   const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
   TxbInfo txb_info = {
-    qcoeff,   levels,       dqcoeff,    tcoeff,  dequant, shift,
-    tx_size,  txs_ctx,      tx_type,    bwl,     width,   height,
-    eob,      seg_eob,      scan_order, txb_ctx, rdmult,  &cm->coeff_ctx_table,
-    iqmatrix, tx_type_cost,
+    qcoeff,     levels,  dqcoeff, tcoeff,   dequant,      shift, tx_size,
+    txs_ctx,    tx_type, bwl,     width,    height,       eob,   seg_eob,
+    scan_order, txb_ctx, rdmult,  iqmatrix, tx_type_cost,
   };
 
   // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls
@@ -1918,15 +2008,22 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
                2);
   }
 
-  x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx;
-  x->mbmi_ext->eobs[plane][block] = eob;
+  const int txb_offset =
+      x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+  uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset;
+  uint8_t *txb_skip_ctx_txb =
+      x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset;
+  txb_skip_ctx_txb[block] = txb_ctx.txb_skip_ctx;
+  eob_txb[block] = eob;
 
   if (eob == 0) {
     av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row);
     return;
   }
 
-  tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+  tran_low_t *tcoeff_txb =
+      x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset;
+  tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block);
   const int segment_id = mbmi->segment_id;
   const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
   const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
@@ -2019,7 +2116,9 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
 #endif  // CONFIG_ENTROPY_STATS
     if (allow_update_cdf)
       update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
-    x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
+    int *dc_sign_ctx_txb =
+        x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset;
+    dc_sign_ctx_txb[block] = dc_sign_ctx;
   }
 
   const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob);
diff --git a/libaom/av1/encoder/encodetxb.h b/libaom/av1/encoder/encodetxb.h
index 4ee41ce..0682590 100644
--- a/libaom/av1/encoder/encodetxb.h
+++ b/libaom/av1/encoder/encodetxb.h
@@ -42,7 +42,6 @@ typedef struct TxbInfo {
   const SCAN_ORDER *scan_order;
   TXB_CTX *txb_ctx;
   int64_t rdmult;
-  const LV_MAP_CTX_TABLE *coeff_ctx_table;
   const qm_val_t *iqmatrix;
   int tx_type_cost;
 } TxbInfo;
@@ -79,7 +78,7 @@ void hbt_destroy();
 int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                          int block, TX_SIZE tx_size, TX_TYPE tx_type,
                          const TXB_CTX *const txb_ctx, int *rate_cost,
-                         int sharpness);
+                         int sharpness, int fast_mode);
 
 // These numbers are empirically obtained.
 static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
diff --git a/libaom/av1/encoder/ethread.c b/libaom/av1/encoder/ethread.c
index a3fb93e..c8c2107 100644
--- a/libaom/av1/encoder/ethread.c
+++ b/libaom/av1/encoder/ethread.c
@@ -164,10 +164,7 @@ void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, AV1_COMMON *cm,
                   aom_malloc(sizeof(*row_mt_sync->cur_col) * rows));
 
   // Set up nsync.
-  if (cm->seq_params.mib_size_log2 == 4)
-    row_mt_sync->sync_range = 2;
-  else
-    row_mt_sync->sync_range = 1;
+  row_mt_sync->sync_range = 1;
 }
 
 // Deallocate row based multi-threading synchronization related mutex and data
@@ -239,26 +236,34 @@ static void switch_tile_and_get_next_job(AV1_COMP *const cpi, int *cur_tile_id,
       int tile_index = tile_row * tile_cols + tile_col;
       TileDataEnc *this_tile = &cpi->tile_data[tile_index];
       AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info;
-      int num_mis_to_encode =
-          this_tile->tile_info.mi_row_end - row_mt_info->current_mi_row;
-
-      // Tile to be processed by this thread is selected on the basis of
-      // availability of jobs:
-      // 1) If jobs are available, tile to be processed is chosen on the
-      // basis of minimum number of threads working for that tile. If two or
-      // more tiles have same number of threads working for them, then the tile
-      // with maximum number of jobs available will be chosen.
-      // 2) If no jobs are available, then end_of_frame is reached.
-      if (num_mis_to_encode > 0) {
-        int num_threads_working = row_mt_info->num_threads_working;
-        if (num_threads_working < min_num_threads_working) {
-          min_num_threads_working = num_threads_working;
-          max_mis_to_encode = 0;
-        }
-        if (num_threads_working == min_num_threads_working &&
-            num_mis_to_encode > max_mis_to_encode) {
-          tile_id = tile_index;
-          max_mis_to_encode = num_mis_to_encode;
+      int num_sb_rows_in_tile =
+          av1_get_sb_rows_in_tile(cm, this_tile->tile_info);
+      int num_sb_cols_in_tile =
+          av1_get_sb_cols_in_tile(cm, this_tile->tile_info);
+      int theoretical_limit_on_threads =
+          AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
+      int num_threads_working = row_mt_info->num_threads_working;
+      if (num_threads_working < theoretical_limit_on_threads) {
+        int num_mis_to_encode =
+            this_tile->tile_info.mi_row_end - row_mt_info->current_mi_row;
+
+        // Tile to be processed by this thread is selected on the basis of
+        // availability of jobs:
+        // 1) If jobs are available, tile to be processed is chosen on the
+        // basis of minimum number of threads working for that tile. If two or
+        // more tiles have same number of threads working for them, then the
+        // tile with maximum number of jobs available will be chosen.
+        // 2) If no jobs are available, then end_of_frame is reached.
+        if (num_mis_to_encode > 0) {
+          if (num_threads_working < min_num_threads_working) {
+            min_num_threads_working = num_threads_working;
+            max_mis_to_encode = 0;
+          }
+          if (num_threads_working == min_num_threads_working &&
+              num_mis_to_encode > max_mis_to_encode) {
+            tile_id = tile_index;
+            max_mis_to_encode = num_mis_to_encode;
+          }
         }
       }
     }
@@ -313,9 +318,14 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
 
     td->mb.e_mbd.tile_ctx = td->tctx;
     td->mb.tile_pb_ctx = &this_tile->tctx;
-    td->mb.backup_tile_ctx = &this_tile->backup_tctx;
-    if (current_mi_row == this_tile->tile_info.mi_row_start)
+    if (this_tile->allow_update_cdf) {
+      td->mb.row_ctx = this_tile->row_ctx;
+      if (current_mi_row == this_tile->tile_info.mi_row_start)
+        memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+    } else {
       memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+    }
+
     av1_init_above_context(cm, &td->mb.e_mbd, tile_row);
 
     // Disable exhaustive search speed features for row based multi-threading of
@@ -356,10 +366,8 @@ static int enc_worker_hook(void *arg1, void *unused) {
 
     TileDataEnc *const this_tile =
         &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
-    thread_data->td->tctx = &this_tile->tctx;
-    thread_data->td->mb.e_mbd.tile_ctx = thread_data->td->tctx;
-    thread_data->td->mb.tile_pb_ctx = thread_data->td->tctx;
-    thread_data->td->mb.backup_tile_ctx = &this_tile->backup_tctx;
+    thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+    thread_data->td->mb.tile_pb_ctx = &this_tile->tctx;
     av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
   }
 
@@ -386,7 +394,7 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
   }
 #endif
 
-  for (int i = 0; i < num_workers; i++) {
+  for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &cpi->workers[i];
     EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
 
@@ -397,7 +405,7 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
     thread_data->cpi = cpi;
     thread_data->thread_id = i;
 
-    if (i < num_workers - 1) {
+    if (i > 0) {
       // Allocate thread data.
       CHECK_MEM_ERROR(cm, thread_data->td,
                       aom_memalign(32, sizeof(*thread_data->td)));
@@ -421,11 +429,9 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
           (int32_t *)aom_memalign(
               16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
       CHECK_MEM_ERROR(cm, thread_data->td->inter_modes_info,
                       (InterModesInfo *)aom_malloc(
                           sizeof(*thread_data->td->inter_modes_info)));
-#endif
 
       for (int x = 0; x < 2; x++)
         for (int y = 0; y < 2; y++)
@@ -478,14 +484,14 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
 static void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   // Encode a frame
-  for (int i = 0; i < num_workers; i++) {
+  for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &cpi->workers[i];
     EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
 
     // Set the starting tile for each thread.
     thread_data->start = i;
 
-    if (i == cpi->num_workers - 1)
+    if (i == 0)
       winterface->execute(worker);
     else
       winterface->launch(worker);
@@ -497,7 +503,7 @@ static void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
   int had_error = 0;
 
   // Encoding ends.
-  for (int i = 0; i < num_workers; i++) {
+  for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &cpi->workers[i];
     had_error |= !winterface->sync(worker);
   }
@@ -508,22 +514,25 @@ static void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
 }
 
 static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) {
-  for (int i = 0; i < num_workers; i++) {
+  for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &cpi->workers[i];
     EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
     cpi->intrabc_used |= thread_data->td->intrabc_used;
     // Accumulate counters.
-    if (i < cpi->num_workers - 1) {
+    if (i > 0) {
       av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
       accumulate_rd_opt(&cpi->td, thread_data->td);
       cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
+#if CONFIG_SPEED_STATS
+      cpi->td.mb.tx_search_count += thread_data->td->mb.tx_search_count;
+#endif  // CONFIG_SPEED_STATS
     }
   }
 }
 
 static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                                 int num_workers) {
-  for (int i = 0; i < num_workers; i++) {
+  for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &cpi->workers[i];
     EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
 
@@ -541,9 +550,7 @@ static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
       thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
       thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
       thread_data->td->mb.inter_modes_info = thread_data->td->inter_modes_info;
-#endif
       for (int x = 0; x < 2; x++) {
         for (int y = 0; y < 2; y++) {
           memcpy(thread_data->td->hash_value_buffer[x][y],
@@ -560,7 +567,7 @@ static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
       memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
     }
 
-    if (i < num_workers - 1) {
+    if (i > 0) {
       thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
       thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
       for (int j = 0; j < 2; ++j) {
@@ -617,7 +624,7 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
   const int tile_rows = cm->tile_rows;
   MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
   int num_workers = 0;
-  int total_num_sb_rows = 0;
+  int total_num_threads_row_mt = 0;
   int max_sb_rows = 0;
 
   if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
@@ -632,11 +639,19 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
       TileDataEnc *tile_data = &cpi->tile_data[row * cm->tile_cols + col];
       int num_sb_rows_in_tile =
           av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
-      total_num_sb_rows += num_sb_rows_in_tile;
+      int num_sb_cols_in_tile =
+          av1_get_sb_cols_in_tile(cm, tile_data->tile_info);
+      total_num_threads_row_mt +=
+          AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
       max_sb_rows = AOMMAX(max_sb_rows, num_sb_rows_in_tile);
     }
   }
-  num_workers = AOMMIN(cpi->oxcf.max_threads, total_num_sb_rows);
+  // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of
+  // post-processing stages in encoder is quiet low, so limiting the number of
+  // threads to the theoretical limit in row-mt does not have much impact on
+  // post-processing multi-threading stage. Need to revisit this when
+  // post-processing time starts shooting up.
+  num_workers = AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt);
 
   if (multi_thread_ctxt->allocated_tile_cols != tile_cols ||
       multi_thread_ctxt->allocated_tile_rows != tile_rows ||
@@ -659,9 +674,7 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
       this_tile->row_mt_info.current_mi_row = this_tile->tile_info.mi_row_start;
       this_tile->row_mt_info.num_threads_working = 0;
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
       av1_inter_mode_data_init(this_tile);
-#endif
       av1_zero_above_context(cm, &cpi->td.mb.e_mbd,
                              this_tile->tile_info.mi_col_start,
                              this_tile->tile_info.mi_col_end, tile_row);
diff --git a/libaom/av1/encoder/firstpass.c b/libaom/av1/encoder/firstpass.c
index 5117c67..f6a0fb2 100644
--- a/libaom/av1/encoder/firstpass.c
+++ b/libaom/av1/encoder/firstpass.c
@@ -36,6 +36,7 @@
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/mcomp.h"
@@ -43,63 +44,14 @@
 #include "av1/encoder/reconinter_enc.h"
 
 #define OUTPUT_FPF 0
-#define ARF_STATS_OUTPUT 0
 
-#define GROUP_ADAPTIVE_MAXQ 1
-
-#define BOOST_BREAKOUT 12.5
-#define BOOST_FACTOR 12.5
-#define FACTOR_PT_LOW 0.70
-#define FACTOR_PT_HIGH 0.90
 #define FIRST_PASS_Q 10.0
-#define GF_MAX_BOOST 90.0
 #define INTRA_MODE_PENALTY 1024
-#define KF_MIN_FRAME_BOOST 80.0
-#define KF_MAX_FRAME_BOOST 128.0
-#define MIN_ARF_GF_BOOST 240
-#define MIN_DECAY_FACTOR 0.01
-#define MIN_KF_BOOST 300          // Minimum boost for non-static KF interval
-#define MIN_STATIC_KF_BOOST 5400  // Minimum boost for static KF interval
 #define NEW_MV_MODE_PENALTY 32
 #define DARK_THRESH 64
-#define DEFAULT_GRP_WEIGHT 1.0
-#define RC_FACTOR_MIN 0.75
-#define RC_FACTOR_MAX 1.75
-#define MIN_FWD_KF_INTERVAL 8
 
 #define NCOUNT_INTRA_THRESH 8192
 #define NCOUNT_INTRA_FACTOR 3
-#define NCOUNT_FRAME_II_THRESH 5.0
-
-#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
-
-#if ARF_STATS_OUTPUT
-unsigned int arf_count = 0;
-#endif
-
-// Resets the first pass file to the given position using a relative seek from
-// the current position.
-static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
-  p->stats_in = position;
-}
-
-// Read frame stats at an offset from the current position.
-static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
-  if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
-      (offset < 0 && p->stats_in + offset < p->stats_in_start)) {
-    return NULL;
-  }
-
-  return &p->stats_in[offset];
-}
-
-static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
-  if (p->stats_in >= p->stats_in_end) return EOF;
-
-  *fps = *p->stats_in;
-  ++p->stats_in;
-  return 1;
-}
 
 static void output_stats(FIRSTPASS_STATS *stats,
                          struct aom_codec_pkt_list *pktlist) {
@@ -131,18 +83,7 @@ static void output_stats(FIRSTPASS_STATS *stats,
 #endif
 }
 
-#if CONFIG_FP_MB_STATS
-static void output_fpmb_stats(uint8_t *this_frame_mb_stats, int stats_size,
-                              struct aom_codec_pkt_list *pktlist) {
-  struct aom_codec_cx_pkt pkt;
-  pkt.kind = AOM_CODEC_FPMB_STATS_PKT;
-  pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
-  pkt.data.firstpass_mb_stats.sz = stats_size * sizeof(*this_frame_mb_stats);
-  aom_codec_pkt_list_add(pktlist, &pkt);
-}
-#endif
-
-static void zero_stats(FIRSTPASS_STATS *section) {
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section) {
   section->frame = 0.0;
   section->weight = 0.0;
   section->intra_error = 0.0;
@@ -195,98 +136,8 @@ static void accumulate_stats(FIRSTPASS_STATS *section,
   section->duration += frame->duration;
 }
 
-static void subtract_stats(FIRSTPASS_STATS *section,
-                           const FIRSTPASS_STATS *frame) {
-  section->frame -= frame->frame;
-  section->weight -= frame->weight;
-  section->intra_error -= frame->intra_error;
-  section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
-  section->coded_error -= frame->coded_error;
-  section->sr_coded_error -= frame->sr_coded_error;
-  section->pcnt_inter -= frame->pcnt_inter;
-  section->pcnt_motion -= frame->pcnt_motion;
-  section->pcnt_second_ref -= frame->pcnt_second_ref;
-  section->pcnt_neutral -= frame->pcnt_neutral;
-  section->intra_skip_pct -= frame->intra_skip_pct;
-  section->inactive_zone_rows -= frame->inactive_zone_rows;
-  section->inactive_zone_cols -= frame->inactive_zone_cols;
-  section->MVr -= frame->MVr;
-  section->mvr_abs -= frame->mvr_abs;
-  section->MVc -= frame->MVc;
-  section->mvc_abs -= frame->mvc_abs;
-  section->MVrv -= frame->MVrv;
-  section->MVcv -= frame->MVcv;
-  section->mv_in_out_count -= frame->mv_in_out_count;
-  section->new_mv_count -= frame->new_mv_count;
-  section->count -= frame->count;
-  section->duration -= frame->duration;
-}
-
-// Calculate the linear size relative to a baseline of 1080P
-#define BASE_SIZE 2073600.0  // 1920x1080
-static double get_linear_size_factor(const AV1_COMP *cpi) {
-  const double this_area = cpi->initial_width * cpi->initial_height;
-  return pow(this_area / BASE_SIZE, 0.5);
-}
-
-// Calculate an active area of the image that discounts formatting
-// bars and partially discounts other 0 energy areas.
-#define MIN_ACTIVE_AREA 0.5
-#define MAX_ACTIVE_AREA 1.0
-static double calculate_active_area(const AV1_COMP *cpi,
-                                    const FIRSTPASS_STATS *this_frame) {
-  double active_pct;
-
-  active_pct =
-      1.0 -
-      ((this_frame->intra_skip_pct / 2) +
-       ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows));
-  return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
-}
-
-// Calculate a modified Error used in distributing bits between easier and
-// harder frames.
-#define ACT_AREA_CORRECTION 0.5
-static double calculate_modified_err(const AV1_COMP *cpi,
-                                     const TWO_PASS *twopass,
-                                     const AV1EncoderConfig *oxcf,
-                                     const FIRSTPASS_STATS *this_frame) {
-  const FIRSTPASS_STATS *const stats = &twopass->total_stats;
-  const double av_weight = stats->weight / stats->count;
-  const double av_err = (stats->coded_error * av_weight) / stats->count;
-  double modified_error =
-      av_err * pow(this_frame->coded_error * this_frame->weight /
-                       DOUBLE_DIVIDE_CHECK(av_err),
-                   oxcf->two_pass_vbrbias / 100.0);
-
-  // Correction for active area. Frames with a reduced active area
-  // (eg due to formatting bars) have a higher error per mb for the
-  // remaining active MBs. The correction here assumes that coding
-  // 0.5N blocks of complexity 2X is a little easier than coding N
-  // blocks of complexity X.
-  modified_error *=
-      pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
-
-  return fclamp(modified_error, twopass->modified_error_min,
-                twopass->modified_error_max);
-}
-
-// This function returns the maximum target rate per frame.
-static int frame_max_bits(const RATE_CONTROL *rc,
-                          const AV1EncoderConfig *oxcf) {
-  int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
-                      (int64_t)oxcf->two_pass_vbrmax_section) /
-                     100;
-  if (max_bits < 0)
-    max_bits = 0;
-  else if (max_bits > rc->max_frame_bandwidth)
-    max_bits = rc->max_frame_bandwidth;
-
-  return (int)max_bits;
-}
-
 void av1_init_first_pass(AV1_COMP *cpi) {
-  zero_stats(&cpi->twopass.total_stats);
+  av1_twopass_zero_stats(&cpi->twopass.total_stats);
 }
 
 void av1_end_first_pass(AV1_COMP *cpi) {
@@ -380,13 +231,13 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
 
   // Override the default variance function to use MSE.
   v_fn_ptr.vf = get_block_variance_fn(bsize);
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
   }
 
   // Center the initial step/diamond search on best mv.
-  tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
-                                    step_param, x->sadperbit16, &num00,
+  tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg[SS_CFG_SRC], &ref_mv_full,
+                                    &tmp_mv, step_param, x->sadperbit16, &num00,
                                     &v_fn_ptr, ref_mv);
   if (tmp_err < INT_MAX)
     tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
@@ -407,9 +258,9 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
     if (num00) {
       --num00;
     } else {
-      tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
-                                        step_param + n, x->sadperbit16, &num00,
-                                        &v_fn_ptr, ref_mv);
+      tmp_err = cpi->diamond_search_sad(
+          x, &cpi->ss_cfg[SS_CFG_SRC], &ref_mv_full, &tmp_mv, step_param + n,
+          x->sadperbit16, &num00, &v_fn_ptr, ref_mv);
       if (tmp_err < INT_MAX)
         tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
       if (tmp_err < INT_MAX - new_mv_mode_penalty)
@@ -439,26 +290,7 @@ static BLOCK_SIZE get_bsize(const AV1_COMMON *cm, int mb_row, int mb_col) {
 }
 
 static int find_fp_qindex(aom_bit_depth_t bit_depth) {
-  int i;
-
-  for (i = 0; i < QINDEX_RANGE; ++i)
-    if (av1_convert_qindex_to_q(i, bit_depth) >= FIRST_PASS_Q) break;
-
-  if (i == QINDEX_RANGE) i--;
-
-  return i;
-}
-
-static void set_first_pass_params(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  if (!cpi->refresh_alt_ref_frame && (cm->current_frame.frame_number == 0 ||
-                                      (cpi->frame_flags & FRAMEFLAGS_KEY))) {
-    cm->current_frame.frame_type = KEY_FRAME;
-  } else {
-    cm->current_frame.frame_type = INTER_FRAME;
-  }
-  // Do not use periodic key frames.
-  cpi->rc.frames_to_key = INT_MAX;
+  return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1);
 }
 
 static double raw_motion_error_stdev(int *raw_motion_err_list,
@@ -486,7 +318,7 @@ static double raw_motion_error_stdev(int *raw_motion_err_list,
 
 #define UL_INTRA_THRESH 50
 #define INVALID_ROW -1
-void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
+void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
   int mb_row, mb_col;
   MACROBLOCK *const x = &cpi->td.mb;
   AV1_COMMON *const cm = &cpi->common;
@@ -501,7 +333,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
       &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
   int i;
 
-  int recon_yoffset, recon_uvoffset;
+  int recon_yoffset, src_yoffset, recon_uvoffset;
   int64_t intra_error = 0;
   int64_t frame_avg_wavelet_energy = 0;
   int64_t coded_error = 0;
@@ -521,15 +353,15 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   int sum_in_vectors = 0;
   MV lastmv = kZeroMv;
   TWO_PASS *twopass = &cpi->twopass;
-  int recon_y_stride, recon_uv_stride, uv_mb_height;
+  int recon_y_stride, src_y_stride, recon_uv_stride, uv_mb_height;
 
-  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  const YV12_BUFFER_CONFIG *const lst_yv12 =
+      get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  const YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
   YV12_BUFFER_CONFIG *const new_yv12 = &cm->cur_frame->buf;
   const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
   double intra_factor;
   double brightness_factor;
-  BufferPool *const pool = cm->buffer_pool;
   const int qindex = find_fp_qindex(seq_params->bit_depth);
   const int mb_scale = mi_size_wide[BLOCK_16X16];
 
@@ -542,12 +374,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   assert(new_yv12 != NULL);
   assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    av1_zero_array(cpi->twopass.frame_mb_stats_buf, cpi->initial_mbs);
-  }
-#endif
-
+  av1_setup_frame_size(cpi);
   aom_clear_system_state();
 
   xd->mi = cm->mi_grid_visible;
@@ -558,7 +385,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   brightness_factor = 0.0;
   neutral_count = 0.0;
 
-  set_first_pass_params(cpi);
+  // Do not use periodic key frames.
+  cpi->rc.frames_to_key = INT_MAX;
+
   av1_set_quantizer(cm, qindex);
 
   av1_setup_block_planes(&x->e_mbd, seq_params->subsampling_x,
@@ -589,12 +418,11 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   }
 
   av1_init_mv_probs(cm);
-  av1_init_lv_map(cm);
   av1_initialize_rd_consts(cpi);
 
   // Tiling is ignored in the first pass.
   av1_tile_init(&tile, cm, 0, 0);
-
+  src_y_stride = cpi->source->y_stride;
   recon_y_stride = new_yv12->y_stride;
   recon_uv_stride = new_yv12->uv_stride;
   uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
@@ -605,6 +433,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
     // Reset above block coeffs.
     xd->up_available = (mb_row != 0);
     recon_yoffset = (mb_row * recon_y_stride * 16);
+    src_yoffset = (mb_row * src_y_stride * 16);
     recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
 
     // Set up limit values for motion vectors to prevent them extending
@@ -620,10 +449,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
       double log_intra;
       int level_sample;
 
-#if CONFIG_FP_MB_STATS
-      const int mb_index = mb_row * cm->mb_cols + mb_col;
-#endif
-
       aom_clear_system_state();
 
       const int idx_str = xd->mi_stride * mb_row * mb_scale + mb_col * mb_scale;
@@ -650,11 +475,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
       av1_encode_intra_block_plane(cpi, x, bsize, 0, 0, mb_row * 2, mb_col * 2);
       this_error = aom_get_mb_ss(x->plane[0].src_diff);
 
-      // Keep a record of blocks that have almost no intra error residual
-      // (i.e. are in effect completely flat and untextured in the intra
-      // domain). In natural videos this is uncommon, but it is much more
-      // common in animations, graphics and screen content, so may be used
-      // as a signal to detect these types of content.
       if (this_error < UL_INTRA_THRESH) {
         ++intra_skip_count;
       } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
@@ -702,21 +522,15 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
       // Accumulate the intra error.
       intra_error += (int64_t)this_error;
 
-      int stride = x->plane[0].src.stride;
+      const int hbd = is_cur_buf_hbd(xd);
+      const int stride = x->plane[0].src.stride;
       uint8_t *buf = x->plane[0].src.buf;
-      for (int r8 = 0; r8 < 2; ++r8)
+      for (int r8 = 0; r8 < 2; ++r8) {
         for (int c8 = 0; c8 < 2; ++c8) {
-          int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
           frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
               buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
         }
-
-#if CONFIG_FP_MB_STATS
-      if (cpi->use_fp_mb_stats) {
-        // initialization
-        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
       }
-#endif
 
       // Set up limit values for motion vectors to prevent them extending
       // outside the UMV borders.
@@ -731,7 +545,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
         struct buf_2d unscaled_last_source_buf_2d;
 
         xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        if (is_cur_buf_hbd(xd)) {
           motion_error = highbd_get_prediction_error(
               bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
         } else {
@@ -743,10 +557,10 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
         // frame as the reference. Skip the further motion search on
         // reconstructed frame if this error is small.
         unscaled_last_source_buf_2d.buf =
-            cpi->unscaled_last_source->y_buffer + recon_yoffset;
+            cpi->unscaled_last_source->y_buffer + src_yoffset;
         unscaled_last_source_buf_2d.stride =
             cpi->unscaled_last_source->y_stride;
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        if (is_cur_buf_hbd(xd)) {
           raw_motion_error = highbd_get_prediction_error(
               bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
         } else {
@@ -778,7 +592,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
             int gf_motion_error;
 
             xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            if (is_cur_buf_hbd(xd)) {
               gf_motion_error = highbd_get_prediction_error(
                   bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
             } else {
@@ -816,20 +630,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
         best_ref_mv.row = 0;
         best_ref_mv.col = 0;
 
-#if CONFIG_FP_MB_STATS
-        if (cpi->use_fp_mb_stats) {
-          // intra predication statistics
-          cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
-          if (this_error > FPMB_ERROR_LARGE_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
-          } else if (this_error < FPMB_ERROR_SMALL_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
-          }
-        }
-#endif
-
         if (motion_error <= this_error) {
           aom_clear_system_state();
 
@@ -855,8 +655,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
           xd->mi[0]->tx_size = TX_4X4;
           xd->mi[0]->ref_frame[0] = LAST_FRAME;
           xd->mi[0]->ref_frame[1] = NONE_FRAME;
-          av1_build_inter_predictors_sby(cm, xd, mb_row * mb_scale,
-                                         mb_col * mb_scale, NULL, bsize);
+          av1_enc_build_inter_predictor(cm, xd, mb_row * mb_scale,
+                                        mb_col * mb_scale, NULL, bsize,
+                                        AOM_PLANE_Y, AOM_PLANE_Y);
           av1_encode_sby_pass1(cm, x, bsize);
           sum_mvr += mv.row;
           sum_mvr_abs += abs(mv.row);
@@ -868,50 +669,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
 
           best_ref_mv = mv;
 
-#if CONFIG_FP_MB_STATS
-          if (cpi->use_fp_mb_stats) {
-            // inter predication statistics
-            cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
-            if (this_error > FPMB_ERROR_LARGE_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_ERROR_LARGE_MASK;
-            } else if (this_error < FPMB_ERROR_SMALL_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_ERROR_SMALL_MASK;
-            }
-          }
-#endif
-
           if (!is_zero_mv(&mv)) {
             ++mvcount;
 
-#if CONFIG_FP_MB_STATS
-            if (cpi->use_fp_mb_stats) {
-              cpi->twopass.frame_mb_stats_buf[mb_index] &=
-                  ~FPMB_MOTION_ZERO_MASK;
-              // check estimated motion direction
-              if (mv.col > 0 && mv.col >= abs(mv.row)) {
-                // right direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_RIGHT_MASK;
-              } else if (mv.row < 0 && abs(mv.row) >= abs(mv.col)) {
-                // up direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_UP_MASK;
-              } else if (mv.col < 0 && abs(mv.col) >= abs(mv.row)) {
-                // left direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_LEFT_MASK;
-              } else {
-                // down direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_DOWN_MASK;
-              }
-            }
-#endif
-
             // Non-zero vector, was it different from the last non zero vector?
             if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count;
             lastmv = mv;
@@ -955,6 +715,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
       x->plane[2].src.buf += uv_mb_height;
 
       recon_yoffset += 16;
+      src_yoffset += 16;
       recon_uvoffset += uv_mb_height;
     }
     // Adjust to the next row of MBs.
@@ -1039,19 +800,12 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
     // TODO(paulwilkins):  Handle the case when duration is set to 0, or
     // something less than the full time between subsequent values of
     // cpi->source_time_stamp.
-    fps.duration = (double)(source->ts_end - source->ts_start);
+    fps.duration = (double)ts_duration;
 
     // Don't want to do output stats with a stack variable!
     twopass->this_frame_stats = fps;
     output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
     accumulate_stats(&twopass->total_stats, &fps);
-
-#if CONFIG_FP_MB_STATS
-    if (cpi->use_fp_mb_stats) {
-      output_fpmb_stats(twopass->frame_mb_stats_buf, cpi->initial_mbs,
-                        cpi->output_pkt_list);
-    }
-#endif
   }
 
   // Copy the previous Last Frame back into gf and and arf buffers if
@@ -1062,10 +816,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
        ((twopass->this_frame_stats.intra_error /
          DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
     if (gld_yv12 != NULL) {
-      assign_frame_buffer(
-          pool->frame_bufs,
-          &cm->ref_frame_map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)],
-          cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST_FRAME)]);
+      assign_frame_buffer_p(
+          &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)],
+          cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
     }
     twopass->sr_update_lag = 1;
   } else {
@@ -1075,19 +828,16 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   aom_extend_frame_borders(new_yv12, num_planes);
 
   // The frame we just compressed now becomes the last frame.
-  assign_frame_buffer(
-      pool->frame_bufs,
-      &cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST_FRAME)],
-      cm->new_fb_idx);
+  assign_frame_buffer_p(
+      &cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)], cm->cur_frame);
 
   // Special case for the first frame. Copy into the GF buffer as a second
   // reference.
   if (current_frame->frame_number == 0 &&
-      get_ref_frame_map_idx(cpi, GOLDEN_FRAME) != INVALID_IDX) {
-    assign_frame_buffer(
-        pool->frame_bufs,
-        &cm->ref_frame_map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)],
-        cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST_FRAME)]);
+      get_ref_frame_map_idx(cm, GOLDEN_FRAME) != INVALID_IDX) {
+    assign_frame_buffer_p(
+        &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)],
+        cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
   }
 
   // Use this to see what the first pass reconstruction looks like.
@@ -1108,2333 +858,3 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
 
   ++current_frame->frame_number;
 }
-
-static double calc_correction_factor(double err_per_mb, double err_divisor,
-                                     double pt_low, double pt_high, int q,
-                                     aom_bit_depth_t bit_depth) {
-  const double error_term = err_per_mb / err_divisor;
-
-  // Adjustment based on actual quantizer to power term.
-  const double power_term =
-      AOMMIN(av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
-
-  // Calculate correction factor.
-  if (power_term < 1.0) assert(error_term >= 0.0);
-
-  return fclamp(pow(error_term, power_term), 0.05, 5.0);
-}
-
-#define ERR_DIVISOR 100.0
-static int get_twopass_worst_quality(const AV1_COMP *cpi,
-                                     const double section_err,
-                                     double inactive_zone,
-                                     int section_target_bandwidth,
-                                     double group_weight_factor) {
-  const RATE_CONTROL *const rc = &cpi->rc;
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-
-  inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
-
-  if (section_target_bandwidth <= 0) {
-    return rc->worst_quality;  // Highest value allowed
-  } else {
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                            ? cpi->initial_mbs
-                            : cpi->common.MBs;
-    const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
-    const double av_err_per_mb = section_err / active_mbs;
-    const double speed_term = 1.0;
-    double ediv_size_correction;
-    const int target_norm_bits_per_mb =
-        (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
-        active_mbs;
-    int q;
-
-    // Larger image formats are expected to be a little harder to code
-    // relatively given the same prediction error score. This in part at
-    // least relates to the increased size and hence coding overheads of
-    // motion vectors. Some account of this is made through adjustment of
-    // the error divisor.
-    ediv_size_correction =
-        AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi)));
-    if (ediv_size_correction < 1.0)
-      ediv_size_correction = -(1.0 / ediv_size_correction);
-    ediv_size_correction *= 4.0;
-
-    // Try and pick a max Q that will be high enough to encode the
-    // content at the given rate.
-    for (q = rc->best_quality; q < rc->worst_quality; ++q) {
-      const double factor = calc_correction_factor(
-          av_err_per_mb, ERR_DIVISOR - ediv_size_correction, FACTOR_PT_LOW,
-          FACTOR_PT_HIGH, q, cpi->common.seq_params.bit_depth);
-      const int bits_per_mb = av1_rc_bits_per_mb(
-          INTER_FRAME, q, factor * speed_term * group_weight_factor,
-          cpi->common.seq_params.bit_depth);
-      if (bits_per_mb <= target_norm_bits_per_mb) break;
-    }
-
-    // Restriction on active max q for constrained quality mode.
-    if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level);
-    return q;
-  }
-}
-
-static void setup_rf_level_maxq(AV1_COMP *cpi) {
-  int i;
-  RATE_CONTROL *const rc = &cpi->rc;
-  for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) {
-    int qdelta = av1_frame_type_qdelta(cpi, i, rc->worst_quality);
-    rc->rf_level_maxq[i] = AOMMAX(rc->worst_quality + qdelta, rc->best_quality);
-  }
-}
-
-void av1_init_second_pass(AV1_COMP *cpi) {
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  TWO_PASS *const twopass = &cpi->twopass;
-  double frame_rate;
-  FIRSTPASS_STATS *stats;
-
-  zero_stats(&twopass->total_stats);
-  zero_stats(&twopass->total_left_stats);
-
-  if (!twopass->stats_in_end) return;
-
-  stats = &twopass->total_stats;
-
-  *stats = *twopass->stats_in_end;
-  twopass->total_left_stats = *stats;
-
-  frame_rate = 10000000.0 * stats->count / stats->duration;
-  // Each frame can have a different duration, as the frame rate in the source
-  // isn't guaranteed to be constant. The frame rate prior to the first frame
-  // encoded in the second pass is a guess. However, the sum duration is not.
-  // It is calculated based on the actual durations of all frames from the
-  // first pass.
-  av1_new_framerate(cpi, frame_rate);
-  twopass->bits_left =
-      (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
-
-  // This variable monitors how far behind the second ref update is lagging.
-  twopass->sr_update_lag = 1;
-
-  // Scan the first pass file and calculate a modified total error based upon
-  // the bias/power function used to allocate bits.
-  {
-    const double avg_error =
-        stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
-    const FIRSTPASS_STATS *s = twopass->stats_in;
-    double modified_error_total = 0.0;
-    twopass->modified_error_min =
-        (avg_error * oxcf->two_pass_vbrmin_section) / 100;
-    twopass->modified_error_max =
-        (avg_error * oxcf->two_pass_vbrmax_section) / 100;
-    while (s < twopass->stats_in_end) {
-      modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
-      ++s;
-    }
-    twopass->modified_error_left = modified_error_total;
-  }
-
-  // Reset the vbr bits off target counters
-  cpi->rc.vbr_bits_off_target = 0;
-  cpi->rc.vbr_bits_off_target_fast = 0;
-
-  cpi->rc.rate_error_estimate = 0;
-
-  // Static sequence monitor variables.
-  twopass->kf_zeromotion_pct = 100;
-  twopass->last_kfgroup_zeromotion_pct = 100;
-
-  if (oxcf->resize_mode != RESIZE_NONE) {
-    setup_rf_level_maxq(cpi);
-  }
-}
-
-#define SR_DIFF_PART 0.0015
-#define MOTION_AMP_PART 0.003
-#define INTRA_PART 0.005
-#define DEFAULT_DECAY_LIMIT 0.75
-#define LOW_SR_DIFF_TRHESH 0.1
-#define SR_DIFF_MAX 128.0
-
-static double get_sr_decay_rate(const AV1_COMP *cpi,
-                                const FIRSTPASS_STATS *frame) {
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                             : cpi->common.MBs;
-  double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
-  double sr_decay = 1.0;
-  double modified_pct_inter;
-  double modified_pcnt_intra;
-  const double motion_amplitude_factor =
-      frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
-
-  modified_pct_inter = frame->pcnt_inter;
-  if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
-      (double)NCOUNT_FRAME_II_THRESH) {
-    modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
-  }
-  modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
-
-  if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
-    sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX);
-    sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
-               (MOTION_AMP_PART * motion_amplitude_factor) -
-               (INTRA_PART * modified_pcnt_intra);
-  }
-  return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
-}
-
-// This function gives an estimate of how badly we believe the prediction
-// quality is decaying from frame to frame.
-static double get_zero_motion_factor(const AV1_COMP *cpi,
-                                     const FIRSTPASS_STATS *frame) {
-  const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
-  double sr_decay = get_sr_decay_rate(cpi, frame);
-  return AOMMIN(sr_decay, zero_motion_pct);
-}
-
-#define ZM_POWER_FACTOR 0.75
-
-static double get_prediction_decay_rate(const AV1_COMP *cpi,
-                                        const FIRSTPASS_STATS *next_frame) {
-  const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
-  const double zero_motion_factor =
-      (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
-                  ZM_POWER_FACTOR));
-
-  return AOMMAX(zero_motion_factor,
-                (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
-}
-
-// Function to test for a condition where a complex transition is followed
-// by a static section. For example in slide shows where there is a fade
-// between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval,
-                                      int still_interval,
-                                      double loop_decay_rate,
-                                      double last_decay_rate) {
-  TWO_PASS *const twopass = &cpi->twopass;
-  RATE_CONTROL *const rc = &cpi->rc;
-
-  // Break clause to detect very still sections after motion
-  // For example a static image after a fade or other transition
-  // instead of a clean scene cut.
-  if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 &&
-      last_decay_rate < 0.9) {
-    int j;
-
-    // Look ahead a few frames to see if static condition persists...
-    for (j = 0; j < still_interval; ++j) {
-      const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
-      if (stats >= twopass->stats_in_end) break;
-
-      if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
-    }
-
-    // Only if it does do we signal a transition to still.
-    return j == still_interval;
-  }
-
-  return 0;
-}
-
-// This function detects a flash through the high relative pcnt_second_ref
-// score in the frame following a flash frame. The offset passed in should
-// reflect this.
-static int detect_flash(const TWO_PASS *twopass, int offset) {
-  const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
-
-  // What we are looking for here is a situation where there is a
-  // brief break in prediction (such as a flash) but subsequent frames
-  // are reasonably well predicted by an earlier (pre flash) frame.
-  // The recovery after a flash is indicated by a high pcnt_second_ref
-  // compared to pcnt_inter.
-  return next_frame != NULL &&
-         next_frame->pcnt_second_ref > next_frame->pcnt_inter &&
-         next_frame->pcnt_second_ref >= 0.5;
-}
-
-// Update the motion related elements to the GF arf boost calculation.
-static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
-                                          double *mv_in_out,
-                                          double *mv_in_out_accumulator,
-                                          double *abs_mv_in_out_accumulator,
-                                          double *mv_ratio_accumulator) {
-  const double pct = stats->pcnt_motion;
-
-  // Accumulate Motion In/Out of frame stats.
-  *mv_in_out = stats->mv_in_out_count * pct;
-  *mv_in_out_accumulator += *mv_in_out;
-  *abs_mv_in_out_accumulator += fabs(*mv_in_out);
-
-  // Accumulate a measure of how uniform (or conversely how random) the motion
-  // field is (a ratio of abs(mv) / mv).
-  if (pct > 0.05) {
-    const double mvr_ratio =
-        fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
-    const double mvc_ratio =
-        fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
-
-    *mv_ratio_accumulator +=
-        pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs);
-    *mv_ratio_accumulator +=
-        pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs);
-  }
-}
-
-#define BASELINE_ERR_PER_MB 1000.0
-static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame,
-                               double this_frame_mv_in_out, double max_boost) {
-  double frame_boost;
-  const double lq = av1_convert_qindex_to_q(
-      cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth);
-  const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
-  int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                       : cpi->common.MBs;
-
-  // Correct for any inactive region in the image
-  num_mbs = (int)AOMMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
-
-  // Underlying boost factor is based on inter error ratio.
-  frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
-                DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
-  frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
-
-  // Increase boost for frames where new data coming into frame (e.g. zoom out).
-  // Slightly reduce boost if there is a net balance of motion out of the frame
-  // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0.
-  if (this_frame_mv_in_out > 0.0)
-    frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
-  // In the extreme case the boost is halved.
-  else
-    frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
-
-  return AOMMIN(frame_boost, max_boost * boost_q_correction);
-}
-
-static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames,
-                          int *f_boost, int *b_boost) {
-  TWO_PASS *const twopass = &cpi->twopass;
-  int i;
-  double boost_score = 0.0;
-  double mv_ratio_accumulator = 0.0;
-  double decay_accumulator = 1.0;
-  double this_frame_mv_in_out = 0.0;
-  double mv_in_out_accumulator = 0.0;
-  double abs_mv_in_out_accumulator = 0.0;
-  int arf_boost;
-  int flash_detected = 0;
-
-  // Search forward from the proposed arf/next gf position.
-  for (i = 0; i < f_frames; ++i) {
-    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
-    if (this_frame == NULL) break;
-
-    // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(
-        this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
-        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // We want to discount the flash frame itself and the recovery
-    // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(twopass, i + offset) ||
-                     detect_flash(twopass, i + offset + 1);
-
-    // Accumulate the effect of prediction quality decay.
-    if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
-      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                              ? MIN_DECAY_FACTOR
-                              : decay_accumulator;
-    }
-
-    boost_score +=
-        decay_accumulator *
-        calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
-  }
-
-  *f_boost = (int)boost_score;
-
-  // Reset for backward looking loop.
-  boost_score = 0.0;
-  mv_ratio_accumulator = 0.0;
-  decay_accumulator = 1.0;
-  this_frame_mv_in_out = 0.0;
-  mv_in_out_accumulator = 0.0;
-  abs_mv_in_out_accumulator = 0.0;
-
-  // Search backward towards last gf position.
-  for (i = -1; i >= -b_frames; --i) {
-    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
-    if (this_frame == NULL) break;
-
-    // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(
-        this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
-        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // We want to discount the the flash frame itself and the recovery
-    // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(twopass, i + offset) ||
-                     detect_flash(twopass, i + offset + 1);
-
-    // Cumulative effect of prediction quality decay.
-    if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
-      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                              ? MIN_DECAY_FACTOR
-                              : decay_accumulator;
-    }
-
-    boost_score +=
-        decay_accumulator *
-        calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
-  }
-  *b_boost = (int)boost_score;
-
-  arf_boost = (*f_boost + *b_boost);
-  if (arf_boost < ((b_frames + f_frames) * 20))
-    arf_boost = ((b_frames + f_frames) * 20);
-  arf_boost = AOMMAX(arf_boost, MIN_ARF_GF_BOOST);
-
-  return arf_boost;
-}
-
-// Calculate a section intra ratio used in setting max loop filter.
-static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
-                                         const FIRSTPASS_STATS *end,
-                                         int section_length) {
-  const FIRSTPASS_STATS *s = begin;
-  double intra_error = 0.0;
-  double coded_error = 0.0;
-  int i = 0;
-
-  while (s < end && i < section_length) {
-    intra_error += s->intra_error;
-    coded_error += s->coded_error;
-    ++s;
-    ++i;
-  }
-
-  return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
-}
-
-// Calculate the total bits to allocate in this GF/ARF group.
-static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
-                                             double gf_group_err) {
-  const RATE_CONTROL *const rc = &cpi->rc;
-  const TWO_PASS *const twopass = &cpi->twopass;
-  const int max_bits = frame_max_bits(rc, &cpi->oxcf);
-  int64_t total_group_bits;
-
-  // Calculate the bits to be allocated to the group as a whole.
-  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
-    total_group_bits = (int64_t)(twopass->kf_group_bits *
-                                 (gf_group_err / twopass->kf_group_error_left));
-  } else {
-    total_group_bits = 0;
-  }
-
-  // Clamp odd edge cases.
-  total_group_bits = (total_group_bits < 0)
-                         ? 0
-                         : (total_group_bits > twopass->kf_group_bits)
-                               ? twopass->kf_group_bits
-                               : total_group_bits;
-
-  // Clip based on user supplied data rate variability limit.
-  if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
-    total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
-
-  return total_group_bits;
-}
-
-// Calculate the number bits extra to assign to boosted frames in a group.
-static int calculate_boost_bits(int frame_count, int boost,
-                                int64_t total_group_bits) {
-  int allocation_chunks;
-
-  // return 0 for invalid inputs (could arise e.g. through rounding errors)
-  if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0;
-
-  allocation_chunks = (frame_count * 100) + boost;
-
-  // Prevent overflow.
-  if (boost > 1023) {
-    int divisor = boost >> 10;
-    boost /= divisor;
-    allocation_chunks /= divisor;
-  }
-
-  // Calculate the number of extra bits for use in the boosted frame or frames.
-  return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks),
-                0);
-}
-
-#if USE_SYMM_MULTI_LAYER
-// #define CHCEK_GF_PARAMETER
-#ifdef CHCEK_GF_PARAMETER
-void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
-                        int frame_nums) {
-  static const char *update_type_strings[] = {
-    "KF_UPDATE",          "LF_UPDATE",      "GF_UPDATE",
-    "ARF_UPDATE",         "OVERLAY_UPDATE", "BRF_UPDATE",
-    "LAST_BIPRED_UPDATE", "BIPRED_UPDATE",  "INTNL_OVERLAY_UPDATE",
-    "INTNL_ARF_UPDATE"
-  };
-  FILE *fid = fopen("GF_PARAMS.txt", "a");
-
-  fprintf(fid, "\n{%d}\n", gf_interval);
-  for (int i = 0; i <= frame_nums; ++i) {
-    fprintf(fid, "%s %d %d %d %d\n",
-            update_type_strings[gf_group->update_type[i]],
-            gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
-            gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
-  }
-
-  fprintf(fid, "number of nodes in each level: \n");
-  for (int i = 0; i < MAX_PYRAMID_LVL; ++i) {
-    fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]);
-  }
-  fprintf(fid, "\n");
-  fclose(fid);
-}
-#endif  // CHCEK_GF_PARAMETER
-static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
-  // Derive rf_level from update_type
-  switch (update_type) {
-    case LF_UPDATE: return INTER_NORMAL;
-    case ARF_UPDATE: return GF_ARF_STD;
-    case OVERLAY_UPDATE: return INTER_NORMAL;
-    case BRF_UPDATE: return GF_ARF_LOW;
-    case LAST_BIPRED_UPDATE: return INTER_NORMAL;
-    case BIPRED_UPDATE: return INTER_NORMAL;
-    case INTNL_ARF_UPDATE: return GF_ARF_LOW;
-    case INTNL_OVERLAY_UPDATE: return INTER_NORMAL;
-    default: return INTER_NORMAL;
-  }
-}
-
-static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
-                                   int *frame_ind, int arf_ind, int level) {
-  if (r - l < 4) {
-    while (++l < r) {
-      // leaf nodes, not a look-ahead frame
-      gf_group->update_type[*frame_ind] = LF_UPDATE;
-      gf_group->arf_src_offset[*frame_ind] = 0;
-      gf_group->arf_pos_in_gf[*frame_ind] = 0;
-      gf_group->arf_update_idx[*frame_ind] = arf_ind;
-      gf_group->pyramid_level[*frame_ind] = 0;
-      ++gf_group->pyramid_lvl_nodes[0];
-      ++(*frame_ind);
-    }
-  } else {
-    int m = (l + r) / 2;
-    int arf_pos_in_gf = *frame_ind;
-
-    gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
-    gf_group->arf_src_offset[*frame_ind] = m - l - 1;
-    gf_group->arf_pos_in_gf[*frame_ind] = 0;
-    gf_group->arf_update_idx[*frame_ind] = 1;  // mark all internal ARF 1
-    gf_group->pyramid_level[*frame_ind] = level;
-    ++gf_group->pyramid_lvl_nodes[level];
-    ++(*frame_ind);
-
-    // set parameters for frames displayed before this frame
-    set_multi_layer_params(gf_group, l, m, frame_ind, 1, level - 1);
-
-    // for overlay frames, we need to record the position of its corresponding
-    // arf frames for bit allocation
-    gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
-    gf_group->arf_src_offset[*frame_ind] = 0;
-    gf_group->arf_pos_in_gf[*frame_ind] = arf_pos_in_gf;
-    gf_group->arf_update_idx[*frame_ind] = 1;
-    gf_group->pyramid_level[*frame_ind] = 0;
-    ++(*frame_ind);
-
-    // set parameters for frames displayed after this frame
-    set_multi_layer_params(gf_group, m, r, frame_ind, arf_ind, level - 1);
-  }
-}
-
-static INLINE unsigned char get_pyramid_height(int pyramid_width) {
-  assert(pyramid_width <= 16 && pyramid_width >= 4 &&
-         "invalid gf interval for pyramid structure");
-
-  return pyramid_width > 12 ? 4 : (pyramid_width > 6 ? 3 : 2);
-}
-
-static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
-                                              const int gf_interval) {
-  int frame_index = 0;
-  gf_group->pyramid_height = get_pyramid_height(gf_interval);
-
-  assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL);
-
-  av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL);
-
-  // At the beginning of each GF group it will be a key or overlay frame,
-  gf_group->update_type[frame_index] = OVERLAY_UPDATE;
-  gf_group->arf_src_offset[frame_index] = 0;
-  gf_group->arf_pos_in_gf[frame_index] = 0;
-  gf_group->arf_update_idx[frame_index] = 0;
-  gf_group->pyramid_level[frame_index] = 0;
-  ++frame_index;
-
-  // ALT0
-  gf_group->update_type[frame_index] = ARF_UPDATE;
-  gf_group->arf_src_offset[frame_index] = gf_interval - 1;
-  gf_group->arf_pos_in_gf[frame_index] = 0;
-  gf_group->arf_update_idx[frame_index] = 0;
-  gf_group->pyramid_level[frame_index] = gf_group->pyramid_height;
-  ++frame_index;
-
-  // set parameters for the rest of the frames
-  set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0,
-                         gf_group->pyramid_height - 1);
-  return frame_index;
-}
-
-static void define_customized_gf_group_structure(AV1_COMP *cpi) {
-  RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-  const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
-
-  assert(rc->baseline_gf_interval >= 4 &&
-         rc->baseline_gf_interval <= MAX_PYRAMID_SIZE);
-
-  const int gf_update_frames =
-      construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval);
-  int frame_index;
-
-  cpi->num_extra_arfs = 0;
-
-  for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
-    // Set unused variables to default values
-    gf_group->bidir_pred_enabled[frame_index] = 0;
-    gf_group->brf_src_offset[frame_index] = 0;
-
-    // Special handle for the first frame for assigning update_type
-    if (frame_index == 0) {
-      // For key frames the frame target rate is already set and it
-      // is also the golden frame.
-      if (key_frame) {
-        gf_group->update_type[frame_index] = KF_UPDATE;
-        continue;
-      }
-
-      if (rc->source_alt_ref_active) {
-        gf_group->update_type[frame_index] = OVERLAY_UPDATE;
-      } else {
-        gf_group->update_type[frame_index] = GF_UPDATE;
-      }
-    } else {
-      if (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
-        ++cpi->num_extra_arfs;
-    }
-
-    // Assign rf level based on update type
-    gf_group->rf_level[frame_index] =
-        update_type_2_rf_level(gf_group->update_type[frame_index]);
-  }
-
-  // NOTE: We need to configure the frame at the end of the sequence + 1 that
-  //       will be the start frame for the next group. Otherwise prior to the
-  //       call to av1_rc_get_second_pass_params() the data will be undefined.
-  if (rc->source_alt_ref_pending) {
-    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
-    gf_group->rf_level[frame_index] = INTER_NORMAL;
-  } else {
-    gf_group->update_type[frame_index] = GF_UPDATE;
-    gf_group->rf_level[frame_index] = GF_ARF_STD;
-  }
-
-  gf_group->bidir_pred_enabled[frame_index] = 0;
-  gf_group->brf_src_offset[frame_index] = 0;
-  gf_group->arf_update_idx[frame_index] = 0;
-  // This value is only used for INTNL_OVERLAY_UPDATE
-  gf_group->arf_pos_in_gf[frame_index] = 0;
-
-  // This parameter is useless?
-  gf_group->arf_ref_idx[frame_index] = 0;
-#ifdef CHCEK_GF_PARAMETER
-  check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames);
-#endif
-}
-
-// It is an example of how to define a GF stucture manually. The function will
-// result in exactly the same GF group structure as
-// define_customized_gf_group_structure() when rc->baseline_gf_interval == 4
-#if USE_MANUAL_GF4_STRUCT
-#define GF_INTERVAL_4 4
-static const unsigned char gf4_multi_layer_params[][GF_FRAME_PARAMS] = {
-  {
-      // gf_group->index == 0 (Frame 0)
-      // It can also be KEY frame. Will assign the proper value
-      // in define_gf_group_structure
-      OVERLAY_UPDATE,  // update_type (default value)
-      0,               // arf_src_offset
-      0,               // arf_pos_in_gf
-      0                // arf_update_idx
-  },
-  {
-      // gf_group->index == 1 (Frame 4)
-      ARF_UPDATE,         // update_type
-      GF_INTERVAL_4 - 1,  // arf_src_offset
-      0,                  // arf_pos_in_gf
-      0                   // arf_update_idx
-  },
-  {
-      // gf_group->index == 2 (Frame 2)
-      INTNL_ARF_UPDATE,          // update_type
-      (GF_INTERVAL_4 >> 1) - 1,  // arf_src_offset
-      0,                         // arf_pos_in_gf
-      0                          // arf_update_idx
-  },
-  {
-      // gf_group->index == 3 (Frame 1)
-      LAST_BIPRED_UPDATE,  // update_type
-      0,                   // arf_src_offset
-      0,                   // arf_pos_in_gf
-      0                    // arf_update_idx
-  },
-
-  {
-      // gf_group->index == 4 (Frame 2 - OVERLAY)
-      INTNL_OVERLAY_UPDATE,  // update_type
-      0,                     // arf_src_offset
-      2,                     // arf_pos_in_gf
-      0                      // arf_update_idx
-  },
-  {
-      // gf_group->index == 5 (Frame 3)
-      LF_UPDATE,  // update_type
-      0,          // arf_src_offset
-      0,          // arf_pos_in_gf
-      1           // arf_update_idx
-  }
-};
-
-static int define_gf_group_structure_4(AV1_COMP *cpi) {
-  RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-  const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
-
-  assert(rc->baseline_gf_interval == GF_INTERVAL_4);
-
-  const int gf_update_frames = rc->baseline_gf_interval + 2;
-  int frame_index;
-
-  for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
-    int param_idx = 0;
-
-    gf_group->bidir_pred_enabled[frame_index] = 0;
-
-    if (frame_index == 0) {
-      // gf_group->arf_src_offset[frame_index] = 0;
-      gf_group->brf_src_offset[frame_index] = 0;
-      gf_group->bidir_pred_enabled[frame_index] = 0;
-
-      // For key frames the frame target rate is already set and it
-      // is also the golden frame.
-      if (key_frame) continue;
-
-      gf_group->update_type[frame_index] =
-          gf4_multi_layer_params[frame_index][param_idx++];
-
-      if (rc->source_alt_ref_active) {
-        gf_group->update_type[frame_index] = OVERLAY_UPDATE;
-      } else {
-        gf_group->update_type[frame_index] = GF_UPDATE;
-      }
-      param_idx++;
-    } else {
-      gf_group->update_type[frame_index] =
-          gf4_multi_layer_params[frame_index][param_idx++];
-    }
-
-    // setup other parameters
-    gf_group->rf_level[frame_index] =
-        update_type_2_rf_level(gf_group->update_type[frame_index]);
-
-    // == arf_src_offset ==
-    gf_group->arf_src_offset[frame_index] =
-        gf4_multi_layer_params[frame_index][param_idx++];
-
-    // == arf_pos_in_gf ==
-    gf_group->arf_pos_in_gf[frame_index] =
-        gf4_multi_layer_params[frame_index][param_idx++];
-
-    // == arf_update_idx ==
-    gf_group->brf_src_offset[frame_index] =
-        gf4_multi_layer_params[frame_index][param_idx];
-  }
-
-  // NOTE: We need to configure the frame at the end of the sequence + 1 that
-  //       will be the start frame for the next group. Otherwise prior to the
-  //       call to av1_rc_get_second_pass_params() the data will be undefined.
-  gf_group->arf_update_idx[frame_index] = 0;
-  gf_group->arf_ref_idx[frame_index] = 0;
-
-  if (rc->source_alt_ref_pending) {
-    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
-    gf_group->rf_level[frame_index] = INTER_NORMAL;
-
-  } else {
-    gf_group->update_type[frame_index] = GF_UPDATE;
-    gf_group->rf_level[frame_index] = GF_ARF_STD;
-  }
-
-  gf_group->bidir_pred_enabled[frame_index] = 0;
-  gf_group->brf_src_offset[frame_index] = 0;
-
-  // This value is only used for INTNL_OVERLAY_UPDATE
-  gf_group->arf_pos_in_gf[frame_index] = 0;
-
-  return gf_update_frames;
-}
-#endif  // USE_MANUAL_GF4_STRUCT
-#endif  // USE_SYMM_MULTI_LAYER
-
-static void define_gf_group_structure(AV1_COMP *cpi) {
-  RATE_CONTROL *const rc = &cpi->rc;
-
-#if USE_SYMM_MULTI_LAYER
-  const int valid_customized_gf_length =
-      rc->baseline_gf_interval >= 4 &&
-      rc->baseline_gf_interval <= MAX_PYRAMID_SIZE;
-  // used the new structure only if extra_arf is allowed
-  if (valid_customized_gf_length && rc->source_alt_ref_pending &&
-      cpi->extra_arf_allowed > 0) {
-#if USE_MANUAL_GF4_STRUCT
-    if (rc->baseline_gf_interval == 4)
-      define_gf_group_structure_4(cpi);
-    else
-#endif
-      define_customized_gf_group_structure(cpi);
-    cpi->new_bwdref_update_rule = 1;
-    return;
-  } else {
-    cpi->new_bwdref_update_rule = 0;
-  }
-#endif
-
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-  int i;
-  int frame_index = 0;
-  const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
-
-  // The use of bi-predictive frames are only enabled when following 3
-  // conditions are met:
-  // (1) ALTREF is enabled;
-  // (2) The bi-predictive group interval is at least 2; and
-  // (3) The bi-predictive group interval is strictly smaller than the
-  //     golden group interval.
-  const int is_bipred_enabled =
-      cpi->extra_arf_allowed && rc->source_alt_ref_pending &&
-      rc->bipred_group_interval &&
-      rc->bipred_group_interval <=
-          (rc->baseline_gf_interval - rc->source_alt_ref_pending);
-  int bipred_group_end = 0;
-  int bipred_frame_index = 0;
-
-  const unsigned char ext_arf_interval =
-      (unsigned char)(rc->baseline_gf_interval / (cpi->num_extra_arfs + 1) - 1);
-  int which_arf = cpi->num_extra_arfs;
-  int subgroup_interval[MAX_EXT_ARFS + 1];
-  int is_sg_bipred_enabled = is_bipred_enabled;
-  int accumulative_subgroup_interval = 0;
-
-  // For key frames the frame target rate is already set and it
-  // is also the golden frame.
-  // === [frame_index == 0] ===
-  if (!key_frame) {
-    if (rc->source_alt_ref_active) {
-      gf_group->update_type[frame_index] = OVERLAY_UPDATE;
-      gf_group->rf_level[frame_index] = INTER_NORMAL;
-    } else {
-      gf_group->update_type[frame_index] = GF_UPDATE;
-      gf_group->rf_level[frame_index] = GF_ARF_STD;
-    }
-    gf_group->arf_update_idx[frame_index] = 0;
-    gf_group->arf_ref_idx[frame_index] = 0;
-  }
-
-  gf_group->bidir_pred_enabled[frame_index] = 0;
-  gf_group->brf_src_offset[frame_index] = 0;
-
-  frame_index++;
-
-  bipred_frame_index++;
-
-  // === [frame_index == 1] ===
-  if (rc->source_alt_ref_pending) {
-    gf_group->update_type[frame_index] = ARF_UPDATE;
-    gf_group->rf_level[frame_index] = GF_ARF_STD;
-    gf_group->arf_src_offset[frame_index] =
-        (unsigned char)(rc->baseline_gf_interval - 1);
-
-    gf_group->arf_update_idx[frame_index] = 0;
-    gf_group->arf_ref_idx[frame_index] = 0;
-
-    gf_group->bidir_pred_enabled[frame_index] = 0;
-    gf_group->brf_src_offset[frame_index] = 0;
-    // NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames.
-
-    // Work out the ARFs' positions in this gf group
-    // NOTE(weitinglin): ALT_REFs' are indexed inversely, but coded in display
-    // order (except for the original ARF). In the example of three ALT_REF's,
-    // We index ALTREF's as: KEY ----- ALT2 ----- ALT1 ----- ALT0
-    // but code them in the following order:
-    // KEY-ALT0-ALT2 ----- OVERLAY2-ALT1 ----- OVERLAY1 ----- OVERLAY0
-    //
-    // arf_pos_for_ovrly[]: Position for OVERLAY
-    // arf_pos_in_gf[]:     Position for ALTREF
-    cpi->arf_pos_for_ovrly[0] = frame_index + cpi->num_extra_arfs +
-                                gf_group->arf_src_offset[frame_index] + 1;
-    for (i = 0; i < cpi->num_extra_arfs; ++i) {
-      cpi->arf_pos_for_ovrly[i + 1] =
-          frame_index + (cpi->num_extra_arfs - i) * (ext_arf_interval + 2);
-      subgroup_interval[i] = cpi->arf_pos_for_ovrly[i] -
-                             cpi->arf_pos_for_ovrly[i + 1] - (i == 0 ? 1 : 2);
-    }
-    subgroup_interval[cpi->num_extra_arfs] =
-        cpi->arf_pos_for_ovrly[cpi->num_extra_arfs] - frame_index -
-        (cpi->num_extra_arfs == 0 ? 1 : 2);
-
-    ++frame_index;
-
-    // Insert an extra ARF
-    // === [frame_index == 2] ===
-    if (cpi->num_extra_arfs) {
-      gf_group->update_type[frame_index] = INTNL_ARF_UPDATE;
-      gf_group->rf_level[frame_index] = GF_ARF_LOW;
-      gf_group->arf_src_offset[frame_index] = ext_arf_interval;
-
-      gf_group->arf_update_idx[frame_index] = which_arf;
-      gf_group->arf_ref_idx[frame_index] = 0;
-      ++frame_index;
-    }
-    accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs];
-  }
-
-  const int normal_frames =
-      rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending);
-
-  for (i = 0; i < normal_frames; ++i) {
-    gf_group->arf_update_idx[frame_index] = which_arf;
-    gf_group->arf_ref_idx[frame_index] = which_arf;
-
-    // If we are going to have ARFs, check whether we can have BWDREF in this
-    // subgroup, and further, whether we can have ARF subgroup which contains
-    // the BWDREF subgroup but contained within the GF group:
-    //
-    // GF group --> ARF subgroup --> BWDREF subgroup
-    if (rc->source_alt_ref_pending) {
-      is_sg_bipred_enabled =
-          is_bipred_enabled &&
-          (subgroup_interval[which_arf] > rc->bipred_group_interval);
-    }
-
-    // NOTE: BIDIR_PRED is only enabled when the length of the bi-predictive
-    //       frame group interval is strictly smaller than that of the GOLDEN
-    //       FRAME group interval.
-    // TODO(zoeliu): Currently BIDIR_PRED is only enabled when alt-ref is on.
-    if (is_sg_bipred_enabled && !bipred_group_end) {
-      const int cur_brf_src_offset = rc->bipred_group_interval - 1;
-
-      if (bipred_frame_index == 1) {
-        // --- BRF_UPDATE ---
-        gf_group->update_type[frame_index] = BRF_UPDATE;
-        gf_group->rf_level[frame_index] = GF_ARF_LOW;
-        gf_group->brf_src_offset[frame_index] = cur_brf_src_offset;
-      } else if (bipred_frame_index == rc->bipred_group_interval) {
-        // --- LAST_BIPRED_UPDATE ---
-        gf_group->update_type[frame_index] = LAST_BIPRED_UPDATE;
-        gf_group->rf_level[frame_index] = INTER_NORMAL;
-        gf_group->brf_src_offset[frame_index] = 0;
-
-        // Reset the bi-predictive frame index.
-        bipred_frame_index = 0;
-      } else {
-        // --- BIPRED_UPDATE ---
-        gf_group->update_type[frame_index] = BIPRED_UPDATE;
-        gf_group->rf_level[frame_index] = INTER_NORMAL;
-        gf_group->brf_src_offset[frame_index] = 0;
-      }
-      gf_group->bidir_pred_enabled[frame_index] = 1;
-
-      bipred_frame_index++;
-      // Check whether the next bi-predictive frame group would entirely be
-      // included within the current golden frame group.
-      // In addition, we need to avoid coding a BRF right before an ARF.
-      if (bipred_frame_index == 1 &&
-          (i + 2 + cur_brf_src_offset) >= accumulative_subgroup_interval) {
-        bipred_group_end = 1;
-      }
-    } else {
-      gf_group->update_type[frame_index] = LF_UPDATE;
-      gf_group->rf_level[frame_index] = INTER_NORMAL;
-      gf_group->bidir_pred_enabled[frame_index] = 0;
-      gf_group->brf_src_offset[frame_index] = 0;
-    }
-
-    ++frame_index;
-
-    // Check if we need to update the ARF.
-    if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 &&
-        frame_index > cpi->arf_pos_for_ovrly[which_arf]) {
-      --which_arf;
-      accumulative_subgroup_interval += subgroup_interval[which_arf] + 1;
-
-      // Meet the new subgroup; Reset the bipred_group_end flag.
-      bipred_group_end = 0;
-      // Insert another extra ARF after the overlay frame
-      if (which_arf) {
-        gf_group->update_type[frame_index] = INTNL_ARF_UPDATE;
-        gf_group->rf_level[frame_index] = GF_ARF_LOW;
-        gf_group->arf_src_offset[frame_index] = ext_arf_interval;
-
-        gf_group->arf_update_idx[frame_index] = which_arf;
-        gf_group->arf_ref_idx[frame_index] = 0;
-        ++frame_index;
-      }
-    }
-  }
-
-  // NOTE: We need to configure the frame at the end of the sequence + 1 that
-  //       will be the start frame for the next group. Otherwise prior to the
-  //       call to av1_rc_get_second_pass_params() the data will be undefined.
-  gf_group->arf_update_idx[frame_index] = 0;
-  gf_group->arf_ref_idx[frame_index] = 0;
-
-  if (rc->source_alt_ref_pending) {
-    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
-    gf_group->rf_level[frame_index] = INTER_NORMAL;
-
-    cpi->arf_pos_in_gf[0] = 1;
-    if (cpi->num_extra_arfs) {
-      // Overwrite the update_type for extra-ARF's corresponding internal
-      // OVERLAY's: Change from LF_UPDATE to INTNL_OVERLAY_UPDATE.
-      for (i = cpi->num_extra_arfs; i > 0; --i) {
-        cpi->arf_pos_in_gf[i] =
-            (i == cpi->num_extra_arfs ? 2 : cpi->arf_pos_for_ovrly[i + 1] + 1);
-
-        gf_group->update_type[cpi->arf_pos_for_ovrly[i]] = INTNL_OVERLAY_UPDATE;
-        gf_group->rf_level[cpi->arf_pos_for_ovrly[i]] = INTER_NORMAL;
-      }
-    }
-  } else {
-    gf_group->update_type[frame_index] = GF_UPDATE;
-    gf_group->rf_level[frame_index] = GF_ARF_STD;
-  }
-
-  gf_group->bidir_pred_enabled[frame_index] = 0;
-  gf_group->brf_src_offset[frame_index] = 0;
-}
-
-#if USE_SYMM_MULTI_LAYER
-#define NEW_MULTI_LVL_BOOST_VBR_ALLOC 1
-
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
-#define LEAF_REDUCTION_FACTOR 0.75
-static double lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = {
-  { 1.0, 0.0, 0.0 }, { 0.6, 0.4, 0 }, { 0.45, 0.35, 0.20 }
-};
-#endif  // NEW_MULTI_LVL_BOOST_VBR_ALLOC
-#endif  // USE_SYMM_MULTI_LAYER
-static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
-                                   double group_error, int gf_arf_bits) {
-  RATE_CONTROL *const rc = &cpi->rc;
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-  int i;
-  int frame_index = 0;
-  int key_frame;
-  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
-  int64_t total_group_bits = gf_group_bits;
-  int ext_arf_boost[MAX_EXT_ARFS];
-
-  define_gf_group_structure(cpi);
-
-  av1_zero_array(ext_arf_boost, MAX_EXT_ARFS);
-
-  key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
-
-  // For key frames the frame target rate is already set and it
-  // is also the golden frame.
-  // === [frame_index == 0] ===
-  if (!key_frame) {
-    if (rc->source_alt_ref_active)
-      gf_group->bit_allocation[frame_index] = 0;
-    else
-      gf_group->bit_allocation[frame_index] = gf_arf_bits;
-
-    // Step over the golden frame / overlay frame
-    FIRSTPASS_STATS frame_stats;
-    if (EOF == input_stats(twopass, &frame_stats)) return;
-  }
-
-  // Deduct the boost bits for arf (or gf if it is not a key frame)
-  // from the group total.
-  if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
-
-  frame_index++;
-
-  // Store the bits to spend on the ARF if there is one.
-  // === [frame_index == 1] ===
-  if (rc->source_alt_ref_pending) {
-    gf_group->bit_allocation[frame_index] = gf_arf_bits;
-
-    ++frame_index;
-
-    // Skip all the extra-ARF's right after ARF at the starting segment of
-    // the current GF group.
-    if (cpi->num_extra_arfs) {
-      while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
-        ++frame_index;
-    }
-  }
-
-#if USE_SYMM_MULTI_LAYER
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
-  // Save.
-  const int tmp_frame_index = frame_index;
-  int budget_reduced_from_leaf_level = 0;
-#endif  // NEW_MULTI_LVL_BOOST_VBR_ALLOC
-#endif  // USE_SYMM_MULTI_LAYER
-
-  // Allocate bits to the other frames in the group.
-  const int normal_frames =
-      rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending);
-
-  for (i = 0; i < normal_frames; ++i) {
-    FIRSTPASS_STATS frame_stats;
-    if (EOF == input_stats(twopass, &frame_stats)) break;
-
-    const double modified_err =
-        calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
-    const double err_fraction =
-        (group_error > 0) ? modified_err / DOUBLE_DIVIDE_CHECK(group_error)
-                          : 0.0;
-    const int target_frame_size =
-        clamp((int)((double)total_group_bits * err_fraction), 0,
-              AOMMIN(max_bits, (int)total_group_bits));
-
-    if (gf_group->update_type[frame_index] == BRF_UPDATE) {
-      // Boost up the allocated bits on BWDREF_FRAME
-      gf_group->bit_allocation[frame_index] =
-          target_frame_size + (target_frame_size >> 2);
-    } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) {
-      // Press down the allocated bits on LAST_BIPRED_UPDATE frames
-      gf_group->bit_allocation[frame_index] =
-          target_frame_size - (target_frame_size >> 1);
-    } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) {
-      // TODO(zoeliu): To investigate whether the allocated bits on
-      // BIPRED_UPDATE frames need to be further adjusted.
-      gf_group->bit_allocation[frame_index] = target_frame_size;
-#if USE_SYMM_MULTI_LAYER
-    } else if (cpi->new_bwdref_update_rule &&
-               gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
-      assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
-             "non-valid height for a pyramid structure");
-
-      const int arf_pos = gf_group->arf_pos_in_gf[frame_index];
-      gf_group->bit_allocation[frame_index] = 0;
-
-      gf_group->bit_allocation[arf_pos] = target_frame_size;
-      // Note: Boost, if needed, is added in the next loop.
-#endif  // USE_SYMM_MULTI_LAYER
-    } else {
-      assert(gf_group->update_type[frame_index] == LF_UPDATE ||
-             gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
-      gf_group->bit_allocation[frame_index] = target_frame_size;
-#if MULTI_LVL_BOOST_VBR_CQ
-      if (cpi->new_bwdref_update_rule) {
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
-        const int this_budget_reduction =
-            (int)(target_frame_size * LEAF_REDUCTION_FACTOR);
-        gf_group->bit_allocation[frame_index] -= this_budget_reduction;
-        budget_reduced_from_leaf_level += this_budget_reduction;
-#else
-        gf_group->bit_allocation[frame_index] -= (target_frame_size >> 1);
-#endif  // NEW_MULTI_LVL_BOOST_VBR_ALLOC
-      }
-#endif  // MULTI_LVL_BOOST_VBR_CQ
-    }
-
-    ++frame_index;
-
-    // Skip all the extra-ARF's.
-    if (cpi->num_extra_arfs) {
-      while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
-        ++frame_index;
-    }
-  }
-
-#if USE_SYMM_MULTI_LAYER
-#if MULTI_LVL_BOOST_VBR_CQ
-  if (budget_reduced_from_leaf_level > 0) {
-    // Restore.
-    frame_index = tmp_frame_index;
-
-    // Re-distribute this extra budget to overlay frames in the group.
-    for (i = 0; i < normal_frames; ++i) {
-      if (cpi->new_bwdref_update_rule &&
-          gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
-        assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
-               "non-valid height for a pyramid structure");
-        const int arf_pos = gf_group->arf_pos_in_gf[frame_index];
-        const int this_lvl = gf_group->pyramid_level[arf_pos];
-        const int dist2top = gf_group->pyramid_height - 1 - this_lvl;
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
-        const double lvl_boost_factor =
-            lvl_budget_factor[gf_group->pyramid_height - 2][dist2top];
-        const int extra_size =
-            (int)(budget_reduced_from_leaf_level * lvl_boost_factor /
-                  gf_group->pyramid_lvl_nodes[this_lvl]);
-#else
-        const int target_frame_size = gf_group->bit_allocation[arf_pos];
-        const int extra_size = target_frame_size >> dist2top;
-#endif  // NEW_MULTI_LVL_BOOST_VBR_ALLOC
-        gf_group->bit_allocation[arf_pos] += extra_size;
-      }
-      ++frame_index;
-
-      // Skip all the extra-ARF's.
-      if (cpi->num_extra_arfs) {
-        while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
-          ++frame_index;
-      }
-    }
-  }
-#endif  // MULTI_LVL_BOOST_VBR_CQ
-#endif  // USE_SYMM_MULTI_LAYER
-
-#if USE_SYMM_MULTI_LAYER
-  if (cpi->new_bwdref_update_rule == 0 && rc->source_alt_ref_pending) {
-#else
-  if (rc->source_alt_ref_pending) {
-#endif
-    if (cpi->num_extra_arfs) {
-      // NOTE: For bit allocation, move the allocated bits associated with
-      //       INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE.
-      //       i > 0 for extra-ARF's and i == 0 for ARF:
-      //         arf_pos_for_ovrly[i]: Position for INTNL_OVERLAY_UPDATE
-      //         arf_pos_in_gf[i]: Position for INTNL_ARF_UPDATE
-      for (i = cpi->num_extra_arfs; i > 0; --i) {
-        assert(gf_group->update_type[cpi->arf_pos_for_ovrly[i]] ==
-               INTNL_OVERLAY_UPDATE);
-
-        // Encoder's choice:
-        //   Set show_existing_frame == 1 for all extra-ARF's, and hence
-        //   allocate zero bit for both all internal OVERLAY frames.
-        gf_group->bit_allocation[cpi->arf_pos_in_gf[i]] =
-            gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]];
-        gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]] = 0;
-      }
-    }
-  }
-}
-
-// Returns true if KF group and GF group both are almost completely static.
-static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) {
-  return (gf_zero_motion >= 0.995) &&
-         (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
-}
-
-// Analyse and define a gf/arf group.
-static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  AV1_COMMON *const cm = &cpi->common;
-  RATE_CONTROL *const rc = &cpi->rc;
-  AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  TWO_PASS *const twopass = &cpi->twopass;
-  FIRSTPASS_STATS next_frame;
-  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
-  int i;
-
-  double boost_score = 0.0;
-#if !CONFIG_FIX_GF_LENGTH
-  double old_boost_score = 0.0;
-  double mv_ratio_accumulator_thresh;
-  int active_max_gf_interval;
-  int active_min_gf_interval;
-#endif
-  double gf_group_err = 0.0;
-#if GROUP_ADAPTIVE_MAXQ
-  double gf_group_raw_error = 0.0;
-#endif
-  double gf_group_skip_pct = 0.0;
-  double gf_group_inactive_zone_rows = 0.0;
-  double gf_first_frame_err = 0.0;
-  double mod_frame_err = 0.0;
-
-  double mv_ratio_accumulator = 0.0;
-  double decay_accumulator = 1.0;
-  double zero_motion_accumulator = 1.0;
-
-  double loop_decay_rate = 1.00;
-  double last_loop_decay_rate = 1.00;
-
-  double this_frame_mv_in_out = 0.0;
-  double mv_in_out_accumulator = 0.0;
-  double abs_mv_in_out_accumulator = 0.0;
-
-  unsigned int allow_alt_ref = is_altref_enabled(cpi);
-
-  int f_boost = 0;
-  int b_boost = 0;
-  int flash_detected;
-  int64_t gf_group_bits;
-  double gf_group_error_left;
-  int gf_arf_bits;
-  const int is_key_frame = frame_is_intra_only(cm);
-  const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
-
-  cpi->extra_arf_allowed = 1;
-
-  // Reset the GF group data structures unless this is a key
-  // frame in which case it will already have been done.
-  if (is_key_frame == 0) {
-    av1_zero(twopass->gf_group);
-  }
-
-  aom_clear_system_state();
-  av1_zero(next_frame);
-
-  // Load stats for the current frame.
-  mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
-
-  // Note the error of the frame at the start of the group. This will be
-  // the GF frame error if we code a normal gf.
-  gf_first_frame_err = mod_frame_err;
-
-  // If this is a key frame or the overlay from a previous arf then
-  // the error score / cost of this frame has already been accounted for.
-  if (arf_active_or_kf) {
-    gf_group_err -= gf_first_frame_err;
-#if GROUP_ADAPTIVE_MAXQ
-    gf_group_raw_error -= this_frame->coded_error;
-#endif
-    gf_group_skip_pct -= this_frame->intra_skip_pct;
-    gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
-  }
-#if !CONFIG_FIX_GF_LENGTH
-  // Motion breakout threshold for loop below depends on image size.
-  mv_ratio_accumulator_thresh =
-      (cpi->initial_height + cpi->initial_width) / 4.0;
-  // Set a maximum and minimum interval for the GF group.
-  // If the image appears almost completely static we can extend beyond this.
-  {
-    int int_max_q = (int)(av1_convert_qindex_to_q(
-        twopass->active_worst_quality, cpi->common.seq_params.bit_depth));
-    int int_lbq = (int)(av1_convert_qindex_to_q(
-        rc->last_boosted_qindex, cpi->common.seq_params.bit_depth));
-
-    active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200);
-    if (active_min_gf_interval > rc->max_gf_interval)
-      active_min_gf_interval = rc->max_gf_interval;
-
-    // The value chosen depends on the active Q range. At low Q we have
-    // bits to spare and are better with a smaller interval and smaller boost.
-    // At high Q when there are few bits to spare we are better with a longer
-    // interval to spread the cost of the GF.
-    active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
-
-    // We have: active_min_gf_interval <= rc->max_gf_interval
-    if (active_max_gf_interval < active_min_gf_interval)
-      active_max_gf_interval = active_min_gf_interval;
-    else if (active_max_gf_interval > rc->max_gf_interval)
-      active_max_gf_interval = rc->max_gf_interval;
-  }
-#endif  // !CONFIG_FIX_GF_LENGTH
-  double avg_sr_coded_error = 0;
-  double avg_raw_err_stdev = 0;
-  int non_zero_stdev_count = 0;
-
-  i = 0;
-  while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
-    ++i;
-
-    // Accumulate error score of frames in this gf group.
-    mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
-    gf_group_err += mod_frame_err;
-#if GROUP_ADAPTIVE_MAXQ
-    gf_group_raw_error += this_frame->coded_error;
-#endif
-    gf_group_skip_pct += this_frame->intra_skip_pct;
-    gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
-
-    if (EOF == input_stats(twopass, &next_frame)) break;
-
-    // Test for the case where there is a brief flash but the prediction
-    // quality back to an earlier frame is then restored.
-    flash_detected = detect_flash(twopass, 0);
-
-    // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(
-        &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
-        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-    // sum up the metric values of current gf group
-    avg_sr_coded_error += next_frame.sr_coded_error;
-    if (fabs(next_frame.raw_error_stdev) > 0.000001) {
-      non_zero_stdev_count++;
-      avg_raw_err_stdev += next_frame.raw_error_stdev;
-    }
-
-    // Accumulate the effect of prediction quality decay.
-    if (!flash_detected) {
-      last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-
-      decay_accumulator = decay_accumulator * loop_decay_rate;
-
-      // Monitor for static sections.
-      if ((rc->frames_since_key + i - 1) > 1) {
-        zero_motion_accumulator = AOMMIN(
-            zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
-      }
-
-      // Break clause to detect very still sections after motion. For example,
-      // a static image after a fade or other transition.
-      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
-                                     last_loop_decay_rate)) {
-        allow_alt_ref = 0;
-        break;
-      }
-    }
-
-    // Calculate a boost number for this frame.
-    boost_score +=
-        decay_accumulator *
-        calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
-#if CONFIG_FIX_GF_LENGTH
-    // If almost totally static, we will not use the FIXED_GF_LENGTH later, so
-    // we can continue for more frames.
-    if (i >= (FIXED_GF_LENGTH + 1) &&
-        !is_almost_static(zero_motion_accumulator,
-                          twopass->kf_zeromotion_pct)) {
-      break;
-    }
-#else
-    // Break out conditions.
-    // Break at maximum of active_max_gf_interval unless almost totally static.
-    //
-    // Note that the addition of a test of rc->source_alt_ref_active is
-    // deliberate. The effect of this is that after a normal altref group even
-    // if the material is static there will be one normal length GF group
-    // before allowing longer GF groups. The reason for this is that in cases
-    // such as slide shows where slides are separated by a complex transition
-    // such as a fade, the arf group spanning the transition may not be coded
-    // at a very high quality and hence this frame (with its overlay) is a
-    // poor golden frame to use for an extended group.
-    if ((i >= (active_max_gf_interval + arf_active_or_kf) &&
-         ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) ||
-        (
-            // Don't break out with a very short interval.
-            (i >= active_min_gf_interval + arf_active_or_kf) &&
-            (!flash_detected) &&
-            ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
-             (abs_mv_in_out_accumulator > 3.0) ||
-             (mv_in_out_accumulator < -2.0) ||
-             ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
-      // If GF group interval is < 12, we force it to be 8. Otherwise,
-      // if it is >= 12, we keep it as is.
-      // NOTE: 'i' is 1 more than the GF group interval candidate that is being
-      //       checked.
-      if (i == (8 + 1) || i >= (12 + 1)) {
-        boost_score = old_boost_score;
-        break;
-      }
-    }
-    old_boost_score = boost_score;
-#endif  // CONFIG_FIX_GF_LENGTH
-    *this_frame = next_frame;
-  }
-  twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
-
-  // Was the group length constrained by the requirement for a new KF?
-  rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
-
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                             : cpi->common.MBs;
-  assert(num_mbs > 0);
-  if (i) avg_sr_coded_error /= i;
-
-  if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
-
-  // Disable extra altrefs and backward refs for "still" gf group:
-  //   zero_motion_accumulator: minimum percentage of (0,0) motion;
-  //   avg_sr_coded_error:      average of the SSE per pixel of each frame;
-  //   avg_raw_err_stdev:       average of the standard deviation of (0,0)
-  //                            motion error per block of each frame.
-  const int disable_bwd_extarf =
-      (zero_motion_accumulator > MIN_ZERO_MOTION &&
-       avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
-       avg_raw_err_stdev < MAX_RAW_ERR_VAR);
-
-  if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
-
-  const int use_alt_ref =
-      !is_almost_static(zero_motion_accumulator, twopass->kf_zeromotion_pct) &&
-      allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
-      (i >= rc->min_gf_interval);
-
-#define REDUCE_GF_LENGTH_THRESH 4
-#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
-#define REDUCE_GF_LENGTH_BY 1
-  int alt_offset = 0;
-#if REDUCE_LAST_GF_LENGTH
-  // The length reduction strategy is tweaked using AOM_Q mode, and doesn't work
-  // for VBR mode.
-  // Also, we don't have do adjustment for lossless mode.
-  const int allow_gf_length_reduction =
-      (cpi->oxcf.rc_mode == AOM_Q || cpi->extra_arf_allowed == 0) &&
-      !is_lossless_requested(&cpi->oxcf);
-
-  if (allow_gf_length_reduction && use_alt_ref) {
-    // adjust length of this gf group if one of the following condition met
-    // 1: only one overlay frame left and this gf is too long
-    // 2: next gf group is too short to have arf compared to the current gf
-
-    // maximum length of next gf group
-    const int next_gf_len = rc->frames_to_key - i;
-    const int single_overlay_left =
-        next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
-    // the next gf is probably going to have a ARF but it will be shorter than
-    // this gf
-    const int unbalanced_gf =
-        i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
-        next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
-        next_gf_len + 1 >= rc->min_gf_interval;
-
-    if (single_overlay_left || unbalanced_gf) {
-      // Note: Tried roll_back = DIVIDE_AND_ROUND(i, 8), but is does not work
-      // better in the current setting
-      const int roll_back = REDUCE_GF_LENGTH_BY;
-      alt_offset = -roll_back;
-      i -= roll_back;
-    }
-  }
-#endif  // REDUCE_LAST_GF_LENGTH
-
-  // Should we use the alternate reference frame.
-  if (use_alt_ref) {
-    // Calculate the boost for alt ref.
-    rc->gfu_boost =
-        calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost);
-    rc->source_alt_ref_pending = 1;
-
-    // do not replace ARFs with overlay frames, and keep it as GOLDEN_REF
-    cpi->preserve_arf_as_gld = 1;
-  } else {
-    rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
-    rc->source_alt_ref_pending = 0;
-    cpi->preserve_arf_as_gld = 0;
-  }
-
-  // Set the interval until the next gf.
-  // If forward keyframes are enabled, ensure the final gf group obeys the
-  // MIN_FWD_KF_INTERVAL.
-  if (cpi->oxcf.fwd_kf_enabled &&
-      ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) {
-    if (i == rc->frames_to_key) {
-      rc->baseline_gf_interval = i;
-      // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
-    } else if ((rc->frames_to_key - i <
-                AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
-               (rc->frames_to_key != i)) {
-      // if possible, merge the last two gf groups
-      if (rc->frames_to_key <= MAX_PYRAMID_SIZE) {
-        rc->baseline_gf_interval = rc->frames_to_key;
-        // if merging the last two gf groups creates a group that is too long,
-        // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
-      } else {
-        rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
-      }
-    } else {
-      rc->baseline_gf_interval = i - rc->source_alt_ref_pending;
-    }
-  } else {
-    rc->baseline_gf_interval = i - rc->source_alt_ref_pending;
-  }
-
-#if REDUCE_LAST_ALT_BOOST
-#define LAST_ALR_BOOST_FACTOR 0.2f
-  rc->arf_boost_factor = 1.0;
-  if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
-    // Reduce the boost of altref in the last gf group
-    if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
-        rc->frames_to_key - i == 0) {
-      rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
-    }
-  }
-#endif
-
-  if (!cpi->extra_arf_allowed) {
-    cpi->num_extra_arfs = 0;
-  } else {
-#if USE_SYMM_MULTI_LAYER
-    if (rc->baseline_gf_interval == 4 && rc->source_alt_ref_pending)
-      cpi->num_extra_arfs = 1;
-    else
-      cpi->num_extra_arfs = get_number_of_extra_arfs(
-          rc->baseline_gf_interval, rc->source_alt_ref_pending);
-#else
-    // Compute how many extra alt_refs we can have
-    cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
-                                                   rc->source_alt_ref_pending);
-#endif  // USE_SYMM_MULTI_LAYER
-  }
-
-#if !USE_SYMM_MULTI_LAYER
-  // Currently at maximum two extra ARFs' are allowed
-  assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
-#endif
-
-  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-
-  rc->bipred_group_interval = BFG_INTERVAL;
-  // The minimum bi-predictive frame group interval is 2.
-  if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0;
-
-  // Reset the file position.
-  reset_fpf_position(twopass, start_pos);
-
-  // Calculate the bits to be allocated to the gf/arf group as a whole
-  gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
-
-#if GROUP_ADAPTIVE_MAXQ
-  // Calculate an estimate of the maxq needed for the group.
-  // We are more agressive about correcting for sections
-  // where there could be significant overshoot than for easier
-  // sections where we do not wish to risk creating an overshoot
-  // of the allocated bit budget.
-  if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
-    const int vbr_group_bits_per_frame =
-        (int)(gf_group_bits / rc->baseline_gf_interval);
-    const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
-    const double group_av_skip_pct =
-        gf_group_skip_pct / rc->baseline_gf_interval;
-    const double group_av_inactive_zone =
-        ((gf_group_inactive_zone_rows * 2) /
-         (rc->baseline_gf_interval * (double)cm->mb_rows));
-
-    int tmp_q;
-    // rc factor is a weight factor that corrects for local rate control drift.
-    double rc_factor = 1.0;
-    if (rc->rate_error_estimate > 0) {
-      rc_factor = AOMMAX(RC_FACTOR_MIN,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
-    } else {
-      rc_factor = AOMMIN(RC_FACTOR_MAX,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
-    }
-    tmp_q = get_twopass_worst_quality(
-        cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
-        vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor);
-    twopass->active_worst_quality =
-        AOMMAX(tmp_q, twopass->active_worst_quality >> 1);
-  }
-#endif
-
-  // Calculate the extra bits to be used for boosted frame(s)
-  gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost,
-                                     gf_group_bits);
-
-  // Adjust KF group bits and error remaining.
-  twopass->kf_group_error_left -= (int64_t)gf_group_err;
-
-  // If this is an arf update we want to remove the score for the overlay
-  // frame at the end which will usually be very cheap to code.
-  // The overlay frame has already, in effect, been coded so we want to spread
-  // the remaining bits among the other frames.
-  // For normal GFs remove the score for the GF itself unless this is
-  // also a key frame in which case it has already been accounted for.
-  if (rc->source_alt_ref_pending) {
-    gf_group_error_left = gf_group_err - mod_frame_err;
-  } else if (is_key_frame == 0) {
-    gf_group_error_left = gf_group_err - gf_first_frame_err;
-  } else {
-    gf_group_error_left = gf_group_err;
-  }
-
-  // Allocate bits to each of the frames in the GF group.
-  allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits);
-
-  // Reset the file position.
-  reset_fpf_position(twopass, start_pos);
-
-  // Calculate a section intra ratio used in setting max loop filter.
-  if (cpi->common.current_frame.frame_type != KEY_FRAME) {
-    twopass->section_intra_rating = calculate_section_intra_ratio(
-        start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
-  }
-}
-
-// Threshold for use of the lagging second reference frame. High second ref
-// usage may point to a transient event like a flash or occlusion rather than
-// a real scene cut.
-#define SECOND_REF_USEAGE_THRESH 0.1
-// Minimum % intra coding observed in first pass (1.0 = 100%)
-#define MIN_INTRA_LEVEL 0.25
-// Minimum ratio between the % of intra coding and inter coding in the first
-// pass after discounting neutral blocks (discounting neutral blocks in this
-// way helps catch scene cuts in clips with very flat areas or letter box
-// format clips with image padding.
-#define INTRA_VS_INTER_THRESH 2.0
-// Hard threshold where the first pass chooses intra for almost all blocks.
-// In such a case even if the frame is not a scene cut coding a key frame
-// may be a good option.
-#define VERY_LOW_INTER_THRESH 0.05
-// Maximum threshold for the relative ratio of intra error score vs best
-// inter error score.
-#define KF_II_ERR_THRESHOLD 2.5
-// In real scene cuts there is almost always a sharp change in the intra
-// or inter error score.
-#define ERR_CHANGE_THRESHOLD 0.4
-// For real scene cuts we expect an improvment in the intra inter error
-// ratio in the next frame.
-#define II_IMPROVEMENT_THRESHOLD 3.5
-#define KF_II_MAX 128.0
-
-static int test_candidate_kf(TWO_PASS *twopass,
-                             const FIRSTPASS_STATS *last_frame,
-                             const FIRSTPASS_STATS *this_frame,
-                             const FIRSTPASS_STATS *next_frame) {
-  int is_viable_kf = 0;
-  double pcnt_intra = 1.0 - this_frame->pcnt_inter;
-  double modified_pcnt_inter =
-      this_frame->pcnt_inter - this_frame->pcnt_neutral;
-
-  // Does the frame satisfy the primary criteria of a key frame?
-  // See above for an explanation of the test criteria.
-  // If so, then examine how well it predicts subsequent frames.
-  if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
-      (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
-      ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
-       ((pcnt_intra > MIN_INTRA_LEVEL) &&
-        (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
-        ((this_frame->intra_error /
-          DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
-         KF_II_ERR_THRESHOLD) &&
-        ((fabs(last_frame->coded_error - this_frame->coded_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
-          ERR_CHANGE_THRESHOLD) ||
-         (fabs(last_frame->intra_error - this_frame->intra_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
-          ERR_CHANGE_THRESHOLD) ||
-         ((next_frame->intra_error /
-           DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
-          II_IMPROVEMENT_THRESHOLD))))) {
-    int i;
-    const FIRSTPASS_STATS *start_pos = twopass->stats_in;
-    FIRSTPASS_STATS local_next_frame = *next_frame;
-    double boost_score = 0.0;
-    double old_boost_score = 0.0;
-    double decay_accumulator = 1.0;
-
-    // Examine how well the key frame predicts subsequent frames.
-    for (i = 0; i < 16; ++i) {
-      double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
-                             DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
-
-      if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
-
-      // Cumulative effect of decay in prediction quality.
-      if (local_next_frame.pcnt_inter > 0.85)
-        decay_accumulator *= local_next_frame.pcnt_inter;
-      else
-        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
-
-      // Keep a running total.
-      boost_score += (decay_accumulator * next_iiratio);
-
-      // Test various breakout clauses.
-      if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
-          (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
-            0.20) &&
-           (next_iiratio < 3.0)) ||
-          ((boost_score - old_boost_score) < 3.0) ||
-          (local_next_frame.intra_error < 200)) {
-        break;
-      }
-
-      old_boost_score = boost_score;
-
-      // Get the next frame details
-      if (EOF == input_stats(twopass, &local_next_frame)) break;
-    }
-
-    // If there is tolerable prediction for at least the next 3 frames then
-    // break out else discard this potential key frame and move on
-    if (boost_score > 30.0 && (i > 3)) {
-      is_viable_kf = 1;
-    } else {
-      // Reset the file position
-      reset_fpf_position(twopass, start_pos);
-
-      is_viable_kf = 0;
-    }
-  }
-
-  return is_viable_kf;
-}
-
-#define FRAMES_TO_CHECK_DECAY 8
-
-static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  int i, j;
-  RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const FIRSTPASS_STATS first_frame = *this_frame;
-  const FIRSTPASS_STATS *const start_position = twopass->stats_in;
-  FIRSTPASS_STATS next_frame;
-  FIRSTPASS_STATS last_frame;
-  int kf_bits = 0;
-  int loop_decay_counter = 0;
-  double decay_accumulator = 1.0;
-  double av_decay_accumulator = 0.0;
-  double zero_motion_accumulator = 1.0;
-  double boost_score = 0.0;
-  double kf_mod_err = 0.0;
-  double kf_group_err = 0.0;
-  double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
-
-  av1_zero(next_frame);
-
-  cpi->common.current_frame.frame_type = KEY_FRAME;
-  rc->frames_since_key = 0;
-
-  // Reset the GF group data structures.
-  av1_zero(*gf_group);
-
-  // Is this a forced key frame by interval.
-  rc->this_key_frame_forced = rc->next_key_frame_forced;
-
-  // Clear the alt ref active flag and last group multi arf flags as they
-  // can never be set for a key frame.
-  rc->source_alt_ref_active = 0;
-
-  // KF is always a GF so clear frames till next gf counter.
-  rc->frames_till_gf_update_due = 0;
-
-  rc->frames_to_key = 1;
-
-  twopass->kf_group_bits = 0;        // Total bits available to kf group
-  twopass->kf_group_error_left = 0;  // Group modified error score.
-
-  kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
-
-  // Initialize the decay rates for the recent frames to check
-  for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
-
-  // Find the next keyframe.
-  i = 0;
-  while (twopass->stats_in < twopass->stats_in_end &&
-         rc->frames_to_key < cpi->oxcf.key_freq) {
-    // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
-
-    // Load the next frame's stats.
-    last_frame = *this_frame;
-    input_stats(twopass, this_frame);
-
-    // Provided that we are not at the end of the file...
-    if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
-      double loop_decay_rate;
-
-      // Check for a scene cut.
-      if (test_candidate_kf(twopass, &last_frame, this_frame,
-                            twopass->stats_in))
-        break;
-
-      // How fast is the prediction quality decaying?
-      loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
-
-      // We want to know something about the recent past... rather than
-      // as used elsewhere where we are concerned with decay in prediction
-      // quality since the last GF or KF.
-      recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
-      decay_accumulator = 1.0;
-      for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
-        decay_accumulator *= recent_loop_decay[j];
-
-      // Special check for transition or high motion followed by a
-      // static scene.
-      if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
-                                     loop_decay_rate, decay_accumulator))
-        break;
-
-      // Step on to the next frame.
-      ++rc->frames_to_key;
-
-      // If we don't have a real key frame within the next two
-      // key_freq intervals then break out of the loop.
-      if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break;
-    } else {
-      ++rc->frames_to_key;
-    }
-    ++i;
-  }
-
-  // If there is a max kf interval set by the user we must obey it.
-  // We already breakout of the loop above at 2x max.
-  // This code centers the extra kf if the actual natural interval
-  // is between 1x and 2x.
-  if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
-    FIRSTPASS_STATS tmp_frame = first_frame;
-
-    rc->frames_to_key /= 2;
-
-    // Reset to the start of the group.
-    reset_fpf_position(twopass, start_position);
-
-    kf_group_err = 0.0;
-
-    // Rescan to get the correct error data for the forced kf group.
-    for (i = 0; i < rc->frames_to_key; ++i) {
-      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
-      input_stats(twopass, &tmp_frame);
-    }
-    rc->next_key_frame_forced = 1;
-  } else if (twopass->stats_in == twopass->stats_in_end ||
-             rc->frames_to_key >= cpi->oxcf.key_freq) {
-    rc->next_key_frame_forced = 1;
-  } else {
-    rc->next_key_frame_forced = 0;
-  }
-
-  // Special case for the last key frame of the file.
-  if (twopass->stats_in >= twopass->stats_in_end) {
-    // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
-  }
-
-  // Calculate the number of bits that should be assigned to the kf group.
-  if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
-    // Maximum number of bits for a single normal frame (not key frame).
-    const int max_bits = frame_max_bits(rc, &cpi->oxcf);
-
-    // Maximum number of bits allocated to the key frame group.
-    int64_t max_grp_bits;
-
-    // Default allocation based on bits left and relative
-    // complexity of the section.
-    twopass->kf_group_bits = (int64_t)(
-        twopass->bits_left * (kf_group_err / twopass->modified_error_left));
-
-    // Clip based on maximum per frame rate defined by the user.
-    max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
-    if (twopass->kf_group_bits > max_grp_bits)
-      twopass->kf_group_bits = max_grp_bits;
-  } else {
-    twopass->kf_group_bits = 0;
-  }
-  twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
-
-  // Reset the first pass file position.
-  reset_fpf_position(twopass, start_position);
-
-  // Scan through the kf group collating various stats used to determine
-  // how many bits to spend on it.
-  decay_accumulator = 1.0;
-  boost_score = 0.0;
-  const double kf_max_boost =
-      cpi->oxcf.rc_mode == AOM_Q
-          ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
-                   KF_MAX_FRAME_BOOST)
-          : KF_MAX_FRAME_BOOST;
-  for (i = 0; i < (rc->frames_to_key - 1); ++i) {
-    if (EOF == input_stats(twopass, &next_frame)) break;
-
-    // Monitor for static sections.
-    // For the first frame in kf group, the second ref indicator is invalid.
-    if (i > 0) {
-      zero_motion_accumulator = AOMMIN(
-          zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
-    } else {
-      zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion;
-    }
-
-    // Not all frames in the group are necessarily used in calculating boost.
-    if ((i <= rc->max_gf_interval) ||
-        ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
-      const double frame_boost =
-          calc_frame_boost(cpi, this_frame, 0, kf_max_boost);
-
-      // How fast is prediction quality decaying.
-      if (!detect_flash(twopass, 0)) {
-        const double loop_decay_rate =
-            get_prediction_decay_rate(cpi, &next_frame);
-        decay_accumulator *= loop_decay_rate;
-        decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR);
-        av_decay_accumulator += decay_accumulator;
-        ++loop_decay_counter;
-      }
-      boost_score += (decay_accumulator * frame_boost);
-    }
-  }
-  if (loop_decay_counter > 0)
-    av_decay_accumulator /= (double)loop_decay_counter;
-
-  reset_fpf_position(twopass, start_position);
-
-  // Store the zero motion percentage
-  twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
-
-  // Calculate a section intra ratio used in setting max loop filter.
-  twopass->section_intra_rating = calculate_section_intra_ratio(
-      start_position, twopass->stats_in_end, rc->frames_to_key);
-
-  rc->kf_boost = (int)(av_decay_accumulator * boost_score);
-
-  // Special case for static / slide show content but don't apply
-  // if the kf group is very short.
-  if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) &&
-      (rc->frames_to_key > 8)) {
-    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST);
-  } else {
-    // Apply various clamps for min and max boost
-    rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
-    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
-  }
-
-  // Work out how many bits to allocate for the key frame itself.
-  kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
-                                 twopass->kf_group_bits);
-  // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
-  //        kf_bits, twopass->kf_zeromotion_pct);
-
-  // Work out the fraction of the kf group bits reserved for the inter frames
-  // within the group after discounting the bits for the kf itself.
-  if (twopass->kf_group_bits) {
-    twopass->kfgroup_inter_fraction =
-        (double)(twopass->kf_group_bits - kf_bits) /
-        (double)twopass->kf_group_bits;
-  } else {
-    twopass->kfgroup_inter_fraction = 1.0;
-  }
-
-  twopass->kf_group_bits -= kf_bits;
-
-  // Save the bits to spend on the key frame.
-  gf_group->bit_allocation[0] = kf_bits;
-  gf_group->update_type[0] = KF_UPDATE;
-  gf_group->rf_level[0] = KF_STD;
-
-  // Note the total error score of the kf group minus the key frame itself.
-  twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
-
-  // Adjust the count of total modified error left.
-  // The count of bits left is adjusted elsewhere based on real coded frame
-  // sizes.
-  twopass->modified_error_left -= kf_group_err;
-}
-
-void av1_configure_buffer_updates_firstpass(AV1_COMP *cpi,
-                                            FRAME_UPDATE_TYPE update_type) {
-  RATE_CONTROL *rc = &cpi->rc;
-
-  cpi->refresh_last_frame = 1;
-  cpi->refresh_golden_frame = 0;
-  cpi->refresh_bwd_ref_frame = 0;
-  cpi->refresh_alt2_ref_frame = 0;
-  cpi->refresh_alt_ref_frame = 0;
-
-  rc->is_bwd_ref_frame = 0;
-
-  switch (update_type) {
-    case ARF_UPDATE:
-      cpi->refresh_alt_ref_frame = 1;
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-
-      rc->is_src_frame_alt_ref = 0;
-      break;
-    case INTNL_ARF_UPDATE:
-      cpi->refresh_alt2_ref_frame = 1;
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-      rc->is_src_frame_alt_ref = 0;
-      rc->is_src_frame_ext_arf = 0;
-
-      break;
-    case BIPRED_UPDATE:
-      cpi->refresh_bwd_ref_frame = 1;
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      rc->is_bwd_ref_frame = 1;
-      break;
-    default: break;
-  }
-}
-
-static int is_skippable_frame(const AV1_COMP *cpi) {
-  // If the current frame does not have non-zero motion vector detected in the
-  // first  pass, and so do its previous and forward frames, then this frame
-  // can be skipped for partition check, and the partition size is assigned
-  // according to the variance
-  const TWO_PASS *const twopass = &cpi->twopass;
-
-  return (!frame_is_intra_only(&cpi->common) &&
-          twopass->stats_in - 2 > twopass->stats_in_start &&
-          twopass->stats_in < twopass->stats_in_end &&
-          (twopass->stats_in - 1)->pcnt_inter -
-                  (twopass->stats_in - 1)->pcnt_motion ==
-              1 &&
-          (twopass->stats_in - 2)->pcnt_inter -
-                  (twopass->stats_in - 2)->pcnt_motion ==
-              1 &&
-          twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
-}
-
-void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  CurrentFrame *const current_frame = &cm->current_frame;
-  RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-  int frames_left;
-  FIRSTPASS_STATS this_frame;
-
-  int target_rate;
-
-  frames_left = (int)(twopass->total_stats.count - current_frame->frame_number);
-
-  if (!twopass->stats_in) return;
-
-  // If this is an arf frame then we dont want to read the stats file or
-  // advance the input pointer as we already have what we need.
-  if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
-      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
-    av1_configure_buffer_updates(cpi);
-    target_rate = gf_group->bit_allocation[gf_group->index];
-    target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
-    rc->base_frame_target = target_rate;
-
-    if (cpi->no_show_kf) {
-      assert(gf_group->update_type[gf_group->index] == ARF_UPDATE);
-      current_frame->frame_type = KEY_FRAME;
-    } else {
-      current_frame->frame_type = INTER_FRAME;
-    }
-
-    // Do the firstpass stats indicate that this frame is skippable for the
-    // partition search?
-    if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
-      cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-    }
-
-    return;
-  }
-
-  aom_clear_system_state();
-
-  if (cpi->oxcf.rc_mode == AOM_Q) {
-    twopass->active_worst_quality = cpi->oxcf.cq_level;
-  } else if (current_frame->frame_number == 0) {
-    // Special case code for first frame.
-    const int section_target_bandwidth =
-        (int)(twopass->bits_left / frames_left);
-    const double section_length = twopass->total_left_stats.count;
-    const double section_error =
-        twopass->total_left_stats.coded_error / section_length;
-    const double section_intra_skip =
-        twopass->total_left_stats.intra_skip_pct / section_length;
-    const double section_inactive_zone =
-        (twopass->total_left_stats.inactive_zone_rows * 2) /
-        ((double)cm->mb_rows * section_length);
-    const int tmp_q = get_twopass_worst_quality(
-        cpi, section_error, section_intra_skip + section_inactive_zone,
-        section_target_bandwidth, DEFAULT_GRP_WEIGHT);
-
-    twopass->active_worst_quality = tmp_q;
-    twopass->baseline_active_worst_quality = tmp_q;
-    rc->ni_av_qi = tmp_q;
-    rc->last_q[INTER_FRAME] = tmp_q;
-    rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
-    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
-    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
-    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
-  }
-
-  av1_zero(this_frame);
-  if (EOF == input_stats(twopass, &this_frame)) return;
-
-  // Set the frame content type flag.
-  if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
-    twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
-  else
-    twopass->fr_content_type = FC_NORMAL;
-
-  // Keyframe and section processing.
-  if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
-    FIRSTPASS_STATS this_frame_copy;
-    this_frame_copy = this_frame;
-    // Define next KF group and assign bits to it.
-    find_next_key_frame(cpi, &this_frame);
-    this_frame = this_frame_copy;
-  } else {
-    current_frame->frame_type = INTER_FRAME;
-  }
-
-  // Define a new GF/ARF group. (Should always enter here for key frames).
-  if (rc->frames_till_gf_update_due == 0) {
-    define_gf_group(cpi, &this_frame);
-
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-
-#if ARF_STATS_OUTPUT
-    {
-      FILE *fpfile;
-      fpfile = fopen("arf.stt", "a");
-      ++arf_count;
-      fprintf(fpfile, "%10d %10d %10d %10d %10d\n", current_frame->frame_number,
-              rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
-              rc->gfu_boost);
-
-      fclose(fpfile);
-    }
-#endif
-  }
-
-  av1_configure_buffer_updates(cpi);
-
-  // Do the firstpass stats indicate that this frame is skippable for the
-  // partition search?
-  if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
-    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-  }
-
-  target_rate = gf_group->bit_allocation[gf_group->index];
-
-  if (cpi->common.current_frame.frame_type == KEY_FRAME)
-    target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate);
-  else
-    target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
-
-  rc->base_frame_target = target_rate;
-
-  {
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                            ? cpi->initial_mbs
-                            : cpi->common.MBs;
-    // The multiplication by 256 reverses a scaling factor of (>> 8)
-    // applied when combining MB error values for the frame.
-    twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0);
-    twopass->frame_avg_haar_energy =
-        log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0);
-  }
-
-  // Update the total stats remaining structure.
-  subtract_stats(&twopass->total_left_stats, &this_frame);
-}
-
-#define MINQ_ADJ_LIMIT 48
-#define MINQ_ADJ_LIMIT_CQ 20
-#define HIGH_UNDERSHOOT_RATIO 2
-void av1_twopass_postencode_update(AV1_COMP *cpi) {
-  TWO_PASS *const twopass = &cpi->twopass;
-  RATE_CONTROL *const rc = &cpi->rc;
-  const int bits_used = rc->base_frame_target;
-
-  // VBR correction is done through rc->vbr_bits_off_target. Based on the
-  // sign of this value, a limited % adjustment is made to the target rate
-  // of subsequent frames, to try and push it back towards 0. This method
-  // is designed to prevent extreme behaviour at the end of a clip
-  // or group of frames.
-  rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
-  twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0);
-
-  // Calculate the pct rc error.
-  if (rc->total_actual_bits) {
-    rc->rate_error_estimate =
-        (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
-    rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
-  } else {
-    rc->rate_error_estimate = 0;
-  }
-
-  if (cpi->common.current_frame.frame_type != KEY_FRAME) {
-    twopass->kf_group_bits -= bits_used;
-    twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
-  }
-  twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
-
-  // If the rate control is drifting consider adjustment to min or maxq.
-  if ((cpi->oxcf.rc_mode != AOM_Q) &&
-      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
-      !cpi->rc.is_src_frame_alt_ref) {
-    const int maxq_adj_limit =
-        rc->worst_quality - twopass->active_worst_quality;
-    const int minq_adj_limit =
-        (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
-
-    // Undershoot.
-    if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
-      --twopass->extend_maxq;
-      if (rc->rolling_target_bits >= rc->rolling_actual_bits)
-        ++twopass->extend_minq;
-      // Overshoot.
-    } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
-      --twopass->extend_minq;
-      if (rc->rolling_target_bits < rc->rolling_actual_bits)
-        ++twopass->extend_maxq;
-    } else {
-      // Adjustment for extreme local overshoot.
-      if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
-          rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
-        ++twopass->extend_maxq;
-
-      // Unwind undershoot or overshoot adjustment.
-      if (rc->rolling_target_bits < rc->rolling_actual_bits)
-        --twopass->extend_minq;
-      else if (rc->rolling_target_bits > rc->rolling_actual_bits)
-        --twopass->extend_maxq;
-    }
-
-    twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
-    twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
-
-    // If there is a big and undexpected undershoot then feed the extra
-    // bits back in quickly. One situation where this may happen is if a
-    // frame is unexpectedly almost perfectly predicted by the ARF or GF
-    // but not very well predcited by the previous frame.
-    if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
-      int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
-      if (rc->projected_frame_size < fast_extra_thresh) {
-        rc->vbr_bits_off_target_fast +=
-            fast_extra_thresh - rc->projected_frame_size;
-        rc->vbr_bits_off_target_fast =
-            AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
-
-        // Fast adaptation of minQ if necessary to use up the extra bits.
-        if (rc->avg_frame_bandwidth) {
-          twopass->extend_minq_fast =
-              (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
-        }
-        twopass->extend_minq_fast = AOMMIN(
-            twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
-      } else if (rc->vbr_bits_off_target_fast) {
-        twopass->extend_minq_fast = AOMMIN(
-            twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
-      } else {
-        twopass->extend_minq_fast = 0;
-      }
-    }
-  }
-}
diff --git a/libaom/av1/encoder/firstpass.h b/libaom/av1/encoder/firstpass.h
index 7c40615..1b8636c 100644
--- a/libaom/av1/encoder/firstpass.h
+++ b/libaom/av1/encoder/firstpass.h
@@ -21,35 +21,7 @@
 extern "C" {
 #endif
 
-#if CONFIG_FP_MB_STATS
-
-#define FPMB_DCINTRA_MASK 0x01
-
-#define FPMB_MOTION_ZERO_MASK 0x02
-#define FPMB_MOTION_LEFT_MASK 0x04
-#define FPMB_MOTION_RIGHT_MASK 0x08
-#define FPMB_MOTION_UP_MASK 0x10
-#define FPMB_MOTION_DOWN_MASK 0x20
-
-#define FPMB_ERROR_SMALL_MASK 0x40
-#define FPMB_ERROR_LARGE_MASK 0x80
-#define FPMB_ERROR_SMALL_TH 2000
-#define FPMB_ERROR_LARGE_TH 48000
-
-typedef struct {
-  uint8_t *mb_stats_start;
-  uint8_t *mb_stats_end;
-} FIRSTPASS_MB_STATS;
-#endif
-
-// Length of the bi-predictive frame group (BFG)
-// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
-//       number of bi-predictive frames.
-#define BFG_INTERVAL 2
-// The maximum number of extra ALTREF's except ALTREF_FRAME
-#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
-
-#define MIN_EXT_ARF_INTERVAL 4
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
 
 #define MIN_ZERO_MOTION 0.95
 #define MAX_SR_CODED_ERROR 40
@@ -59,73 +31,99 @@ typedef struct {
 #define VLOW_MOTION_THRESHOLD 950
 
 typedef struct {
+  // Frame number in display order, if stats are for a single frame.
+  // No real meaning for a collection of frames.
   double frame;
+  // Weight assigned to this frame (or total weight for the collection of
+  // frames) currently based on intra factor and brightness factor. This is used
+  // to distribute bits betweeen easier and harder frames.
   double weight;
+  // Intra prediction error.
   double intra_error;
+  // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
   double frame_avg_wavelet_energy;
+  // Best of intra pred error and inter pred error using last frame as ref.
   double coded_error;
+  // Best of intra pred error and inter pred error using golden frame as ref.
   double sr_coded_error;
+  // Percentage of blocks with inter pred error < intra pred error.
   double pcnt_inter;
+  // Percentage of blocks using (inter prediction and) non-zero motion vectors.
   double pcnt_motion;
+  // Percentage of blocks where golden frame was the best reference. That is:
+  // inter pred error using golden frame < inter pred error using last frame and
+  // inter pred error using golden frame < intra pred error
   double pcnt_second_ref;
+  // Percentage of blocks where intra and inter prediction errors were very
+  // close. Note that this is a 'weighted count', that is, the so blocks may be
+  // weighted by how close the two errors were.
   double pcnt_neutral;
+  // Percentage of blocks that have almost no intra error residual
+  // (i.e. are in effect completely flat and untextured in the intra
+  // domain). In natural videos this is uncommon, but it is much more
+  // common in animations, graphics and screen content, so may be used
+  // as a signal to detect these types of content.
   double intra_skip_pct;
-  double inactive_zone_rows;  // Image mask rows top and bottom.
-  double inactive_zone_cols;  // Image mask columns at left and right edges.
+  // Image mask rows top and bottom.
+  double inactive_zone_rows;
+  // Image mask columns at left and right edges.
+  double inactive_zone_cols;
+  // Average of row motion vectors.
   double MVr;
+  // Mean of absolute value of row motion vectors.
   double mvr_abs;
+  // Mean of column motion vectors.
   double MVc;
+  // Mean of absolute value of column motion vectors.
   double mvc_abs;
+  // Variance of row motion vectors.
   double MVrv;
+  // Variance of column motion vectors.
   double MVcv;
+  // Value in range [-1,1] indicating fraction of row and column motion vectors
+  // that point inwards (negative MV value) or outwards (positive MV value).
+  // For example, value of 1 indicates, all row/column MVs are inwards.
   double mv_in_out_count;
+  // Count of unique non-zero motion vectors.
   double new_mv_count;
+  // Duration of the frame / collection of frames.
   double duration;
+  // 1.0 if stats are for a single frame, OR
+  // Number of frames in this collection for which the stats are accumulated.
   double count;
   // standard deviation for (0, 0) motion prediction error
   double raw_error_stdev;
 } FIRSTPASS_STATS;
 
-typedef enum {
-  KF_UPDATE = 0,
-  LF_UPDATE = 1,
-  GF_UPDATE = 2,
-  ARF_UPDATE = 3,
-  OVERLAY_UPDATE = 4,
-  BRF_UPDATE = 5,            // Backward Reference Frame
-  LAST_BIPRED_UPDATE = 6,    // Last Bi-predictive Frame
-  BIPRED_UPDATE = 7,         // Bi-predictive Frame, but not the last one
-  INTNL_OVERLAY_UPDATE = 8,  // Internal Overlay Frame
-  INTNL_ARF_UPDATE = 9,      // Internal Altref Frame (candidate for ALTREF2)
-  FRAME_UPDATE_TYPES = 10
-} FRAME_UPDATE_TYPE;
+enum {
+  KF_UPDATE,
+  LF_UPDATE,
+  GF_UPDATE,
+  ARF_UPDATE,
+  OVERLAY_UPDATE,
+  INTNL_OVERLAY_UPDATE,  // Internal Overlay Frame
+  INTNL_ARF_UPDATE,      // Internal Altref Frame
+  FRAME_UPDATE_TYPES
+} UENUM1BYTE(FRAME_UPDATE_TYPE);
 
 #define FC_ANIMATION_THRESH 0.15
-typedef enum {
+enum {
   FC_NORMAL = 0,
   FC_GRAPHICS_ANIMATION = 1,
   FRAME_CONTENT_TYPES = 2
-} FRAME_CONTENT_TYPE;
+} UENUM1BYTE(FRAME_CONTENT_TYPE);
 
 typedef struct {
   unsigned char index;
-  RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 1];
   FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1];
   unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1];
   unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
-  unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
-#if USE_SYMM_MULTI_LAYER
   unsigned char arf_pos_in_gf[MAX_STATIC_GF_GROUP_LENGTH + 1];
   unsigned char pyramid_level[MAX_STATIC_GF_GROUP_LENGTH + 1];
   unsigned char pyramid_height;
   unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL];
-#endif  // USE_SYMM_MULTI_LAYER
-  unsigned char brf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1];
-  unsigned char bidir_pred_enabled[MAX_STATIC_GF_GROUP_LENGTH + 1];
-  unsigned char ref_fb_idx_map[MAX_STATIC_GF_GROUP_LENGTH + 1][REF_FRAMES];
-  unsigned char refresh_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
-  unsigned char refresh_flag[MAX_STATIC_GF_GROUP_LENGTH + 1];
   int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1];
+  int size;
 } GF_GROUP;
 
 typedef struct {
@@ -144,11 +142,6 @@ typedef struct {
   double mb_av_energy;
   double frame_avg_haar_energy;
 
-#if CONFIG_FP_MB_STATS
-  uint8_t *frame_mb_stats_buf;
-  uint8_t *this_frame_mb_stats;
-  FIRSTPASS_MB_STATS firstpass_mb_stats;
-#endif
   // An indication of the content type of the current frame
   FRAME_CONTENT_TYPE fr_content_type;
 
@@ -165,7 +158,6 @@ typedef struct {
 
   int kf_zeromotion_pct;
   int last_kfgroup_zeromotion_pct;
-  int gf_zeromotion_pct;
   int active_worst_quality;
   int baseline_active_worst_quality;
   int extend_minq;
@@ -176,30 +168,15 @@ typedef struct {
 } TWO_PASS;
 
 struct AV1_COMP;
+struct EncodeFrameParams;
+struct AV1EncoderConfig;
 
 void av1_init_first_pass(struct AV1_COMP *cpi);
 void av1_rc_get_first_pass_params(struct AV1_COMP *cpi);
-void av1_first_pass(struct AV1_COMP *cpi, const struct lookahead_entry *source);
+void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
 void av1_end_first_pass(struct AV1_COMP *cpi);
 
-void av1_init_second_pass(struct AV1_COMP *cpi);
-void av1_rc_get_second_pass_params(struct AV1_COMP *cpi);
-void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi,
-                                            FRAME_UPDATE_TYPE update_type);
-
-// Post encode update of the rate control parameters for 2-pass
-void av1_twopass_postencode_update(struct AV1_COMP *cpi);
-
-static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
-  if (arf_pending && MAX_EXT_ARFS > 0)
-    return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)
-               ? MAX_EXT_ARFS
-               : interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS
-                     ? MAX_EXT_ARFS - 1
-                     : 0;
-  else
-    return 0;
-}
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/global_motion.c b/libaom/av1/encoder/global_motion.c
index e35a208..b8b13c3 100644
--- a/libaom/av1/encoder/global_motion.c
+++ b/libaom/av1/encoder/global_motion.c
@@ -32,17 +32,24 @@
 #define MIN_INLIER_PROB 0.1
 
 #define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR)
-#define USE_GM_FEATURE_BASED 1
 
 // Border over which to compute the global motion
 #define ERRORADV_BORDER 0
 
 // Number of pyramid levels in disflow computation
-#define N_LEVELS 5
+#define N_LEVELS 2
 // Size of square patches in the disflow dense grid
-#define PATCH_SIZE 5
+#define PATCH_SIZE 8
+// Center point of square patch
+#define PATCH_CENTER ((PATCH_SIZE + 1) >> 1)
+// Step size between patches, lower value means greater patch overlap
+#define PATCH_STEP 1
 // Minimum size of border padding for disflow
 #define MIN_PAD 7
+// Warp error convergence threshold for disflow
+#define DISFLOW_ERROR_TR 0.01
+// Max number of iterations if warp convergence is not found
+#define DISFLOW_MAX_ITR 10
 
 // Struct for an image pyramid
 typedef struct {
@@ -104,7 +111,7 @@ static void convert_to_params(const double *params, int32_t *model) {
 void av1_convert_model_to_params(const double *params,
                                  WarpedMotionParams *model) {
   convert_to_params(params, model->wmmat);
-  model->wmtype = get_gmtype(model);
+  model->wmtype = get_wmtype(model);
   model->invalid = 0;
 }
 
@@ -237,7 +244,7 @@ int64_t av1_refine_integerized_param(WarpedMotionParams *wm,
     }
   }
   force_wmtype(wm, wmtype);
-  wm->wmtype = get_gmtype(wm);
+  wm->wmtype = get_wmtype(wm);
   return best_error;
 }
 
@@ -268,7 +275,6 @@ static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm,
   return buf_8bit;
 }
 
-#if USE_GM_FEATURE_BASED
 static int compute_global_motion_feature_based(
     TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
     int bit_depth, int *num_inliers_by_motion, double *params_by_motion,
@@ -323,7 +329,7 @@ static int compute_global_motion_feature_based(
   }
   return 0;
 }
-#else
+
 static INLINE RansacFuncDouble
 get_ransac_double_prec_type(TransformationType type) {
   switch (type) {
@@ -334,6 +340,35 @@ get_ransac_double_prec_type(TransformationType type) {
   }
 }
 
+// Don't use points around the frame border since they are less reliable
+static INLINE int valid_point(int x, int y, int width, int height) {
+  return (x > (PATCH_SIZE + PATCH_CENTER)) &&
+         (x < (width - PATCH_SIZE - PATCH_CENTER)) &&
+         (y > (PATCH_SIZE + PATCH_CENTER)) &&
+         (y < (height - PATCH_SIZE - PATCH_CENTER));
+}
+
+static int determine_disflow_correspondence(int *frm_corners,
+                                            int num_frm_corners, double *flow_u,
+                                            double *flow_v, int width,
+                                            int height, int stride,
+                                            double *correspondences) {
+  int num_correspondences = 0;
+  int x, y;
+  for (int i = 0; i < num_frm_corners; ++i) {
+    x = frm_corners[2 * i];
+    y = frm_corners[2 * i + 1];
+    if (valid_point(x, y, width, height)) {
+      correspondences[4 * num_correspondences] = x;
+      correspondences[4 * num_correspondences + 1] = y;
+      correspondences[4 * num_correspondences + 2] = x + flow_u[y * stride + x];
+      correspondences[4 * num_correspondences + 3] = y + flow_v[y * stride + x];
+      num_correspondences++;
+    }
+  }
+  return num_correspondences;
+}
+
 double getCubicValue(double p[4], double x) {
   return p[1] + 0.5 * x *
                     (p[2] - p[0] +
@@ -436,21 +471,24 @@ unsigned char interpolate(unsigned char *ref, double x, double y, int width,
 
 // Warps a block using flow vector [u, v] and computes the mse
 double compute_warp_and_error(unsigned char *ref, unsigned char *frm, int width,
-                              int height, int stride, double u, double v) {
+                              int height, int stride, int x, int y, double u,
+                              double v, int16_t *dt) {
   int i, j;
-  double warped, x, y;
+  unsigned char warped;
+  double x_w, y_w;
   double mse = 0;
-  double err = 0;
-  for (i = 0; i < height; ++i)
-    for (j = 0; j < width; ++j) {
-      x = (double)j - u;
-      y = (double)i - v;
-      warped = interpolate(ref, x, y, width, height, stride);
+  int16_t err = 0;
+  for (i = y; i < y + PATCH_SIZE; ++i)
+    for (j = x; j < x + PATCH_SIZE; ++j) {
+      x_w = (double)j + u;
+      y_w = (double)i + v;
+      warped = interpolate(ref, x_w, y_w, width, height, stride);
       err = warped - frm[j + i * stride];
       mse += err * err;
+      dt[(i - y) * PATCH_SIZE + (j - x)] = err;
     }
 
-  mse /= (width * height);
+  mse /= (PATCH_SIZE * PATCH_SIZE);
   return mse;
 }
 
@@ -465,19 +503,21 @@ double compute_warp_and_error(unsigned char *ref, unsigned char *frm, int width,
 // 2.)   b = |sum(dx * dt)|
 //           |sum(dy * dt)|
 // Where the sums are computed over a square window of PATCH_SIZE.
-static INLINE void compute_flow_system(const double *dx, const double *dy,
-                                       const double *dt, int stride, double *M,
-                                       double *b) {
+static INLINE void compute_flow_system(const double *dx, int dx_stride,
+                                       const double *dy, int dy_stride,
+                                       const int16_t *dt, int dt_stride,
+                                       double *M, double *b) {
   for (int i = 0; i < PATCH_SIZE; i++) {
     for (int j = 0; j < PATCH_SIZE; j++) {
-      M[0] += dx[i * stride + j] * dx[i * stride + j];
-      M[1] += dx[i * stride + j] * dy[i * stride + j];
-      M[3] += dy[i * stride + j] * dy[i * stride + j];
+      M[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j];
+      M[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j];
+      M[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j];
 
-      b[0] += dx[i * stride + j] * dt[i * stride + j];
-      b[1] += dy[i * stride + j] * dt[i * stride + j];
+      b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j];
+      b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j];
     }
   }
+
   M[2] = M[1];
 }
 
@@ -501,6 +541,7 @@ static INLINE void solve_2x2_system(const double *M, const double *b,
   output_vec[1] = -M[2] * mult_b0 + M_0 * mult_b1;
 }
 
+/*
 static INLINE void image_difference(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int16_t *dst, int dst_stride, int height,
@@ -515,6 +556,7 @@ static INLINE void image_difference(const uint8_t *src, int src_stride,
     }
   }
 }
+*/
 
 // Compute an image gradient using a sobel filter.
 // If dir == 1, compute the x gradient. If dir == 0, compute y. This function
@@ -523,7 +565,7 @@ static INLINE void image_difference(const uint8_t *src, int src_stride,
 static INLINE void sobel_xy_image_gradient(const uint8_t *src, int src_stride,
                                            double *dst, int dst_stride,
                                            int height, int width, int dir) {
-  double norm = 1.0 / 8;
+  double norm = 1.0;
   // TODO(sarahparker) experiment with doing this over larger block sizes
   const int block_unit = 8;
   // Filter in 8x8 blocks to eventually make use of optimized convolve function
@@ -606,6 +648,24 @@ static void compute_flow_pyramids(unsigned char *frm, const int frm_width,
                    frm_pyr->heights[0], frm_pyr->widths[0],
                    frm_pyr->strides[0]);
 
+  if (compute_grad) {
+    cur_width = frm_pyr->widths[0];
+    cur_height = frm_pyr->heights[0];
+    cur_stride = frm_pyr->strides[0];
+    cur_loc = frm_pyr->level_loc[0];
+    assert(frm_pyr->has_gradient && frm_pyr->level_dx_buffer != NULL &&
+           frm_pyr->level_dy_buffer != NULL);
+    // Computation x gradient
+    sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                            frm_pyr->level_dx_buffer + cur_loc, cur_stride,
+                            cur_height, cur_width, 1);
+
+    // Computation y gradient
+    sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                            frm_pyr->level_dy_buffer + cur_loc, cur_stride,
+                            cur_height, cur_width, 0);
+  }
+
   // Start at the finest level and resize down to the coarsest level
   for (int level = 1; level < n_levels; ++level) {
     update_level_dims(frm_pyr, level);
@@ -636,6 +696,86 @@ static void compute_flow_pyramids(unsigned char *frm, const int frm_width,
   }
 }
 
+static INLINE void compute_flow_at_point(unsigned char *frm, unsigned char *ref,
+                                         double *dx, double *dy, int x, int y,
+                                         int width, int height, int stride,
+                                         double *u, double *v) {
+  double M[4] = { 0 };
+  double b[2] = { 0 };
+  double tmp_output_vec[2] = { 0 };
+  double error = 0;
+  int16_t dt[PATCH_SIZE * PATCH_SIZE];
+  double o_u = *u;
+  double o_v = *v;
+
+  for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+    error = compute_warp_and_error(ref, frm, width, height, stride, x, y, *u,
+                                   *v, dt);
+    if (error <= DISFLOW_ERROR_TR) break;
+    compute_flow_system(dx, stride, dy, stride, dt, PATCH_SIZE, M, b);
+    solve_2x2_system(M, b, tmp_output_vec);
+    *u += tmp_output_vec[0];
+    *v += tmp_output_vec[1];
+  }
+  if (fabs(*u - o_u) > PATCH_SIZE || fabs(*v - o_u) > PATCH_SIZE) {
+    *u = o_u;
+    *v = o_v;
+  }
+}
+
+// make sure flow_u and flow_v start at 0
+static void compute_flow_field(ImagePyramid *frm_pyr, ImagePyramid *ref_pyr,
+                               double *flow_u, double *flow_v) {
+  int cur_width, cur_height, cur_stride, cur_loc, patch_loc, patch_center;
+  double *u_upscale =
+      aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+  double *v_upscale =
+      aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+
+  assert(frm_pyr->n_levels == ref_pyr->n_levels);
+
+  // Compute flow field from coarsest to finest level of the pyramid
+  for (int level = frm_pyr->n_levels - 1; level >= 0; --level) {
+    cur_width = frm_pyr->widths[level];
+    cur_height = frm_pyr->heights[level];
+    cur_stride = frm_pyr->strides[level];
+    cur_loc = frm_pyr->level_loc[level];
+
+    for (int i = PATCH_SIZE; i < cur_height - PATCH_SIZE; i += PATCH_STEP) {
+      for (int j = PATCH_SIZE; j < cur_width - PATCH_SIZE; j += PATCH_STEP) {
+        patch_loc = i * cur_stride + j;
+        patch_center = patch_loc + PATCH_CENTER * cur_stride + PATCH_CENTER;
+        compute_flow_at_point(frm_pyr->level_buffer + cur_loc,
+                              ref_pyr->level_buffer + cur_loc,
+                              frm_pyr->level_dx_buffer + cur_loc + patch_loc,
+                              frm_pyr->level_dy_buffer + cur_loc + patch_loc, j,
+                              i, cur_width, cur_height, cur_stride,
+                              flow_u + patch_center, flow_v + patch_center);
+      }
+    }
+    // TODO(sarahparker) Replace this with upscale function in resize.c
+    if (level > 0) {
+      int h_upscale = frm_pyr->heights[level - 1];
+      int w_upscale = frm_pyr->widths[level - 1];
+      int s_upscale = frm_pyr->strides[level - 1];
+      for (int i = 0; i < h_upscale; ++i) {
+        for (int j = 0; j < w_upscale; ++j) {
+          u_upscale[j + i * s_upscale] =
+              flow_u[(int)(j >> 1) + (int)(i >> 1) * cur_stride];
+          v_upscale[j + i * s_upscale] =
+              flow_v[(int)(j >> 1) + (int)(i >> 1) * cur_stride];
+        }
+      }
+      memcpy(flow_u, u_upscale,
+             frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+      memcpy(flow_v, v_upscale,
+             frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+    }
+  }
+  aom_free(u_upscale);
+  aom_free(v_upscale);
+}
+
 static int compute_global_motion_disflow_based(
     TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
     int bit_depth, int *num_inliers_by_motion, double *params_by_motion,
@@ -647,6 +787,11 @@ static int compute_global_motion_disflow_based(
   const int ref_width = ref->y_width;
   const int ref_height = ref->y_height;
   const int pad_size = AOMMAX(PATCH_SIZE, MIN_PAD);
+  int num_frm_corners;
+  int num_correspondences;
+  double *correspondences;
+  int frm_corners[2 * MAX_CORNERS];
+  RansacFuncDouble ransac = get_ransac_double_prec_type(type);
   assert(frm_width == ref_width);
   assert(frm_height == ref_height);
 
@@ -683,29 +828,63 @@ static int compute_global_motion_disflow_based(
   compute_flow_pyramids(ref_buffer, ref_width, ref_height, ref->y_stride,
                         n_levels, pad_size, compute_gradient, ref_pyr);
 
-  // TODO(sarahparker) Implement the rest of DISFlow, currently only the image
-  // pyramid is implemented.
-  (void)num_inliers_by_motion;
-  (void)params_by_motion;
-  (void)num_motions;
-  (void)type;
+  double *flow_u =
+      aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+  double *flow_v =
+      aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+
+  memset(flow_u, 0,
+         frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+  memset(flow_v, 0,
+         frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+
+  compute_flow_field(frm_pyr, ref_pyr, flow_u, flow_v);
+
+  // compute interest points in images using FAST features
+  num_frm_corners = fast_corner_detect(frm_buffer, frm_width, frm_height,
+                                       frm->y_stride, frm_corners, MAX_CORNERS);
+  // find correspondences between the two images using the flow field
+  correspondences = aom_malloc(num_frm_corners * 4 * sizeof(*correspondences));
+  num_correspondences = determine_disflow_correspondence(
+      frm_corners, num_frm_corners, flow_u, flow_v, frm_width, frm_height,
+      frm_pyr->strides[0], correspondences);
+  ransac(correspondences, num_correspondences, num_inliers_by_motion,
+         params_by_motion, num_motions);
+
   free_pyramid(frm_pyr);
   free_pyramid(ref_pyr);
+  aom_free(correspondences);
+  aom_free(flow_u);
+  aom_free(flow_v);
+  // Set num_inliers = 0 for motions with too few inliers so they are ignored.
+  for (int i = 0; i < num_motions; ++i) {
+    if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) {
+      num_inliers_by_motion[i] = 0;
+    }
+  }
+
+  // Return true if any one of the motions has inliers.
+  for (int i = 0; i < num_motions; ++i) {
+    if (num_inliers_by_motion[i] > 0) return 1;
+  }
   return 0;
 }
-#endif
 
 int av1_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *frm,
                               YV12_BUFFER_CONFIG *ref, int bit_depth,
+                              GlobalMotionEstimationType gm_estimation_type,
                               int *num_inliers_by_motion,
                               double *params_by_motion, int num_motions) {
-#if USE_GM_FEATURE_BASED
-  return compute_global_motion_feature_based(type, frm, ref, bit_depth,
-                                             num_inliers_by_motion,
-                                             params_by_motion, num_motions);
-#else
-  return compute_global_motion_disflow_based(type, frm, ref, bit_depth,
-                                             num_inliers_by_motion,
-                                             params_by_motion, num_motions);
-#endif
+  switch (gm_estimation_type) {
+    case GLOBAL_MOTION_FEATURE_BASED:
+      return compute_global_motion_feature_based(type, frm, ref, bit_depth,
+                                                 num_inliers_by_motion,
+                                                 params_by_motion, num_motions);
+    case GLOBAL_MOTION_DISFLOW_BASED:
+      return compute_global_motion_disflow_based(type, frm, ref, bit_depth,
+                                                 num_inliers_by_motion,
+                                                 params_by_motion, num_motions);
+    default: assert(0 && "Unknown global motion estimation type");
+  }
+  return 0;
 }
diff --git a/libaom/av1/encoder/global_motion.h b/libaom/av1/encoder/global_motion.h
index 42cf221..2cfddad 100644
--- a/libaom/av1/encoder/global_motion.h
+++ b/libaom/av1/encoder/global_motion.h
@@ -22,6 +22,11 @@ extern "C" {
 
 #define RANSAC_NUM_MOTIONS 1
 
+typedef enum {
+  GLOBAL_MOTION_FEATURE_BASED,
+  GLOBAL_MOTION_DISFLOW_BASED,
+} GlobalMotionEstimationType;
+
 void av1_convert_model_to_params(const double *params,
                                  WarpedMotionParams *model);
 
@@ -56,6 +61,7 @@ int64_t av1_refine_integerized_param(WarpedMotionParams *wm,
 */
 int av1_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *frm,
                               YV12_BUFFER_CONFIG *ref, int bit_depth,
+                              GlobalMotionEstimationType gm_estimation_type,
                               int *num_inliers_by_motion,
                               double *params_by_motion, int num_motions);
 #ifdef __cplusplus
diff --git a/libaom/av1/encoder/gop_structure.c b/libaom/av1/encoder/gop_structure.c
new file mode 100644
index 0000000..73cb0ed
--- /dev/null
+++ b/libaom/av1/encoder/gop_structure.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/onyxc_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+
+// Set parameters for frames between 'start' and 'end' (excluding both).
+static void set_multi_layer_params(GF_GROUP *const gf_group, int start, int end,
+                                   int *frame_ind, int arf_ind, int level) {
+  assert(level >= MIN_PYRAMID_LVL);
+  const int num_frames_to_process = end - start - 1;
+  assert(num_frames_to_process >= 0);
+  if (num_frames_to_process == 0) return;
+
+  // Either we are at the last level of the pyramid, or we don't have enough
+  // frames between 'l' and 'r' to create one more level.
+  if (level == MIN_PYRAMID_LVL || num_frames_to_process < 3) {
+    // Leaf nodes.
+    while (++start < end) {
+      gf_group->update_type[*frame_ind] = LF_UPDATE;
+      gf_group->arf_src_offset[*frame_ind] = 0;
+      gf_group->arf_pos_in_gf[*frame_ind] = 0;
+      gf_group->arf_update_idx[*frame_ind] = arf_ind;
+      gf_group->pyramid_level[*frame_ind] = MIN_PYRAMID_LVL;
+      ++gf_group->pyramid_lvl_nodes[MIN_PYRAMID_LVL];
+      ++(*frame_ind);
+    }
+  } else {
+    const int m = (start + end) / 2;
+    const int arf_pos_in_gf = *frame_ind;
+
+    // Internal ARF.
+    gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+    gf_group->arf_src_offset[*frame_ind] = m - start - 1;
+    gf_group->arf_pos_in_gf[*frame_ind] = 0;
+    gf_group->arf_update_idx[*frame_ind] = 1;  // mark all internal ARF 1
+    gf_group->pyramid_level[*frame_ind] = level;
+    ++gf_group->pyramid_lvl_nodes[level];
+    ++(*frame_ind);
+
+    // Frames displayed before this internal ARF.
+    set_multi_layer_params(gf_group, start, m, frame_ind, 1, level - 1);
+
+    // Overlay for internal ARF.
+    gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+    gf_group->arf_src_offset[*frame_ind] = 0;
+    gf_group->arf_pos_in_gf[*frame_ind] = arf_pos_in_gf;  // For bit allocation.
+    gf_group->arf_update_idx[*frame_ind] = 1;
+    gf_group->pyramid_level[*frame_ind] = MIN_PYRAMID_LVL;
+    ++(*frame_ind);
+
+    // Frames displayed after this internal ARF.
+    set_multi_layer_params(gf_group, m, end, frame_ind, arf_ind, level - 1);
+  }
+}
+
+static int construct_multi_layer_gf_structure(
+    GF_GROUP *const gf_group, int gf_interval, int pyr_height,
+    FRAME_UPDATE_TYPE first_frame_update_type) {
+  gf_group->pyramid_height = pyr_height;
+  av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL);
+  int frame_index = 0;
+
+  // Keyframe / Overlay frame / Golden frame.
+  assert(gf_interval >= 1);
+  assert(first_frame_update_type == KF_UPDATE ||
+         first_frame_update_type == OVERLAY_UPDATE ||
+         first_frame_update_type == GF_UPDATE);
+  gf_group->update_type[frame_index] = first_frame_update_type;
+  gf_group->arf_src_offset[frame_index] = 0;
+  gf_group->arf_pos_in_gf[frame_index] = 0;
+  gf_group->arf_update_idx[frame_index] = 0;
+  gf_group->pyramid_level[frame_index] = MIN_PYRAMID_LVL;
+  ++frame_index;
+
+  // ALTREF.
+  const int use_altref = (gf_group->pyramid_height > 0);
+  if (use_altref) {
+    gf_group->update_type[frame_index] = ARF_UPDATE;
+    gf_group->arf_src_offset[frame_index] = gf_interval - 1;
+    gf_group->arf_pos_in_gf[frame_index] = 0;
+    gf_group->arf_update_idx[frame_index] = 0;
+    gf_group->pyramid_level[frame_index] = gf_group->pyramid_height;
+    ++frame_index;
+  }
+
+  // Rest of the frames.
+  const int next_height =
+      use_altref ? gf_group->pyramid_height - 1 : gf_group->pyramid_height;
+  assert(next_height >= MIN_PYRAMID_LVL);
+  set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0,
+                         next_height);
+  return frame_index;
+}
+
+#define CHECK_GF_PARAMETER 0
+#if CHECK_GF_PARAMETER
+void check_frame_params(GF_GROUP *const gf_group, int gf_interval) {
+  static const char *update_type_strings[FRAME_UPDATE_TYPES] = {
+    "KF_UPDATE",       "LF_UPDATE",      "GF_UPDATE",
+    "ARF_UPDATE",      "OVERLAY_UPDATE", "INTNL_OVERLAY_UPDATE",
+    "INTNL_ARF_UPDATE"
+  };
+  FILE *fid = fopen("GF_PARAMS.txt", "a");
+
+  fprintf(fid, "\ngf_interval = {%d}\n", gf_interval);
+  for (int i = 0; i <= gf_group->size; ++i) {
+    fprintf(fid, "#%2d : %s %d %d %d %d\n", i,
+            update_type_strings[gf_group->update_type[i]],
+            gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
+            gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
+  }
+
+  fprintf(fid, "number of nodes in each level: \n");
+  for (int i = 0; i < gf_group->pyramid_height; ++i) {
+    fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]);
+  }
+  fprintf(fid, "\n");
+  fclose(fid);
+}
+#endif  // CHECK_GF_PARAMETER
+
+static INLINE int max_pyramid_height_from_width(int pyramid_width) {
+  if (pyramid_width > 12) return 4;
+  if (pyramid_width > 6) return 3;
+  if (pyramid_width > 3) return 2;
+  if (pyramid_width > 1) return 1;
+  return 0;
+}
+
+static int get_pyramid_height(const AV1_COMP *const cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  assert(IMPLIES(cpi->oxcf.gf_max_pyr_height == MIN_PYRAMID_LVL,
+                 !rc->source_alt_ref_pending));  // define_gf_group() enforced.
+  if (!rc->source_alt_ref_pending) {
+    return MIN_PYRAMID_LVL;
+  }
+  assert(cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+  if (!cpi->internal_altref_allowed) {
+    assert(MIN_PYRAMID_LVL + 1 <= cpi->oxcf.gf_max_pyr_height);
+    return MIN_PYRAMID_LVL + 1;
+  }
+  return AOMMIN(max_pyramid_height_from_width(rc->baseline_gf_interval),
+                cpi->oxcf.gf_max_pyr_height);
+}
+
+void av1_gop_setup_structure(AV1_COMP *cpi,
+                             const EncodeFrameParams *const frame_params) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  const int key_frame = (frame_params->frame_type == KEY_FRAME);
+  const FRAME_UPDATE_TYPE first_frame_update_type =
+      key_frame ? KF_UPDATE
+                : rc->source_alt_ref_active ? OVERLAY_UPDATE : GF_UPDATE;
+  gf_group->size = construct_multi_layer_gf_structure(
+      gf_group, rc->baseline_gf_interval, get_pyramid_height(cpi),
+      first_frame_update_type);
+
+  // We need to configure the frame at the end of the sequence + 1 that
+  // will be the start frame for the next group. Otherwise prior to the
+  // call to av1_get_second_pass_params(), the data will be undefined.
+  gf_group->update_type[gf_group->size] =
+      (rc->source_alt_ref_pending) ? OVERLAY_UPDATE : GF_UPDATE;
+  gf_group->arf_update_idx[gf_group->size] = 0;
+  gf_group->arf_pos_in_gf[gf_group->size] = 0;
+
+#if CHECK_GF_PARAMETER
+  check_frame_params(gf_group, rc->baseline_gf_interval);
+#endif
+}
diff --git a/libaom/av1/encoder/gop_structure.h b/libaom/av1/encoder/gop_structure.h
new file mode 100644
index 0000000..d9d5ae7
--- /dev/null
+++ b/libaom/av1/encoder/gop_structure.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+#define AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+
+#include "av1/common/onyxc_int.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+// Set up the Group-Of-Pictures structure for this GF_GROUP.  This involves
+// deciding where to place the various FRAME_UPDATE_TYPEs in the group.  It does
+// this primarily by setting the contents of
+// cpi->twopass.gf_group.update_type[].
+void av1_gop_setup_structure(
+    struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_GOP_STRUCTURE_H_
diff --git a/libaom/av1/encoder/hash_motion.c b/libaom/av1/encoder/hash_motion.c
index e85a516..00915e5 100644
--- a/libaom/av1/encoder/hash_motion.c
+++ b/libaom/av1/encoder/hash_motion.c
@@ -147,7 +147,8 @@ static void hash_table_add_to_table(hash_table *p_hash_table,
   }
 }
 
-int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value) {
+int32_t av1_hash_table_count(const hash_table *p_hash_table,
+                             uint32_t hash_value) {
   if (p_hash_table->p_lookup_table[hash_value] == NULL) {
     return 0;
   } else {
@@ -392,8 +393,9 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
                               uint32_t *hash_value1, uint32_t *hash_value2,
                               int use_highbitdepth, MACROBLOCK *x) {
   uint32_t to_hash[4];
-  const int add_value = hash_block_size_to_index(block_size) << crc_bits;
+  int add_value = hash_block_size_to_index(block_size);
   assert(add_value >= 0);
+  add_value <<= crc_bits;
   const int crc_mask = (1 << crc_bits) - 1;
 
   // 2x2 subblock hash values in current CU
diff --git a/libaom/av1/encoder/hash_motion.h b/libaom/av1/encoder/hash_motion.h
index df3ec32..ed9bb6e 100644
--- a/libaom/av1/encoder/hash_motion.h
+++ b/libaom/av1/encoder/hash_motion.h
@@ -37,7 +37,8 @@ typedef struct _hash_table {
 void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x);
 void av1_hash_table_destroy(hash_table *p_hash_table);
 void av1_hash_table_create(hash_table *p_hash_table);
-int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value);
+int32_t av1_hash_table_count(const hash_table *p_hash_table,
+                             uint32_t hash_value);
 Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
                                      uint32_t hash_value);
 int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
diff --git a/libaom/av1/encoder/level.c b/libaom/av1/encoder/level.c
new file mode 100644
index 0000000..1668bdf
--- /dev/null
+++ b/libaom/av1/encoder/level.c
@@ -0,0 +1,647 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/system_state.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/level.h"
+
+#define UNDEFINED_LEVEL                                                 \
+  {                                                                     \
+    .level = SEQ_LEVEL_MAX, .max_picture_size = 0, .max_h_size = 0,     \
+    .max_v_size = 0, .max_display_rate = 0, .max_decode_rate = 0,       \
+    .max_header_rate = 0, .main_mbps = 0, .high_mbps = 0, .main_cr = 0, \
+    .high_cr = 0, .max_tiles = 0, .max_tile_cols = 0                    \
+  }
+
+static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = {
+  { .level = SEQ_LEVEL_2_0,
+    .max_picture_size = 147456,
+    .max_h_size = 2048,
+    .max_v_size = 1152,
+    .max_display_rate = 4423680L,
+    .max_decode_rate = 5529600L,
+    .max_header_rate = 150,
+    .main_mbps = 1.5,
+    .high_mbps = 0,
+    .main_cr = 2.0,
+    .high_cr = 0,
+    .max_tiles = 8,
+    .max_tile_cols = 4 },
+  { .level = SEQ_LEVEL_2_1,
+    .max_picture_size = 278784,
+    .max_h_size = 2816,
+    .max_v_size = 1584,
+    .max_display_rate = 8363520L,
+    .max_decode_rate = 10454400L,
+    .max_header_rate = 150,
+    .main_mbps = 3.0,
+    .high_mbps = 0,
+    .main_cr = 2.0,
+    .high_cr = 0,
+    .max_tiles = 8,
+    .max_tile_cols = 4 },
+  UNDEFINED_LEVEL,
+  UNDEFINED_LEVEL,
+  { .level = SEQ_LEVEL_3_0,
+    .max_picture_size = 665856,
+    .max_h_size = 4352,
+    .max_v_size = 2448,
+    .max_display_rate = 19975680L,
+    .max_decode_rate = 24969600L,
+    .max_header_rate = 150,
+    .main_mbps = 6.0,
+    .high_mbps = 0,
+    .main_cr = 2.0,
+    .high_cr = 0,
+    .max_tiles = 16,
+    .max_tile_cols = 6 },
+  { .level = SEQ_LEVEL_3_1,
+    .max_picture_size = 1065024,
+    .max_h_size = 5504,
+    .max_v_size = 3096,
+    .max_display_rate = 31950720L,
+    .max_decode_rate = 39938400L,
+    .max_header_rate = 150,
+    .main_mbps = 10.0,
+    .high_mbps = 0,
+    .main_cr = 2.0,
+    .high_cr = 0,
+    .max_tiles = 16,
+    .max_tile_cols = 6 },
+  UNDEFINED_LEVEL,
+  UNDEFINED_LEVEL,
+  { .level = SEQ_LEVEL_4_0,
+    .max_picture_size = 2359296,
+    .max_h_size = 6144,
+    .max_v_size = 3456,
+    .max_display_rate = 70778880L,
+    .max_decode_rate = 77856768L,
+    .max_header_rate = 300,
+    .main_mbps = 12.0,
+    .high_mbps = 30.0,
+    .main_cr = 4.0,
+    .high_cr = 4.0,
+    .max_tiles = 32,
+    .max_tile_cols = 8 },
+  { .level = SEQ_LEVEL_4_1,
+    .max_picture_size = 2359296,
+    .max_h_size = 6144,
+    .max_v_size = 3456,
+    .max_display_rate = 141557760L,
+    .max_decode_rate = 155713536L,
+    .max_header_rate = 300,
+    .main_mbps = 20.0,
+    .high_mbps = 50.0,
+    .main_cr = 4.0,
+    .high_cr = 4.0,
+    .max_tiles = 32,
+    .max_tile_cols = 8 },
+  UNDEFINED_LEVEL,
+  UNDEFINED_LEVEL,
+  { .level = SEQ_LEVEL_5_0,
+    .max_picture_size = 8912896,
+    .max_h_size = 8192,
+    .max_v_size = 4352,
+    .max_display_rate = 267386880L,
+    .max_decode_rate = 273715200L,
+    .max_header_rate = 300,
+    .main_mbps = 30.0,
+    .high_mbps = 100.0,
+    .main_cr = 6.0,
+    .high_cr = 4.0,
+    .max_tiles = 64,
+    .max_tile_cols = 8 },
+  { .level = SEQ_LEVEL_5_1,
+    .max_picture_size = 8912896,
+    .max_h_size = 8192,
+    .max_v_size = 4352,
+    .max_display_rate = 534773760L,
+    .max_decode_rate = 547430400L,
+    .max_header_rate = 300,
+    .main_mbps = 40.0,
+    .high_mbps = 160.0,
+    .main_cr = 8.0,
+    .high_cr = 4.0,
+    .max_tiles = 64,
+    .max_tile_cols = 8 },
+  { .level = SEQ_LEVEL_5_2,
+    .max_picture_size = 8912896,
+    .max_h_size = 8192,
+    .max_v_size = 4352,
+    .max_display_rate = 1069547520L,
+    .max_decode_rate = 1094860800L,
+    .max_header_rate = 300,
+    .main_mbps = 60.0,
+    .high_mbps = 240.0,
+    .main_cr = 8.0,
+    .high_cr = 4.0,
+    .max_tiles = 64,
+    .max_tile_cols = 8 },
+  { .level = SEQ_LEVEL_5_3,
+    .max_picture_size = 8912896,
+    .max_h_size = 8192,
+    .max_v_size = 4352,
+    .max_display_rate = 1069547520L,
+    .max_decode_rate = 1176502272L,
+    .max_header_rate = 300,
+    .main_mbps = 60.0,
+    .high_mbps = 240.0,
+    .main_cr = 8.0,
+    .high_cr = 4.0,
+    .max_tiles = 64,
+    .max_tile_cols = 8 },
+  { .level = SEQ_LEVEL_6_0,
+    .max_picture_size = 35651584,
+    .max_h_size = 16384,
+    .max_v_size = 8704,
+    .max_display_rate = 1069547520L,
+    .max_decode_rate = 1176502272L,
+    .max_header_rate = 300,
+    .main_mbps = 60.0,
+    .high_mbps = 240.0,
+    .main_cr = 8.0,
+    .high_cr = 4.0,
+    .max_tiles = 128,
+    .max_tile_cols = 16 },
+  { .level = SEQ_LEVEL_6_1,
+    .max_picture_size = 35651584,
+    .max_h_size = 16384,
+    .max_v_size = 8704,
+    .max_display_rate = 2139095040L,
+    .max_decode_rate = 2189721600L,
+    .max_header_rate = 300,
+    .main_mbps = 100.0,
+    .high_mbps = 480.0,
+    .main_cr = 8.0,
+    .high_cr = 4.0,
+    .max_tiles = 128,
+    .max_tile_cols = 16 },
+  { .level = SEQ_LEVEL_6_2,
+    .max_picture_size = 35651584,
+    .max_h_size = 16384,
+    .max_v_size = 8704,
+    .max_display_rate = 4278190080L,
+    .max_decode_rate = 4379443200L,
+    .max_header_rate = 300,
+    .main_mbps = 160.0,
+    .high_mbps = 800.0,
+    .main_cr = 8.0,
+    .high_cr = 4.0,
+    .max_tiles = 128,
+    .max_tile_cols = 16 },
+  { .level = SEQ_LEVEL_6_3,
+    .max_picture_size = 35651584,
+    .max_h_size = 16384,
+    .max_v_size = 8704,
+    .max_display_rate = 4278190080L,
+    .max_decode_rate = 4706009088L,
+    .max_header_rate = 300,
+    .main_mbps = 160.0,
+    .high_mbps = 800.0,
+    .main_cr = 8.0,
+    .high_cr = 4.0,
+    .max_tiles = 128,
+    .max_tile_cols = 16 },
+  UNDEFINED_LEVEL,
+  UNDEFINED_LEVEL,
+  UNDEFINED_LEVEL,
+  UNDEFINED_LEVEL,
+};
+
+typedef enum {
+  LUMA_PIC_SIZE_TOO_LARGE,
+  LUMA_PIC_H_SIZE_TOO_LARGE,
+  LUMA_PIC_V_SIZE_TOO_LARGE,
+  LUMA_PIC_H_SIZE_TOO_SMALL,
+  LUMA_PIC_V_SIZE_TOO_SMALL,
+  TOO_MANY_TILE_COLUMNS,
+  TOO_MANY_TILES,
+  TILE_RATE_TOO_HIGH,
+  TILE_TOO_LARGE,
+  SUPERRES_TILE_WIDTH_TOO_LARGE,
+  CROPPED_TILE_WIDTH_TOO_SMALL,
+  CROPPED_TILE_HEIGHT_TOO_SMALL,
+  TILE_WIDTH_INVALID,
+  FRAME_HEADER_RATE_TOO_HIGH,
+  DISPLAY_RATE_TOO_HIGH,
+  DECODE_RATE_TOO_HIGH,
+  CR_TOO_SMALL,
+
+  TARGET_LEVEL_FAIL_IDS,
+  TARGET_LEVEL_OK,
+} TARGET_LEVEL_FAIL_ID;
+
+static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
+  "The picture size is too large.",
+  "The picture width is too large.",
+  "The picture height is too large.",
+  "The picture width is too small.",
+  "The picture height is too small.",
+  "Too many tile columns are used.",
+  "Too many tiles are used.",
+  "The tile rate is too high.",
+  "The tile size is too large.",
+  "The superres tile width is too large.",
+  "The cropped tile width is less than 8.",
+  "The cropped tile height is less than 8.",
+  "The tile width is invalid.",
+  "The frame header rate is too high.",
+  "The display luma sample rate is too high.",
+  "The decoded luma sample rate is too high.",
+  "The compression ratio is too small.",
+};
+
+static double get_min_cr(const AV1LevelSpec *const level_spec, int tier,
+                         int is_still_picture, int64_t decoded_sample_rate) {
+  if (is_still_picture) return 0.8;
+  const double min_cr_basis = tier ? level_spec->high_cr : level_spec->main_cr;
+  const double speed_adj =
+      (double)decoded_sample_rate / level_spec->max_display_rate;
+  return AOMMAX(min_cr_basis * speed_adj, 0.8);
+}
+
+static TARGET_LEVEL_FAIL_ID check_level_constraints(
+    const AV1LevelSpec *const target_level_spec,
+    const AV1LevelSpec *const level_spec,
+    const AV1LevelStats *const level_stats, int tier, int is_still_picture) {
+  const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture,
+                                   level_spec->max_decode_rate);
+  TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK;
+
+  do {
+    if (level_spec->max_picture_size > target_level_spec->max_picture_size) {
+      fail_id = LUMA_PIC_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_h_size > target_level_spec->max_h_size) {
+      fail_id = LUMA_PIC_H_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_v_size > target_level_spec->max_v_size) {
+      fail_id = LUMA_PIC_V_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_tile_cols > target_level_spec->max_tile_cols) {
+      fail_id = TOO_MANY_TILE_COLUMNS;
+      break;
+    }
+
+    if (level_spec->max_tiles > target_level_spec->max_tiles) {
+      fail_id = TOO_MANY_TILES;
+      break;
+    }
+
+    if (level_spec->max_header_rate > target_level_spec->max_header_rate) {
+      fail_id = FRAME_HEADER_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (level_spec->max_display_rate > target_level_spec->max_display_rate) {
+      fail_id = DISPLAY_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) {
+      fail_id = DECODE_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (level_spec->max_tile_rate > target_level_spec->max_tiles * 120) {
+      fail_id = TILE_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (level_stats->max_tile_size > 4096 * 2304) {
+      fail_id = TILE_TOO_LARGE;
+      break;
+    }
+
+    if (level_stats->max_superres_tile_width > MAX_TILE_WIDTH) {
+      fail_id = SUPERRES_TILE_WIDTH_TOO_LARGE;
+      break;
+    }
+
+    if (level_stats->min_cropped_tile_width < 8) {
+      fail_id = CROPPED_TILE_WIDTH_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_cropped_tile_height < 8) {
+      fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_frame_width < 16) {
+      fail_id = LUMA_PIC_H_SIZE_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_frame_height < 16) {
+      fail_id = LUMA_PIC_V_SIZE_TOO_SMALL;
+      break;
+    }
+
+    if (!level_stats->tile_width_is_valid) {
+      fail_id = TILE_WIDTH_INVALID;
+      break;
+    }
+
+    if (level_stats->min_cr < min_cr) {
+      fail_id = CR_TOO_SMALL;
+      break;
+    }
+  } while (0);
+
+  return fail_id;
+}
+
+static INLINE int is_in_operating_point(int operating_point,
+                                        int temporal_layer_id,
+                                        int spatial_layer_id) {
+  if (!operating_point) return 1;
+
+  return ((operating_point >> temporal_layer_id) & 1) &&
+         ((operating_point >> (spatial_layer_id + 8)) & 1);
+}
+
+static void get_tile_stats(const AV1_COMP *const cpi, int *max_tile_size,
+                           int *max_superres_tile_width,
+                           int *min_cropped_tile_width,
+                           int *min_cropped_tile_height,
+                           int *tile_width_valid) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  const int superres_scale_denominator = cm->superres_scale_denominator;
+
+  *max_tile_size = 0;
+  *max_superres_tile_width = 0;
+  *min_cropped_tile_width = INT_MAX;
+  *min_cropped_tile_height = INT_MAX;
+  *tile_width_valid = 1;
+
+  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      const TileInfo *const tile_info =
+          &cpi->tile_data[tile_row * cm->tile_cols + tile_col].tile_info;
+      const int tile_width =
+          (tile_info->mi_col_end - tile_info->mi_col_start) * MI_SIZE;
+      const int tile_height =
+          (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+      const int tile_size = tile_width * tile_height;
+      *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+
+      const int supperres_tile_width =
+          tile_width * superres_scale_denominator / SCALE_NUMERATOR;
+      *max_superres_tile_width =
+          AOMMAX(*max_superres_tile_width, supperres_tile_width);
+
+      const int cropped_tile_width =
+          cm->width - tile_info->mi_col_start * MI_SIZE;
+      const int cropped_tile_height =
+          cm->height - tile_info->mi_row_start * MI_SIZE;
+      *min_cropped_tile_width =
+          AOMMIN(*min_cropped_tile_width, cropped_tile_width);
+      *min_cropped_tile_height =
+          AOMMIN(*min_cropped_tile_height, cropped_tile_height);
+
+      const int is_right_most_tile = tile_info->mi_col_end == cm->mi_cols;
+      if (!is_right_most_tile) {
+        if (av1_superres_scaled(cm))
+          *tile_width_valid &= tile_width >= 128;
+        else
+          *tile_width_valid &= tile_width >= 64;
+      }
+    }
+  }
+}
+
+static int store_frame_record(int64_t ts_start, int64_t ts_end, int pic_size,
+                              int frame_header_count, int tiles, int show_frame,
+                              int show_existing_frame,
+                              FrameWindowBuffer *const buffer) {
+  if (buffer->num < FRAME_WINDOW_SIZE) {
+    ++buffer->num;
+  } else {
+    buffer->start = (buffer->start + 1) % FRAME_WINDOW_SIZE;
+  }
+  const int new_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+  FrameRecord *const record = &buffer->buf[new_idx];
+  record->ts_start = ts_start;
+  record->ts_end = ts_end;
+  record->pic_size = pic_size;
+  record->frame_header_count = frame_header_count;
+  record->tiles = tiles;
+  record->show_frame = show_frame;
+  record->show_existing_frame = show_existing_frame;
+
+  return new_idx;
+}
+
+// Count the number of frames encoded in the last "duration" ticks, in display
+// time.
+static int count_frames(const FrameWindowBuffer *const buffer,
+                        int64_t duration) {
+  const int current_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+  // Assume current frame is shown frame.
+  assert(buffer->buf[current_idx].show_frame);
+
+  const int64_t current_time = buffer->buf[current_idx].ts_end;
+  const int64_t time_limit = AOMMAX(current_time - duration, 0);
+  int num_frames = 1;
+  int index = current_idx - 1;
+  for (int i = buffer->num - 2; i >= 0; --i, --index, ++num_frames) {
+    if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+    const FrameRecord *const record = &buffer->buf[index];
+    if (!record->show_frame) continue;
+    const int64_t ts_start = record->ts_start;
+    if (ts_start < time_limit) break;
+  }
+
+  return num_frames;
+}
+
+// Scan previously encoded frames and update level metrics accordingly.
+static void scan_past_frames(const FrameWindowBuffer *const buffer,
+                             int num_frames_to_scan,
+                             AV1LevelSpec *const level_spec) {
+  const int num_frames_in_buffer = buffer->num;
+  int index = (buffer->start + num_frames_in_buffer - 1) % FRAME_WINDOW_SIZE;
+  int frame_headers = 0;
+  int tiles = 0;
+  int64_t display_samples = 0;
+  int64_t decoded_samples = 0;
+  for (int i = 0; i < AOMMIN(num_frames_in_buffer, num_frames_to_scan); ++i) {
+    const FrameRecord *const record = &buffer->buf[index];
+    if (!record->show_existing_frame) {
+      frame_headers += record->frame_header_count;
+      decoded_samples += record->pic_size;
+    }
+    if (record->show_frame) {
+      display_samples += record->pic_size;
+    }
+    tiles += record->tiles;
+    --index;
+    if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+  }
+  level_spec->max_header_rate =
+      AOMMAX(level_spec->max_header_rate, frame_headers);
+  level_spec->max_display_rate =
+      AOMMAX(level_spec->max_display_rate, display_samples);
+  level_spec->max_decode_rate =
+      AOMMAX(level_spec->max_decode_rate, decoded_samples);
+  level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tiles);
+}
+
+void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
+                           int64_t ts_end) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int upscaled_width = cm->superres_upscaled_width;
+  const int width = cm->width;
+  const int height = cm->height;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  const int tiles = tile_cols * tile_rows;
+  const int luma_pic_size = upscaled_width * height;
+  const int frame_header_count = cpi->frame_header_count;
+  const int show_frame = cm->show_frame;
+  const int show_existing_frame = cm->show_existing_frame;
+
+  // Store info. of current frame into FrameWindowBuffer.
+  FrameWindowBuffer *const buffer = &cpi->frame_window_buffer;
+  store_frame_record(ts_start, ts_end, luma_pic_size, frame_header_count, tiles,
+                     show_frame, show_existing_frame, buffer);
+  // Count the number of frames encoded in the past 1 second.
+  const int encoded_frames_in_last_second =
+      show_frame ? count_frames(buffer, TICKS_PER_SEC) : 0;
+
+  int max_tile_size;
+  int min_cropped_tile_width;
+  int min_cropped_tile_height;
+  int max_superres_tile_width;
+  int tile_width_is_valid;
+  get_tile_stats(cpi, &max_tile_size, &max_superres_tile_width,
+                 &min_cropped_tile_width, &min_cropped_tile_height,
+                 &tile_width_is_valid);
+
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  const int pic_size_profile_factor =
+      profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36);
+  const size_t frame_compressed_size = (size > 129 ? size - 128 : 1);
+  const size_t frame_uncompressed_size =
+      (luma_pic_size * pic_size_profile_factor) >> 3;
+
+  aom_clear_system_state();
+  const double compression_ratio =
+      frame_uncompressed_size / (double)frame_compressed_size;
+  const double total_time_encoded =
+      (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+      (double)TICKS_PER_SEC;
+
+  const int temporal_layer_id = cm->temporal_layer_id;
+  const int spatial_layer_id = cm->spatial_layer_id;
+  const int is_still_picture = seq_params->still_picture;
+  // update level_stats
+  // TODO(kyslov@) fix the implementation according to buffer model
+  for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) {
+    if (!is_in_operating_point(seq_params->operating_point_idc[i],
+                               temporal_layer_id, spatial_layer_id)) {
+      continue;
+    }
+
+    AV1LevelInfo *const level_info = &cpi->level_info[i];
+    AV1LevelStats *const level_stats = &level_info->level_stats;
+
+    level_stats->max_tile_size =
+        AOMMAX(level_stats->max_tile_size, max_tile_size);
+    level_stats->max_superres_tile_width =
+        AOMMAX(level_stats->max_superres_tile_width, max_superres_tile_width);
+    level_stats->min_cropped_tile_width =
+        AOMMIN(level_stats->min_cropped_tile_width, min_cropped_tile_width);
+    level_stats->min_cropped_tile_height =
+        AOMMIN(level_stats->min_cropped_tile_height, min_cropped_tile_height);
+    level_stats->tile_width_is_valid &= tile_width_is_valid;
+    level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width);
+    level_stats->min_frame_height =
+        AOMMIN(level_stats->min_frame_height, height);
+    level_stats->total_compressed_size += frame_compressed_size;
+    if (show_frame) level_stats->total_time_encoded = total_time_encoded;
+    level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio);
+
+    // update level_spec
+    // TODO(kyslov@) update all spec fields
+    AV1LevelSpec *const level_spec = &level_info->level_spec;
+    level_spec->max_picture_size =
+        AOMMAX(level_spec->max_picture_size, luma_pic_size);
+    level_spec->max_h_size =
+        AOMMAX(level_spec->max_h_size, cm->superres_upscaled_width);
+    level_spec->max_v_size = AOMMAX(level_spec->max_v_size, height);
+    level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols);
+    level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles);
+
+    if (show_frame) {
+      scan_past_frames(buffer, encoded_frames_in_last_second, level_spec);
+    }
+
+    // Check whether target level is met.
+    const AV1_LEVEL target_seq_level_idx = cpi->target_seq_level_idx[i];
+    if (target_seq_level_idx < SEQ_LEVELS) {
+      const AV1LevelSpec *const target_level_spec =
+          av1_level_defs + target_seq_level_idx;
+      const int tier = seq_params->tier[i];
+      const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+          target_level_spec, level_spec, level_stats, tier, is_still_picture);
+      if (fail_id != TARGET_LEVEL_OK) {
+        const int target_level_major = 2 + (target_seq_level_idx >> 2);
+        const int target_level_minor = target_seq_level_idx & 3;
+        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                           "Failed to encode to the target level %d_%d. %s",
+                           target_level_major, target_level_minor,
+                           level_fail_messages[fail_id]);
+      }
+    }
+  }
+}
+
+aom_codec_err_t av1_get_seq_level_idx(const AV1_COMP *cpi, int *seq_level_idx) {
+  const SequenceHeader *const seq_params = &cpi->common.seq_params;
+  if (!cpi->keep_level_stats) {
+    for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+      seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+    }
+    return AOM_CODEC_OK;
+  }
+
+  const int is_still_picture = seq_params->still_picture;
+  for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+    seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+    const int tier = seq_params->tier[op];
+    const AV1LevelInfo *const level_info = &cpi->level_info[op];
+    const AV1LevelStats *const level_stats = &level_info->level_stats;
+    const AV1LevelSpec *const level_spec = &level_info->level_spec;
+    for (int level = 0; level < SEQ_LEVELS; ++level) {
+      const AV1LevelSpec *const target_level_spec = av1_level_defs + level;
+      const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+          target_level_spec, level_spec, level_stats, tier, is_still_picture);
+      if (fail_id == TARGET_LEVEL_OK) {
+        seq_level_idx[op] = level;
+        break;
+      }
+    }
+  }
+
+  return AOM_CODEC_OK;
+}
diff --git a/libaom/av1/encoder/level.h b/libaom/av1/encoder/level.h
new file mode 100644
index 0000000..9f1664d
--- /dev/null
+++ b/libaom/av1/encoder/level.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_LEVEL_H_
+#define AOM_AV1_ENCODER_LEVEL_H_
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+
+// AV1 Level Specifications
+typedef struct {
+  AV1_LEVEL level;
+  int max_picture_size;
+  int max_h_size;
+  int max_v_size;
+  int max_header_rate;
+  int max_tile_rate;
+  int max_tiles;
+  int max_tile_cols;
+  int64_t max_display_rate;
+  int64_t max_decode_rate;
+  double main_mbps;
+  double high_mbps;
+  double main_cr;
+  double high_cr;
+} AV1LevelSpec;
+
+typedef struct {
+  int64_t ts_start;
+  int64_t ts_end;
+  int pic_size;
+  int frame_header_count;
+  int tiles;
+  int show_frame;
+  int show_existing_frame;
+} FrameRecord;
+
+// Record frame info. in a rolling window.
+#define FRAME_WINDOW_SIZE 256
+typedef struct {
+  FrameRecord buf[FRAME_WINDOW_SIZE];
+  int num;    // Number of FrameRecord stored in the buffer.
+  int start;  // Buffer index of the first FrameRecord.
+} FrameWindowBuffer;
+
+// Used to keep track of AV1 Level Stats. Currently unimplemented.
+typedef struct {
+  uint64_t total_compressed_size;
+  int max_tile_size;
+  int max_superres_tile_width;
+  int min_cropped_tile_width;
+  int min_cropped_tile_height;
+  int tile_width_is_valid;
+  int min_frame_width;
+  int min_frame_height;
+  double total_time_encoded;
+  double min_cr;
+} AV1LevelStats;
+
+typedef struct {
+  AV1LevelStats level_stats;
+  AV1LevelSpec level_spec;
+} AV1LevelInfo;
+
+void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start,
+                           int64_t ts_end);
+
+// Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS].
+aom_codec_err_t av1_get_seq_level_idx(const struct AV1_COMP *cpi,
+                                      int *seq_level_idx);
+
+#endif  // AOM_AV1_ENCODER_LEVEL_H_
diff --git a/libaom/av1/encoder/lookahead.c b/libaom/av1/encoder/lookahead.c
index 1bf8ecb..f5298f7 100644
--- a/libaom/av1/encoder/lookahead.c
+++ b/libaom/av1/encoder/lookahead.c
@@ -43,7 +43,8 @@ void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
 
 struct lookahead_ctx *av1_lookahead_init(
     unsigned int width, unsigned int height, unsigned int subsampling_x,
-    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth) {
+    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+    const int border_in_pixels, int is_scale) {
   struct lookahead_ctx *ctx = NULL;
 
   // Clamp the lookahead queue depth
@@ -61,10 +62,19 @@ struct lookahead_ctx *av1_lookahead_init(
     ctx->buf = calloc(depth, sizeof(*ctx->buf));
     if (!ctx->buf) goto bail;
     for (i = 0; i < depth; i++)
-      if (aom_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x,
-                                 subsampling_y, use_highbitdepth,
-                                 AOM_BORDER_IN_PIXELS, legacy_byte_alignment))
-        goto bail;
+      if (is_scale) {
+        if (aom_alloc_frame_buffer(
+                &ctx->buf[i].img, width, height, subsampling_x, subsampling_y,
+                use_highbitdepth, border_in_pixels, legacy_byte_alignment))
+          goto bail;
+      } else {
+        aom_free_frame_buffer(&ctx->buf[i].img);
+        if (aom_realloc_lookahead_buffer(
+                &ctx->buf[i].img, width, height, subsampling_x, subsampling_y,
+                use_highbitdepth, AOM_ENC_LOOKAHEAD_BORDER,
+                legacy_byte_alignment, NULL, NULL, NULL))
+          goto bail;
+      }
   }
   return ctx;
 bail:
diff --git a/libaom/av1/encoder/lookahead.h b/libaom/av1/encoder/lookahead.h
index e55224c..3b2d94b 100644
--- a/libaom/av1/encoder/lookahead.h
+++ b/libaom/av1/encoder/lookahead.h
@@ -46,7 +46,8 @@ struct lookahead_ctx {
  */
 struct lookahead_ctx *av1_lookahead_init(
     unsigned int width, unsigned int height, unsigned int subsampling_x,
-    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth);
+    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+    const int border_in_pixels, int is_scale);
 
 /**\brief Destroys the lookahead stage
  */
diff --git a/libaom/av1/encoder/mbgraph.c b/libaom/av1/encoder/mbgraph.c
index cc50458..0cb6286 100644
--- a/libaom/av1/encoder/mbgraph.c
+++ b/libaom/av1/encoder/mbgraph.c
@@ -71,8 +71,8 @@ static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
   xd->mi[0]->mv[0] = x->best_mv;
   xd->mi[0]->ref_frame[1] = NONE_FRAME;
 
-  av1_build_inter_predictors_sby(&cpi->common, xd, mb_row, mb_col, NULL,
-                                 BLOCK_16X16);
+  av1_enc_build_inter_predictor(&cpi->common, xd, mb_row, mb_col, NULL,
+                                BLOCK_16X16, AOM_PLANE_Y, AOM_PLANE_Y);
 
   /* restore UMV window */
   x->mv_limits = tmp_mv_limits;
@@ -364,7 +364,7 @@ static void separate_arf_mbs(AV1_COMP *cpi) {
 void av1_update_mbgraph_stats(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   int i, n_frames = av1_lookahead_depth(cpi->lookahead);
-  YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  YV12_BUFFER_CONFIG *golden_ref = &get_ref_frame_buf(cm, GOLDEN_FRAME)->buf;
 
   assert(golden_ref != NULL);
 
diff --git a/libaom/av1/encoder/mcomp.c b/libaom/av1/encoder/mcomp.c
index 63b4947..f077a4e 100644
--- a/libaom/av1/encoder/mcomp.c
+++ b/libaom/av1/encoder/mcomp.c
@@ -19,6 +19,7 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
 
 #include "av1/common/common.h"
 #include "av1/common/mvref_common.h"
@@ -28,6 +29,7 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/mcomp.h"
+#include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
 
@@ -336,7 +338,7 @@ static unsigned int setup_center_error(
     int *mvcost[2], unsigned int *sse1, int *distortion) {
   unsigned int besterr;
   if (second_pred != NULL) {
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
       DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
       uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
       if (mask) {
@@ -641,7 +643,7 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                 int mask_stride, int invert_mask, int w, int h,
                                 unsigned int *sse, int subpel_search) {
   unsigned int besterr;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
     uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
     if (second_pred != NULL) {
@@ -899,7 +901,8 @@ unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x,
   unsigned int mse;
   unsigned int sse;
 
-  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
   mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
   mse += mv_err_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost, x->mv_cost_stack,
                      x->errorperbit);
@@ -1797,11 +1800,11 @@ static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x,
                               MV *mvp_full, int step_param, int sadpb,
                               int further_steps, int do_refine, int *cost_list,
                               const aom_variance_fn_ptr_t *fn_ptr,
-                              const MV *ref_mv) {
+                              const MV *ref_mv, const search_site_config *cfg) {
   MV temp_mv;
   int thissme, n, num00 = 0;
-  int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
-                                        step_param, sadpb, &n, fn_ptr, ref_mv);
+  int bestsme = cpi->diamond_search_sad(x, cfg, mvp_full, &temp_mv, step_param,
+                                        sadpb, &n, fn_ptr, ref_mv);
   if (bestsme < INT_MAX)
     bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
   x->best_mv.as_mv = temp_mv;
@@ -1816,9 +1819,9 @@ static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x,
     if (num00) {
       num00--;
     } else {
-      thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
-                                        step_param + n, sadpb, &num00, fn_ptr,
-                                        ref_mv);
+      thissme =
+          cpi->diamond_search_sad(x, cfg, mvp_full, &temp_mv, step_param + n,
+                                  sadpb, &num00, fn_ptr, ref_mv);
       if (thissme < INT_MAX)
         thissme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
 
@@ -2094,11 +2097,222 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
   return is_allowed;
 }
 
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+  int best_sad = INT_MAX;
+  int this_sad;
+  int d;
+  int center, offset = 0;
+  int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
+  for (d = 0; d <= bw; d += 16) {
+    this_sad = aom_vector_var(&ref[d], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      offset = d;
+    }
+  }
+  center = offset;
+
+  for (d = -8; d <= 8; d += 16) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -4; d <= 4; d += 8) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -2; d <= 2; d += 4) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -1; d <= 1; d += 2) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+
+  return (center - (bw >> 1));
+}
+
+static const MV search_pos[4] = {
+  { -1, 0 },
+  { 0, -1 },
+  { 0, 1 },
+  { 1, 0 },
+};
+
+unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
+                                           BLOCK_SIZE bsize, int mi_row,
+                                           int mi_col, const MV *ref_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mi = xd->mi[0];
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+  DECLARE_ALIGNED(16, int16_t, hbuf[256]);
+  DECLARE_ALIGNED(16, int16_t, vbuf[256]);
+  DECLARE_ALIGNED(16, int16_t, src_hbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, src_vbuf[128]);
+  int idx;
+  const int bw = 4 << mi_size_wide_log2[bsize];
+  const int bh = 4 << mi_size_high_log2[bsize];
+  const int search_width = bw << 1;
+  const int search_height = bh << 1;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  uint8_t const *ref_buf, *src_buf;
+  MV *tmp_mv = &xd->mi[0]->mv[0].as_mv;
+  unsigned int best_sad, tmp_sad, this_sad[4];
+  MV this_mv;
+  const int norm_factor = 3 + (bw >> 5);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
+  MvLimits subpel_mv_limits;
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+    av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
+                         MAX_MB_PLANE);
+  }
+
+  if (xd->bd != 8) {
+    unsigned int sad;
+    tmp_mv->row = 0;
+    tmp_mv->col = 0;
+    sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                 xd->plane[0].pre[0].buf, ref_stride);
+
+    if (scaled_ref_frame) {
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+    }
+    return sad;
+  }
+
+  // Set up prediction 1-D reference set
+  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
+  for (idx = 0; idx < search_width; idx += 16) {
+    aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    ref_buf += 16;
+  }
+
+  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
+  for (idx = 0; idx < search_height; ++idx) {
+    vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor;
+    ref_buf += ref_stride;
+  }
+
+  // Set up src 1-D reference set
+  for (idx = 0; idx < bw; idx += 16) {
+    src_buf = x->plane[0].src.buf + idx;
+    aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+  }
+
+  src_buf = x->plane[0].src.buf;
+  for (idx = 0; idx < bh; ++idx) {
+    src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor;
+    src_buf += src_stride;
+  }
+
+  // Find the best match per 1-D search
+  tmp_mv->col = vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]);
+  tmp_mv->row = vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]);
+
+  this_mv = *tmp_mv;
+  src_buf = x->plane[0].src.buf;
+  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+  best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+  {
+    const uint8_t *const pos[4] = {
+      ref_buf - ref_stride,
+      ref_buf - 1,
+      ref_buf + 1,
+      ref_buf + ref_stride,
+    };
+
+    cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+  }
+
+  for (idx = 0; idx < 4; ++idx) {
+    if (this_sad[idx] < best_sad) {
+      best_sad = this_sad[idx];
+      tmp_mv->row = search_pos[idx].row + this_mv.row;
+      tmp_mv->col = search_pos[idx].col + this_mv.col;
+    }
+  }
+
+  if (this_sad[0] < this_sad[3])
+    this_mv.row -= 1;
+  else
+    this_mv.row += 1;
+
+  if (this_sad[1] < this_sad[2])
+    this_mv.col -= 1;
+  else
+    this_mv.col += 1;
+
+  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+  tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+  if (best_sad > tmp_sad) {
+    *tmp_mv = this_mv;
+    best_sad = tmp_sad;
+  }
+
+  tmp_mv->row *= 8;
+  tmp_mv->col *= 8;
+
+  set_subpel_mv_search_range(
+      &x->mv_limits, &subpel_mv_limits.col_min, &subpel_mv_limits.col_max,
+      &subpel_mv_limits.row_min, &subpel_mv_limits.row_max, ref_mv);
+  clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max,
+           subpel_mv_limits.row_min, subpel_mv_limits.row_max);
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  return best_sad;
+}
+
 int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                           MV *mvp_full, int step_param, int method,
                           int run_mesh_search, int error_per_bit,
                           int *cost_list, const MV *ref_mv, int var_max, int rd,
-                          int x_pos, int y_pos, int intra) {
+                          int x_pos, int y_pos, int intra,
+                          const search_site_config *cfg) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
@@ -2138,7 +2352,7 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     case NSTEP:
       var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
                                MAX_MVSEARCH_STEPS - 1 - step_param, 1,
-                               cost_list, fn_ptr, ref_mv);
+                               cost_list, fn_ptr, ref_mv, cfg);
 
       // Should we allow a follow on exhaustive search?
       if (is_exhaustive_allowed(cpi, x)) {
@@ -2209,13 +2423,12 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 
         // for the hashMap
         hash_table *ref_frame_hash =
-            intra
-                ? &cpi->common.cur_frame->hash_table
-                : av1_get_ref_frame_hash_map(cpi, x->e_mbd.mi[0]->ref_frame[0]);
+            intra ? &cpi->common.cur_frame->hash_table
+                  : av1_get_ref_frame_hash_map(&cpi->common,
+                                               x->e_mbd.mi[0]->ref_frame[0]);
 
-        av1_get_block_hash_value(
-            what, what_stride, block_width, &hash_value1, &hash_value2,
-            x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, x);
+        av1_get_block_hash_value(what, what_stride, block_width, &hash_value1,
+                                 &hash_value2, is_cur_buf_hbd(&x->e_mbd), x);
 
         const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
         // for intra, at lest one matching can be found, itself.
@@ -2334,7 +2547,7 @@ static int upsampled_obmc_pref_error(
   unsigned int besterr;
 
   DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
     aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
                               subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
@@ -2676,14 +2889,15 @@ static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
                                    MV *mvp_full, int step_param, int sadpb,
                                    int further_steps, int do_refine,
                                    const aom_variance_fn_ptr_t *fn_ptr,
-                                   const MV *ref_mv, MV *dst_mv,
-                                   int is_second) {
+                                   const MV *ref_mv, MV *dst_mv, int is_second,
+                                   const search_site_config *cfg) {
+  (void)cpi;  // to silence compiler warning
   const int32_t *wsrc = x->wsrc_buf;
   const int32_t *mask = x->mask_buf;
   MV temp_mv;
   int thissme, n, num00 = 0;
   int bestsme =
-      obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full, &temp_mv,
+      obmc_diamond_search_sad(x, cfg, wsrc, mask, mvp_full, &temp_mv,
                               step_param, sadpb, &n, fn_ptr, ref_mv, is_second);
   if (bestsme < INT_MAX)
     bestsme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr, 1,
@@ -2700,9 +2914,9 @@ static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
     if (num00) {
       num00--;
     } else {
-      thissme = obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full,
-                                        &temp_mv, step_param + n, sadpb, &num00,
-                                        fn_ptr, ref_mv, is_second);
+      thissme = obmc_diamond_search_sad(x, cfg, wsrc, mask, mvp_full, &temp_mv,
+                                        step_param + n, sadpb, &num00, fn_ptr,
+                                        ref_mv, is_second);
       if (thissme < INT_MAX)
         thissme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr,
                                       1, is_second);
@@ -2738,11 +2952,12 @@ int av1_obmc_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
                                int step_param, int sadpb, int further_steps,
                                int do_refine,
                                const aom_variance_fn_ptr_t *fn_ptr,
-                               const MV *ref_mv, MV *dst_mv, int is_second) {
+                               const MV *ref_mv, MV *dst_mv, int is_second,
+                               const search_site_config *cfg) {
   if (cpi->sf.obmc_full_pixel_search_level == 0) {
     return obmc_full_pixel_diamond(cpi, x, mvp_full, step_param, sadpb,
                                    further_steps, do_refine, fn_ptr, ref_mv,
-                                   dst_mv, is_second);
+                                   dst_mv, is_second, cfg);
   } else {
     const int32_t *wsrc = x->wsrc_buf;
     const int32_t *mask = x->mask_buf;
@@ -2851,3 +3066,119 @@ int av1_return_min_sub_pixel_mv(
   lower_mv_precision(bestmv, allow_hp, 0);
   return besterr;
 }
+
+void av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, int ref,
+                              MV ref_mv_full, int num_planes,
+                              int use_subpixel) {
+  assert(num_planes == 1 &&
+         "Currently simple_motion_search only supports luma plane");
+  assert(!frame_is_intra_only(&cpi->common) &&
+         "Simple motion search only enabled for non-key frames");
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->sb_type = bsize;
+  mbmi->ref_frame[0] = ref;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref);
+  struct buf_2d backup_yv12;
+  // ref_mv is used to code the motion vector. ref_mv_full is the initial point.
+  // ref_mv is in units of 1/8 pel whereas ref_mv_full is in units of pel.
+  MV ref_mv = { 0, 0 };
+  const int step_param = cpi->mv_step_param;
+  const MvLimits tmp_mv_limits = x->mv_limits;
+  const SEARCH_METHODS search_methods = NSTEP;
+  const int do_mesh_search = 0;
+  const int sadpb = x->sadperbit16;
+  int cost_list[5];
+  const int ref_idx = 0;
+  int var;
+
+  av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
+                       get_ref_scale_factors(cm, ref), num_planes);
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  if (scaled_ref_frame) {
+    backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
+    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+                         num_planes);
+  }
+
+  // This overwrites the mv_limits so we will need to restore it later.
+  av1_set_mv_search_range(&x->mv_limits, &ref_mv);
+  var = av1_full_pixel_search(
+      cpi, x, bsize, &ref_mv_full, step_param, search_methods, do_mesh_search,
+      sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
+      mi_col * MI_SIZE, mi_row * MI_SIZE, 0, &cpi->ss_cfg[SS_CFG_SRC]);
+  // Restore
+  x->mv_limits = tmp_mv_limits;
+
+  const int use_subpel_search =
+      var < INT_MAX && !cpi->common.cur_frame_force_integer_mv && use_subpixel;
+  if (scaled_ref_frame) {
+    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+  }
+  if (use_subpel_search) {
+    int not_used = 0;
+    if (cpi->sf.use_accurate_subpel_search) {
+      const int pw = block_size_wide[bsize];
+      const int ph = block_size_high[bsize];
+      cpi->find_fractional_mv_step(
+          x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+          x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+          cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+          x->nmv_vec_cost, x->mv_cost_stack, &not_used, &x->pred_sse[ref], NULL,
+          NULL, 0, 0, pw, ph, cpi->sf.use_accurate_subpel_search, 1);
+    } else {
+      cpi->find_fractional_mv_step(
+          x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+          x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+          cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+          x->nmv_vec_cost, x->mv_cost_stack, &not_used, &x->pred_sse[ref], NULL,
+          NULL, 0, 0, 0, 0, 0, 1);
+    }
+  } else {
+    // Manually convert from units of pixel to 1/8-pixels if we are not doing
+    // subpel search
+    x->best_mv.as_mv.row *= 8;
+    x->best_mv.as_mv.col *= 8;
+  }
+
+  mbmi->mv[0].as_mv = x->best_mv.as_mv;
+
+  // Get a copy of the prediction output
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
+
+  aom_clear_system_state();
+
+  if (scaled_ref_frame) {
+    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+  }
+}
+
+void av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                               int mi_col, BLOCK_SIZE bsize,
+                               const MV ref_mv_full, int use_subpixel,
+                               unsigned int *sse, unsigned int *var) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const MV_REFERENCE_FRAME ref =
+      cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+
+  av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, ref_mv_full, 1,
+                           use_subpixel);
+
+  const uint8_t *src = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *dst = xd->plane[0].dst.buf;
+  const int dst_stride = xd->plane[0].dst.stride;
+
+  *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
+}
diff --git a/libaom/av1/encoder/mcomp.h b/libaom/av1/encoder/mcomp.h
index 3f8b3b1..71547da 100644
--- a/libaom/av1/encoder/mcomp.h
+++ b/libaom/av1/encoder/mcomp.h
@@ -13,6 +13,7 @@
 #define AOM_AV1_ENCODER_MCOMP_H_
 
 #include "av1/encoder/block.h"
+
 #include "aom_dsp/variance.h"
 
 #ifdef __cplusplus
@@ -83,6 +84,11 @@ int av1_refining_search_sad(struct macroblock *x, MV *ref_mv, int sad_per_bit,
                             int distance, const aom_variance_fn_ptr_t *fn_ptr,
                             const MV *center_mv);
 
+unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi,
+                                           MACROBLOCK *x, BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col,
+                                           const MV *ref_mv);
+
 // Runs sequence of diamond searches in smaller steps for RD.
 int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param, int sadpb,
@@ -132,13 +138,15 @@ int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full, int step_param,
                           int method, int run_mesh_search, int error_per_bit,
                           int *cost_list, const MV *ref_mv, int var_max, int rd,
-                          int x_pos, int y_pos, int intra);
+                          int x_pos, int y_pos, int intra,
+                          const search_site_config *cfg);
 
 int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
                                MV *mvp_full, int step_param, int sadpb,
                                int further_steps, int do_refine,
                                const aom_variance_fn_ptr_t *fn_ptr,
-                               const MV *ref_mv, MV *dst_mv, int is_second);
+                               const MV *ref_mv, MV *dst_mv, int is_second,
+                               const search_site_config *cfg);
 int av1_find_best_obmc_sub_pixel_tree_up(
     MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
     MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
@@ -154,6 +162,19 @@ unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
                                   int mi_row, int mi_col, int *pts0,
                                   int *pts_inref0, int total_samples);
 
+// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
+// ref. Note that this sets the offset of mbmi, so we will need to reset it
+// after calling this function.
+void av1_simple_motion_search(struct AV1_COMP *const cpi, MACROBLOCK *x,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize, int ref,
+                              MV ref_mv_full, int num_planes, int use_subpixel);
+
+// Performs a simple motion search to calculate the sse and var of the residue
+void av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                               int mi_col, BLOCK_SIZE bsize,
+                               const MV ref_mv_full, int use_subpixel,
+                               unsigned int *sse, unsigned int *var);
+
 static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) {
   for (int z = 0; z < 3; z++) {
     fractional_best_mv[z].as_int = INVALID_MV;
diff --git a/libaom/av1/encoder/mips/msa/temporal_filter_msa.c b/libaom/av1/encoder/mips/msa/temporal_filter_msa.c
index 531ae09..effa75b 100644
--- a/libaom/av1/encoder/mips/msa/temporal_filter_msa.c
+++ b/libaom/av1/encoder/mips/msa/temporal_filter_msa.c
@@ -267,6 +267,7 @@ static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
   }
 }
 
+// TODO(yunqing) The following optimization is not used since c code changes.
 void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
                                    uint8_t *frame2_ptr, uint32_t blk_w,
                                    uint32_t blk_h, int32_t strength,
diff --git a/libaom/av1/encoder/ml.c b/libaom/av1/encoder/ml.c
index ad664ac..579900a 100644
--- a/libaom/av1/encoder/ml.c
+++ b/libaom/av1/encoder/ml.c
@@ -65,7 +65,9 @@ void av1_nn_softmax(const float *input, float *output, int n) {
   for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]);
   float sum_out = 0.0f;
   for (int i = 0; i < n; i++) {
-    output[i] = (float)exp(input[i] - max_inp);
+    // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+    const float normalized_input = AOMMAX(input[i] - max_inp, -10.0f);
+    output[i] = (float)exp(normalized_input);
     sum_out += output[i];
   }
   for (int i = 0; i < n; i++) output[i] /= sum_out;
diff --git a/libaom/av1/encoder/partition_model_weights.h b/libaom/av1/encoder/partition_model_weights.h
index 271764a..b754c88 100644
--- a/libaom/av1/encoder/partition_model_weights.h
+++ b/libaom/av1/encoder/partition_model_weights.h
@@ -2441,145 +2441,20 @@ static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
 #undef NUM_NODES
 #undef LABEL_SIZE
 
-#if CONFIG_ONE_PASS_SVM
-#define FEATURE_SIZE 24
-static const float av1_op_svm_early_term_weights_128[FEATURE_SIZE + 1] = {
-  -4.5893036051f, 6.9065208136f,  -9.1579514692f, 0.1353151366f,
-  -1.0271889653f, -0.0020988254f, -0.0094355949f, 0.0040209656f,
-  0.0073014747f,  0.7939705382f,  0.0254545714f,  0.0557559708f,
-  -0.0339662064f, -0.0496818300f, 0.3053600283f,  0.3699486845f,
-  0.0848271391f,  0.4091075988f,  0.1196729398f,  -0.0038137193f,
-  -0.0773495909f, -0.0651630642f, -0.0123704995f, -0.0036697401f,
-  -4.1930227095f,
-};
-
-static const float av1_op_svm_early_term_weights_64[FEATURE_SIZE + 1] = {
-  -2.7600454480f, 5.6822046712f,  -6.7576830133f, 0.1326457117f,
-  -1.0541818372f, 0.0107782654f,  0.0050469147f,  -0.0021362631f,
-  -0.0135151040f, -0.1020115005f, -0.0283409957f, -0.0176311233f,
-  0.0250648204f,  0.0196228570f,  0.5441528594f,  0.2767320141f,
-  0.1261231351f,  0.2998476408f,  0.1336215695f,  -0.1107823946f,
-  -0.0697279598f, -0.0577520545f, -0.0558441075f, -0.0699750617f,
-  -2.6995991503f,
-};
-
-static const float av1_op_svm_early_term_weights_32[FEATURE_SIZE + 1] = {
-  -0.8950734172f, 1.3559565008f,  -2.6733642653f, 0.2661361319f,
-  -0.0314731140f, 0.0044943456f,  0.0006438044f,  -0.0029066686f,
-  -0.0021903213f, 0.5845049496f,  -0.0003629350f, 0.0006982840f,
-  0.0014157386f,  -0.0017427528f, 0.7078456733f,  0.1600998068f,
-  0.0933852747f,  0.2822125876f,  0.1923826165f,  -0.0905903459f,
-  -0.0564717590f, -0.0591007486f, -0.0692268554f, -0.0677411981f,
-  -0.7101853206f,
-};
-
-static const float av1_op_svm_early_term_weights_16[FEATURE_SIZE + 1] = {
-  -0.1719124013f, -0.3192305362f, -1.1714597182f, 0.4437770294f,
-  -0.0042344643f, 0.0000027764f,  0.0018827450f,  -0.0015555613f,
-  -0.0003250050f, 0.9413693294f,  0.0076188418f,  -0.0067870352f,
-  0.0006329246f,  -0.0013059613f, 0.8596697254f,  0.0635558018f,
-  0.0447224598f,  0.0915706321f,  0.0741662273f,  -0.0269096547f,
-  -0.0244610614f, -0.0281113318f, -0.0326108845f, -0.0350908892f,
-  -0.0307521675f,
-};
-
-static const float av1_op_svm_early_term_mean_128[FEATURE_SIZE] = {
-  940540.3259649610f,    3988285.5905584921f, 575475302.3545289040f,
-  0.5775348803f,         866.9828469502f,     0.2503762393f,
-  0.2501466215f,         0.2513213770f,       0.2481557622f,
-  521994448.3219169378f, 0.2666920631f,       0.2535864361f,
-  0.2481589186f,         0.2315625823f,       100519.1049708007f,
-  12.1299754840f,        0.8279971004f,       12.6664603305f,
-  0.7313258998f,         935.8233056680f,     0.7436563032f,
-  0.7710055018f,         0.7376516970f,       0.6859818720f,
-};
-
-static const float av1_op_svm_early_term_mean_64[FEATURE_SIZE] = {
-  420419.7529613562f,    839754.4414347620f, 129360420.5256031156f,
-  0.6525652037f,         548.8972009954f,    0.2506918565f,
-  0.2488349076f,         0.2501724146f,      0.2503008213f,
-  113132974.7944754064f, 0.2479344278f,      0.2471446791f,
-  0.2524478512f,         0.2524730419f,      91147.9854189453f,
-  10.9642508460f,        0.8936554428f,      11.3877865621f,
-  0.8307555282f,         752.7787491956f,    0.7243363939f,
-  0.7198362119f,         0.7329432336f,      0.7245090283f,
-};
-
-static const float av1_op_svm_early_term_mean_32[FEATURE_SIZE] = {
-  105111.0236438536f,   184296.0939716828f, 29117017.6751756854f,
-  0.6402298612f,        140.2223339218f,    0.2495860872f,
-  0.2496407600f,        0.2506238629f,      0.2501492900f,
-  24480304.9390618578f, 0.2494442027f,      0.2496080963f,
-  0.2504881563f,        0.2504595447f,      60297.6762059058f,
-  9.4279752138f,        0.9287901132f,      9.6516813792f,
-  0.9009173677f,        591.5406335030f,    0.6944486917f,
-  0.6983941982f,        0.6927236901f,      0.6921613649f,
-};
-
-static const float av1_op_svm_early_term_mean_16[FEATURE_SIZE] = {
-  34080.7994802934f, 44108.1176228864f,   7494288.4946180154f, 0.6240636218f,
-  36.4539515827f,    0.2490867417f,       0.2499231014f,       0.2505361492f,
-  0.2504540077f,     5913397.2957480755f, 0.2487482536f,       0.2495500728f,
-  0.2503693302f,     0.2513323434f,       36574.9686737814f,   7.4345592768f,
-  0.9592429205f,     7.6001764585f,       0.9459867777f,       490.4635033056f,
-  0.6626215237f,     0.6580791886f,       0.6655481064f,       0.6589010119f,
-};
-
-static const float av1_op_svm_early_term_std_128[FEATURE_SIZE] = {
-  2054266.2732957317f,    7550554.6241466375f, 1078688147.1656334400f,
-  0.4939517611f,          1414.3139592985f,    0.1504634077f,
-  0.1515907199f,          0.1590329744f,       0.1515653324f,
-  1006422867.8989596367f, 0.1168668155f,       0.1195725959f,
-  0.1195825693f,          0.1123065533f,       195261.0940245980f,
-  4.5876675121f,          0.3773829648f,       4.8017339769f,
-  0.4432700397f,          973.7532938848f,     0.4790027843f,
-  0.5056275222f,          0.5262278749f,       0.4685586148f,
-};
-
-static const float av1_op_svm_early_term_std_64[FEATURE_SIZE] = {
-  1093636.0522712648f,   1749863.5221569177f, 255168612.8025657237f,
-  0.4761552884f,         1084.7927994662f,    0.1099344646f,
-  0.1100619440f,         0.1090853225f,       0.1115303745f,
-  232084513.1365262568f, 0.0759732385f,       0.0762942913f,
-  0.0785624106f,         0.0779284747f,       185687.9441778057f,
-  4.4371901245f,         0.3082781088f,       4.6670562831f,
-  0.3749677061f,         854.3212307408f,     0.4920531348f,
-  0.5073919158f,         0.5054698298f,       0.4904895620f,
-};
-
-static const float av1_op_svm_early_term_std_32[FEATURE_SIZE] = {
-  238229.7484988807f,   400136.8703966461f, 60267828.4581554681f,
-  0.4799328974f,        268.9377064297f,    0.1122938575f,
-  0.1126479260f,        0.1137018559f,      0.1126389337f,
-  52174139.1477040648f, 0.0715628767f,      0.0720997035f,
-  0.0728961434f,        0.0732065300f,      147785.0049793872f,
-  4.2092341484f,        0.2571751131f,      4.3893075417f,
-  0.2987729310f,        769.0253148602f,    0.5027558039f,
-  0.4982811444f,        0.5092312751f,      0.4991214994f,
-};
-
-static const float av1_op_svm_early_term_std_16[FEATURE_SIZE] = {
-  64177.9527087587f,    103729.9987511119f, 16632490.8146969266f,
-  0.4843637247f,        65.8114470725f,     0.0884226846f,
-  0.0912638659f,        0.0914771167f,      0.0916078800f,
-  13364581.3877149168f, 0.0677468925f,      0.0689631274f,
-  0.0689915367f,        0.0702648469f,      111397.2620676765f,
-  3.7858187888f,        0.1977269328f,      3.9420183951f,
-  0.2260437881f,        717.5336868275f,    0.5017939514f,
-  0.5066633533f,        0.5086806985f,      0.5085585987f,
-};
-
-#undef FEATURE_SIZE
-#endif  // CONFIG_ONE_PASS_SVM
+// Below are the models used for simple_motion_search_based_split
+static const float av1_simple_motion_search_based_split_thresh_128 = 2.0f;
+static const float av1_simple_motion_search_based_split_thresh_64 = 2.0f;
+static const float av1_simple_motion_search_based_split_thresh_32 = 2.0f;
+static const float av1_simple_motion_search_based_split_thresh_16 = 2.0f;
+static const float av1_simple_motion_search_based_split_thresh_8 = 2.0f;
 
-// Below are the models used for full_pixel_motion_search_based_split
 // BLOCK_128X128
 #define NUM_HIDDEN_LAYERS_128 1
 #define NUM_FEATURES_128 6
 #define NUM_LAYER_0_UNITS_128 16
 #define NUM_LOGITS_128 1
 
-static const float full_pixel_motion_search_based_split_layer_0_kernel_128[] = {
+static const float av1_simple_motion_search_based_split_layer_0_kernel_128[] = {
   -0.807346f,  0.242298f,   12.9862f,   -1.19161f,  5.21734f,    -1.1363f,
   -2.39127f,   0.930915f,   -2.44285f,  -2.42966f,  5.73476f,    0.0506879f,
   -0.234878f,  -0.317875f,  0.361322f,  0.431648f,  -0.39105f,   -0.110225f,
@@ -2598,23 +2473,23 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_128[] = {
   0.702545f,   -0.612227f,  -7.68881f,  9.52225f,   -1.18581f,   -2.56762f
 };
 
-static const float full_pixel_motion_search_based_split_logits_kernel_128[] = {
+static const float av1_simple_motion_search_based_split_logits_kernel_128[] = {
   0.364895f,    0.577553f,  0.115758f,  -0.999496f, 0.124885f, 3.23193f,
   -0.00386642f, 0.970794f,  0.136637f,  -4.28052f,  -1.49234f, 0.370436f,
   0.576981f,    -0.469656f, -0.124071f, 1.07669f
 };
 
-static const float full_pixel_motion_search_based_split_layer_0_bias_128[] = {
+static const float av1_simple_motion_search_based_split_layer_0_bias_128[] = {
   1.32916f,    0.817212f,  0.0f,       -0.921066f, 0.0f,      3.57649f,
   -0.0204517f, 2.97286f,   0.0f,       5.49957f,   -8.14518f, 0.0f,
   1.30826f,    -0.349536f, -0.638933f, 5.4496f
 };
 
-static const float full_pixel_motion_search_based_split_logits_bias_128[] = {
+static const float av1_simple_motion_search_based_split_logits_bias_128[] = {
   0.683442f
 };
 
-static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_128 = {
+static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_128 = {
   NUM_FEATURES_128,
   NUM_LOGITS_128,
   NUM_HIDDEN_LAYERS_128,
@@ -2622,17 +2497,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_128 = {
       NUM_LAYER_0_UNITS_128,
   },
   {
-      full_pixel_motion_search_based_split_layer_0_kernel_128,
-      full_pixel_motion_search_based_split_logits_kernel_128,
+      av1_simple_motion_search_based_split_layer_0_kernel_128,
+      av1_simple_motion_search_based_split_logits_kernel_128,
   },
   {
-      full_pixel_motion_search_based_split_layer_0_bias_128,
-      full_pixel_motion_search_based_split_logits_bias_128,
+      av1_simple_motion_search_based_split_layer_0_bias_128,
+      av1_simple_motion_search_based_split_logits_bias_128,
   },
 };
 
-static const float full_pixel_motion_search_based_split_thresh_128 = 2.0f;
-
 #undef NUM_HIDDEN_LAYERS_128
 #undef NUM_FEATURES_128
 #undef NUM_LAYER_0_UNITS_128
@@ -2644,7 +2517,7 @@ static const float full_pixel_motion_search_based_split_thresh_128 = 2.0f;
 #define NUM_LAYER_0_UNITS_64 16
 #define NUM_LOGITS_64 1
 
-static const float full_pixel_motion_search_based_split_layer_0_kernel_64[] = {
+static const float av1_simple_motion_search_based_split_layer_0_kernel_64[] = {
   0.0345945f,  -0.394064f,  0.0919978f, 0.270358f,  -0.384502f, -0.504608f,
   -0.25759f,   0.155981f,   2.62567f,   -10.7204f,  -0.709802f, 8.15948f,
   0.589866f,   -0.445645f,  -1.68232f,  10.0061f,   -3.17671f,  4.87259f,
@@ -2663,23 +2536,23 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_64[] = {
   -0.217072f,  -0.0984913f, -0.265515f, 0.360021f,  0.0779512f, 0.361516f
 };
 
-static const float full_pixel_motion_search_based_split_logits_kernel_64[] = {
+static const float av1_simple_motion_search_based_split_logits_kernel_64[] = {
   0.470821f, 0.474747f, -0.571292f, 0.403221f,  0.628966f,  -0.617029f,
   0.501105f, 0.499962f, -1.5451f,   -0.473518f, -0.730568f, -5.55817f,
   0.776761f, 0.42569f,  0.311925f,  0.469968f
 };
 
-static const float full_pixel_motion_search_based_split_layer_0_bias_64[] = {
+static const float av1_simple_motion_search_based_split_layer_0_bias_64[] = {
   -0.134085f, 0.0758715f, 1.10419f,  0.0f,       -5.75737f, 1.65494f,
   0.0f,       3.44047f,   0.394852f, 3.43858f,   3.65871f,  -4.84987f,
   1.21207f,   -1.7705f,   -5.46469f, -0.0889634f
 };
 
-static const float full_pixel_motion_search_based_split_logits_bias_64[] = {
+static const float av1_simple_motion_search_based_split_logits_bias_64[] = {
   -0.479491f
 };
 
-static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_64 = {
+static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_64 = {
   NUM_FEATURES_64,
   NUM_LOGITS_64,
   NUM_HIDDEN_LAYERS_64,
@@ -2687,17 +2560,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_64 = {
       NUM_LAYER_0_UNITS_64,
   },
   {
-      full_pixel_motion_search_based_split_layer_0_kernel_64,
-      full_pixel_motion_search_based_split_logits_kernel_64,
+      av1_simple_motion_search_based_split_layer_0_kernel_64,
+      av1_simple_motion_search_based_split_logits_kernel_64,
   },
   {
-      full_pixel_motion_search_based_split_layer_0_bias_64,
-      full_pixel_motion_search_based_split_logits_bias_64,
+      av1_simple_motion_search_based_split_layer_0_bias_64,
+      av1_simple_motion_search_based_split_logits_bias_64,
   },
 };
 
-static const float full_pixel_motion_search_based_split_thresh_64 = 2.0f;
-
 #undef NUM_HIDDEN_LAYERS_64
 #undef NUM_FEATURES_64
 #undef NUM_LAYER_0_UNITS_64
@@ -2709,7 +2580,7 @@ static const float full_pixel_motion_search_based_split_thresh_64 = 2.0f;
 #define NUM_LAYER_0_UNITS_32 16
 #define NUM_LOGITS_32 1
 
-static const float full_pixel_motion_search_based_split_layer_0_kernel_32[] = {
+static const float av1_simple_motion_search_based_split_layer_0_kernel_32[] = {
   -1.61796f,   0.0585128f,  1.57904f,   1.52703f,   0.367779f, 0.220434f,
   1.66652f,    -1.77782f,   6.41118f,   4.16976f,   4.97299f,  4.84111f,
   -0.0956536f, -0.163284f,  -0.143662f, 0.129329f,  0.449659f, -0.528844f,
@@ -2728,23 +2599,23 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_32[] = {
   -1.91327f,   -0.0356497f, 1.47611f,   1.27499f,   -1.76108f, -0.578954f
 };
 
-static const float full_pixel_motion_search_based_split_logits_kernel_32[] = {
+static const float av1_simple_motion_search_based_split_logits_kernel_32[] = {
   -0.220382f, -0.693902f, 0.424827f, 0.379952f, -0.413791f, -0.326785f,
   -0.455086f, 0.242402f,  0.307986f, 0.175746f, 0.498901f,  -0.628053f,
   0.285447f,  0.230052f,  0.415151f, -0.842946f
 };
 
-static const float full_pixel_motion_search_based_split_layer_0_bias_32[] = {
+static const float av1_simple_motion_search_based_split_layer_0_bias_32[] = {
   -1.80751f, 6.40356f,   -0.0512058f, -4.59163f, -0.369933f, -0.195755f,
   -0.16648f, -0.599755f, -5.35975f,   -1.21349f, 2.48414f,   1.07096f,
   -3.66684f, -6.17761f,  4.2159f,     -1.05286f
 };
 
-static const float full_pixel_motion_search_based_split_logits_bias_32[] = {
+static const float av1_simple_motion_search_based_split_logits_bias_32[] = {
   -2.58676f
 };
 
-static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_32 = {
+static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_32 = {
   NUM_FEATURES_32,
   NUM_LOGITS_32,
   NUM_HIDDEN_LAYERS_32,
@@ -2752,17 +2623,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_32 = {
       NUM_LAYER_0_UNITS_32,
   },
   {
-      full_pixel_motion_search_based_split_layer_0_kernel_32,
-      full_pixel_motion_search_based_split_logits_kernel_32,
+      av1_simple_motion_search_based_split_layer_0_kernel_32,
+      av1_simple_motion_search_based_split_logits_kernel_32,
   },
   {
-      full_pixel_motion_search_based_split_layer_0_bias_32,
-      full_pixel_motion_search_based_split_logits_bias_32,
+      av1_simple_motion_search_based_split_layer_0_bias_32,
+      av1_simple_motion_search_based_split_logits_bias_32,
   },
 };
 
-static const float full_pixel_motion_search_based_split_thresh_32 = 2.0f;
-
 #undef NUM_HIDDEN_LAYERS_32
 #undef NUM_FEATURES_32
 #undef NUM_LAYER_0_UNITS_32
@@ -2774,7 +2643,7 @@ static const float full_pixel_motion_search_based_split_thresh_32 = 2.0f;
 #define NUM_LAYER_0_UNITS_16 16
 #define NUM_LOGITS_16 1
 
-static const float full_pixel_motion_search_based_split_layer_0_kernel_16[] = {
+static const float av1_simple_motion_search_based_split_layer_0_kernel_16[] = {
   -0.611497f,  -0.0422086f, -0.555957f,   -0.632451f, -0.144179f, -0.152722f,
   -0.330265f,  -0.419866f,  0.287343f,    0.385295f,  -0.424486f, 0.424281f,
   2.27442f,    -2.47933f,   5.24731f,     4.33827f,   4.73215f,   3.41909f,
@@ -2793,23 +2662,23 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_16[] = {
   0.0333619f,  -0.377782f,  0.160767f,    -0.128169f, -0.484818f, -0.311973f
 };
 
-static const float full_pixel_motion_search_based_split_logits_kernel_16[] = {
+static const float av1_simple_motion_search_based_split_logits_kernel_16[] = {
   -0.132207f,   0.15176f,   -0.680086f, 0.605921f, -0.43294f,  0.485811f,
   -0.306286f,   0.551368f,  0.413904f,  0.548748f, -0.437391f, 0.560778f,
   -0.00685266f, -0.558657f, 0.122127f,  0.260165f
 };
 
-static const float full_pixel_motion_search_based_split_layer_0_bias_16[] = {
+static const float av1_simple_motion_search_based_split_layer_0_bias_16[] = {
   -0.200928f, -0.074132f, 8.69963f,    -9.00807f,  9.08983f, -6.83586f,
   -3.89329f,  10.4881f,   -0.0670618f, 0.0f,       9.21614f, 8.41773f,
   -0.145851f, 0.0f,       -1.43038f,   -0.0460311f
 };
 
-static const float full_pixel_motion_search_based_split_logits_bias_16[] = {
+static const float av1_simple_motion_search_based_split_logits_bias_16[] = {
   -4.19885f
 };
 
-static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_16 = {
+static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_16 = {
   NUM_FEATURES_16,
   NUM_LOGITS_16,
   NUM_HIDDEN_LAYERS_16,
@@ -2817,17 +2686,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_16 = {
       NUM_LAYER_0_UNITS_16,
   },
   {
-      full_pixel_motion_search_based_split_layer_0_kernel_16,
-      full_pixel_motion_search_based_split_logits_kernel_16,
+      av1_simple_motion_search_based_split_layer_0_kernel_16,
+      av1_simple_motion_search_based_split_logits_kernel_16,
   },
   {
-      full_pixel_motion_search_based_split_layer_0_bias_16,
-      full_pixel_motion_search_based_split_logits_bias_16,
+      av1_simple_motion_search_based_split_layer_0_bias_16,
+      av1_simple_motion_search_based_split_logits_bias_16,
   },
 };
 
-static const float full_pixel_motion_search_based_split_thresh_16 = 2.0f;
-
 #undef NUM_HIDDEN_LAYERS_16
 #undef NUM_FEATURES_16
 #undef NUM_LAYER_0_UNITS_16
@@ -2840,7 +2707,7 @@ static const float full_pixel_motion_search_based_split_thresh_16 = 2.0f;
 #define NUM_LAYER_0_UNITS_8 16
 #define NUM_LOGITS_8 1
 
-static const float full_pixel_motion_search_based_split_layer_0_kernel_8[] = {
+static const float av1_simple_motion_search_based_split_layer_0_kernel_8[] = {
   0.0370236f,   -0.580211f,  2.0134f,    1.69637f,    2.43181f,   -0.521648f,
   -0.00375187f, 0.122712f,   -4.74411f,  7.36187f,    5.42574f,   -5.53557f,
   0.0993344f,   -0.358843f,  0.0765453f, -0.615987f,  -0.754633f, -0.175846f,
@@ -2859,23 +2726,1240 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_8[] = {
   0.616966f,    -0.451472f,  -0.319365f, 0.00807278f, -0.303261f, -0.351679f
 };
 
-static const float full_pixel_motion_search_based_split_logits_kernel_8[] = {
+static const float av1_simple_motion_search_based_split_logits_kernel_8[] = {
   -0.625847f, 0.381323f, 0.342475f, 0.526161f,  -0.665965f, -0.515317f,
   -0.406218f, 0.568007f, 0.479397f, -0.426116f, 0.615638f,  0.338572f,
   0.185583f,  0.308031f, 0.260748f, 0.531619f
 };
 
-static const float full_pixel_motion_search_based_split_layer_0_bias_8[] = {
+static const float av1_simple_motion_search_based_split_layer_0_bias_8[] = {
   4.73775f,  -1.12658f, -0.258038f, -6.06696f, 1.79131f, 2.49609f,
   4.28388f,  0.0f,      -4.63598f,  3.06034f,  5.31994f, -0.152142f,
   0.514738f, -1.30098f, 3.00296f,   -3.83481f
 };
 
-static const float full_pixel_motion_search_based_split_logits_bias_8[] = {
+static const float av1_simple_motion_search_based_split_logits_bias_8[] = {
   -3.44508f
 };
 
-static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_8 = {
+static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_8 = {
+  NUM_FEATURES_8,
+  NUM_LOGITS_8,
+  NUM_HIDDEN_LAYERS_8,
+  {
+      NUM_LAYER_0_UNITS_8,
+  },
+  {
+      av1_simple_motion_search_based_split_layer_0_kernel_8,
+      av1_simple_motion_search_based_split_logits_kernel_8,
+  },
+  {
+      av1_simple_motion_search_based_split_layer_0_bias_8,
+      av1_simple_motion_search_based_split_logits_bias_8,
+  },
+};
+
+#endif
+
+// Model based on simple_motion_search
+
+// Thresholds for doing a single type of partition
+// TODO(chiyotsai@google.com): Set the thresholds for PARTITION_SPLIT.
+static const float av1_simple_motion_search_prune_part_only_thresh_128[10] = {
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+};
+
+static const float av1_simple_motion_search_prune_part_only_thresh_64[10] = {
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+};
+
+static const float av1_simple_motion_search_prune_part_only_thresh_32[10] = {
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+};
+
+static const float av1_simple_motion_search_prune_part_only_thresh_16[10] = {
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+};
+
+static const float av1_simple_motion_search_prune_part_only_thresh_8[10] = {
+  1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+};
+
+// Thresholds for pruning a partition type
+static const float av1_simple_motion_search_prune_part_prune_thresh_128[10] = {
+  0.0f, 0.0288721601835f, 0.0288721601835f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+  0.0f
+};
+
+static const float av1_simple_motion_search_prune_part_prune_thresh_64[10] = {
+  0.0f, 0.0281573780991f, 0.0281573780991f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+  0.0f
+};
+
+static const float av1_simple_motion_search_prune_part_prune_thresh_32[10] = {
+  0.0f, 0.0225501403434f, 0.0225501403434f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+  0.0f
+};
+
+static const float av1_simple_motion_search_prune_part_prune_thresh_16[10] = {
+  0.0f,
+  0.000961189195907f,
+  0.000961189195907f,
+  0.0f,
+  0.0f,
+  0.0f,
+  0.0f,
+  0.0f,
+  0.0f,
+  0.0f
+};
+
+static const float av1_simple_motion_search_prune_part_prune_thresh_8[10] = {
+  0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
+};
+
+// Mean and std
+static const float av1_simple_motion_search_prune_part_mean_128[25] = {
+  13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f,
+  10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f,
+  12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f,
+  12.152315f, 11.517566f, 11.465651f, 5.383040f,  0.757934f,
+  4.012611f,  4.052191f,  0.853365f,  3.954503f,  3.944135f,
+};
+
+static const float av1_simple_motion_search_prune_part_std_128[25] = {
+  2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f,
+  3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f,
+  2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f,
+  1.208679f, 0.353742f, 1.228122f, 1.211777f,
+};
+
+static const float av1_simple_motion_search_prune_part_mean_64[25] = {
+  11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f,
+  9.084122f,  8.559063f,  8.499496f, 8.095865f, 8.041795f,
+  10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f,
+  10.484358f, 10.002225f, 9.944480f, 4.964504f, 0.897164f,
+  3.306144f,  3.351039f,  0.928582f, 3.319739f, 3.287726f,
+};
+
+static const float av1_simple_motion_search_prune_part_std_64[25] = {
+  2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 3.628949f,
+  3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f,
+  2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f,
+  1.081292f, 0.257521f, 1.112510f, 1.089404f,
+};
+
+static const float av1_simple_motion_search_prune_part_mean_32[25] = {
+  9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f,
+  7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f,
+  8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f,
+  2.751266f, 0.963302f, 2.716584f, 2.709725f,
+};
+
+static const float av1_simple_motion_search_prune_part_std_32[25] = {
+  1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f,
+  1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f,
+  1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f, 0.973824f,
+  0.952221f, 0.188018f, 0.985295f, 0.946228f,
+};
+
+static const float av1_simple_motion_search_prune_part_mean_16[25] = {
+  8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 6.592615f,
+  6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f,
+  7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f,
+  2.131698f, 0.981005f, 2.110868f, 2.106539f,
+};
+
+static const float av1_simple_motion_search_prune_part_std_16[25] = {
+  1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f,
+  1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f,
+  1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f,
+  0.829935f, 0.136507f, 0.828972f, 0.808563f,
+};
+
+static const float av1_simple_motion_search_prune_part_mean_8[25] = {
+  6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f,
+  4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f,
+  6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f,
+  1.531762f, 0.989606f, 1.496581f, 1.484139f,
+};
+
+static const float av1_simple_motion_search_prune_part_std_8[25] = {
+  1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f,
+  1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f,
+  1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f,
+  0.754040f, 0.101419f, 0.738239f, 0.729455f,
+};
+
+#define NUM_HIDDEN_LAYERS_128 1
+#define NUM_FEATURES_128 25
+#define NUM_LAYER_0_UNITS_128 8
+#define NUM_LOGITS_128 4
+
+static const float av1_simple_motion_search_prune_part_logits_kernel_128[] = {
+  -0.129103f, 0.457758f,  -0.489986f, 0.65462f,   -0.184312f, 3.81202f,
+  -0.444407f, -0.64198f,  -0.575008f, 0.0311711f, 0.525243f,  -20.892f,
+  1.08811f,   -65.0976f,  -12.3973f,  -1.38278f,  -0.264233f, 0.241636f,
+  -10.6925f,  -0.725414f, -18.8987f,  -40.2284f,  -16.08f,    0.995331f,
+  1.47614f,   -0.964864f, 0.405506f,  0.140449f,  0.459534f,  -1.9093f,
+  0.398452f,  0.696949f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_bias_128[] = {
+  1.22789f, -1.34527f, 0.759048f,  0.315086f,
+  1.0834f,  -1.58019f, -0.465158f, 1.20716f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_kernel_128[] = {
+  -0.668677f,  0.58694f,    -0.417094f,   0.754735f,   -0.7859f,
+  0.377479f,   -0.0415929f, -0.0140585f,  -0.730001f,  0.747528f,
+  -0.135247f,  0.406505f,   -0.234184f,   0.956362f,   -0.637555f,
+  0.791884f,   0.0303722f,  1.04424f,     -0.727859f,  -0.274321f,
+  -0.122986f,  0.066312f,   -0.00559175f, -0.239643f,  -0.0188767f,
+  -0.102787f,  -0.262967f,  0.071882f,    -0.283398f,  0.111607f,
+  -0.425826f,  0.02699f,    0.108873f,    -0.180558f,  -0.0794057f,
+  0.29665f,    -0.0252969f, -0.0266213f,  -0.277462f,  -0.361973f,
+  0.512552f,   0.395011f,   -0.225876f,   0.301924f,   0.136954f,
+  0.507259f,   1.23425f,    0.0137135f,   0.662572f,   0.591583f,
+  0.101564f,   0.416805f,   -0.645081f,   -0.179086f,  -0.36747f,
+  -0.332213f,  0.095177f,   0.220739f,    -0.153256f,  0.706155f,
+  0.161701f,   0.696815f,   -1.21531f,    -0.115059f,  0.486764f,
+  -0.396093f,  0.784883f,   0.535357f,    -0.278021f,  0.143496f,
+  -0.44931f,   -0.144543f,  0.319326f,    0.0190167f,  -0.206295f,
+  0.373995f,   -0.247897f,  -0.608095f,   -0.41796f,   -0.137129f,
+  -0.709562f,  0.678273f,   0.537607f,    0.557474f,   0.453308f,
+  0.21405f,    -0.0466495f, 0.519139f,    -0.168832f,  0.902911f,
+  0.681131f,   -0.139876f,  -0.2052f,     -0.393271f,  0.262222f,
+  -0.246246f,  -0.213993f,  0.646619f,    0.0496181f,  -0.00354157f,
+  0.822927f,   0.0939522f,  0.180738f,    0.118355f,   0.120456f,
+  -0.0472214f, -0.144958f,  0.173405f,    -0.886644f,  -0.0949769f,
+  -0.813518f,  -0.3947f,    -0.128021f,   0.356196f,   0.469169f,
+  -0.413702f,  1.04242f,    0.428853f,    -0.387293f,  0.0850877f,
+  0.279409f,   -0.142276f,  0.0579376f,   0.211112f,   0.0703013f,
+  -1.9274f,    -0.729147f,  0.534193f,    0.773586f,   0.922864f,
+  0.642881f,   1.15127f,    0.621032f,    0.933942f,   1.01837f,
+  -0.660282f,  -0.40059f,   -1.11279f,    -0.77088f,   -0.43349f,
+  0.202361f,   -0.0840912f, 0.0935707f,   0.056333f,   -0.0779369f,
+  0.0173447f,  -0.0104756f, 0.0115005f,   -0.0195593f, 0.03592f,
+  -0.343454f,  -0.618048f,  0.258172f,    -0.412322f,  -0.0463746f,
+  -0.0413654f, -0.0400194f, 0.615981f,    -0.452094f,  0.644555f,
+  0.0822476f,  -0.359791f,  -0.0904274f,  0.209427f,   0.0116338f,
+  -0.190978f,  0.890233f,   0.737769f,    -1.66663f,   -0.392605f,
+  0.0785728f,  -0.224553f,  -0.128258f,   -0.227227f,  -0.0777773f,
+  0.685976f,   0.347042f,   -0.555325f,   -0.249221f,  0.0919837f,
+  -0.0660016f, -0.272316f,  0.0390632f,   -0.619624f,  -0.0565801f,
+  0.585026f,   0.597375f,   0.54114f,     0.593389f,   0.604391f,
+  0.0820294f,  -0.85339f,   -1.40741f,    -0.391675f,  0.0579205f,
+  -0.197626f,  0.130044f,   -0.234488f,   -0.0373991f, -0.0717973f
+};
+
+static const float av1_simple_motion_search_prune_part_logits_bias_128[] = {
+  1.58571f, -4.6314f, -2.00273f, 0.543699f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_128 = {
+  NUM_FEATURES_128,
+  NUM_LOGITS_128,
+  NUM_HIDDEN_LAYERS_128,
+  {
+      NUM_LAYER_0_UNITS_128,
+  },
+  {
+      av1_simple_motion_search_prune_part_layer_0_kernel_128,
+      av1_simple_motion_search_prune_part_logits_kernel_128,
+  },
+  {
+      av1_simple_motion_search_prune_part_layer_0_bias_128,
+      av1_simple_motion_search_prune_part_logits_bias_128,
+  },
+};
+
+#undef NUM_HIDDEN_LAYERS_128
+#undef NUM_FEATURES_128
+#undef NUM_LAYER_0_UNITS_128
+#undef NUM_LOGITS_128
+
+#define NUM_HIDDEN_LAYERS_64 1
+#define NUM_FEATURES_64 25
+#define NUM_LAYER_0_UNITS_64 32
+#define NUM_LOGITS_64 10
+
+static const float av1_simple_motion_search_prune_part_logits_kernel_64[] = {
+  0.10424f,    -0.346025f,  0.534547f,   -0.385925f,  2.58341f,    -0.256414f,
+  -0.232498f,  0.329823f,   -0.0777376f, -0.590939f,  0.062657f,   -0.628252f,
+  0.0934588f,  2.04029f,    -0.224448f,  0.371168f,   -0.385348f,  -0.589883f,
+  -3.73627f,   -0.943144f,  0.346409f,   -0.211215f,  -0.351008f,  0.418807f,
+  0.943663f,   0.173267f,   1.16585f,    -0.0840888f, 0.227464f,   0.374412f,
+  0.0422597f,  -0.338868f,  0.222576f,   0.431713f,   1.12366f,    0.00753411f,
+  0.248412f,   -0.0902425f, 0.542455f,   -0.665629f,  -0.311245f,  -0.205639f,
+  -0.447149f,  -0.0502733f, -0.290186f,  -0.794384f,  0.0940881f,  -0.0686117f,
+  -0.0199961f, -0.587965f,  0.777096f,   -0.083381f,  -1.21282f,   0.652959f,
+  -1.18238f,   0.539991f,   0.352497f,   -0.540076f,  -0.26222f,   -0.568556f,
+  0.409102f,   -0.131146f,  -0.407161f,  -0.188287f,  -0.478657f,  0.000401932f,
+  -0.689324f,  0.351064f,   -1.43704f,   -0.315185f,  -0.868726f,  0.376341f,
+  -0.0566277f, 0.364831f,   0.611298f,   -0.495253f,  -0.0193132f, 0.617978f,
+  0.189586f,   -0.236758f,  -0.608246f,  -0.149017f,  -1.78303f,   0.143023f,
+  0.698386f,   -0.994086f,  -0.673327f,  0.233868f,   0.360425f,   0.0294123f,
+  -0.248683f,  -0.148392f,  0.0861829f,  -0.190843f,  -0.414906f,  0.607378f,
+  -0.756715f,  -0.511713f,  -0.321556f,  1.0078f,     -1.18141f,   0.519751f,
+  0.834629f,   -0.359343f,  0.612262f,   -0.0730553f, 0.262935f,   0.488276f,
+  0.387071f,   -1.44123f,   1.08269f,    0.554402f,   -0.069f,     0.14113f,
+  0.323817f,   0.824314f,   -0.431417f,  -0.349448f,  0.950728f,   -0.587836f,
+  -0.83914f,   -0.10844f,   0.26602f,    0.831933f,   -0.271315f,  0.231563f,
+  0.417049f,   0.190627f,   -0.0940667f, 0.255363f,   -0.0741022f, -0.0987662f,
+  -0.847522f,  0.00287554f, 0.0615741f,  -0.0832218f, 0.0847148f,  -0.392843f,
+  -0.938068f,  -0.10621f,   -0.260859f,  -0.825175f,  -0.401039f,  0.315213f,
+  -0.108269f,  0.288036f,   -8.66166f,   -0.970752f,  -0.66678f,   -0.593405f,
+  -0.518294f,  -0.138722f,  -0.454698f,  -0.22969f,   -0.553006f,  -0.440111f,
+  0.462661f,   -0.536854f,  0.0108295f,  -0.522888f,  0.00111157f, 0.229999f,
+  0.0267768f,  0.176266f,   -1.57043f,   0.0318106f,  0.257534f,   -0.198583f,
+  0.175564f,   -0.251465f,  -0.262441f,  -1.65283f,   -0.319603f,  -0.875282f,
+  -0.301303f,  0.0170948f,  -0.227075f,  0.0299545f,  -4.98346f,   0.470046f,
+  -1.28051f,   -0.213809f,  -0.486585f,  -0.906463f,  -0.169984f,  -0.333153f,
+  -0.376733f,  0.108016f,   0.486744f,   -0.186936f,  -0.429259f,  0.056501f,
+  -0.266545f,  0.265447f,   -0.137718f,  -0.490687f,  -0.935668f,  -0.16229f,
+  -0.696932f,  0.173157f,   0.434959f,   -0.140595f,  0.345845f,   -1.08013f,
+  -0.0205929f, -0.815874f,  -0.179812f,  0.02767f,    -0.141727f,  0.471936f,
+  -7.29453f,   -1.04362f,   -0.745482f,  -0.28725f,   -0.214997f,  -0.0850651f,
+  -0.748471f,  0.161325f,   -1.04387f,   -0.705305f,  0.489427f,   -0.765373f,
+  -0.301576f,  0.0742467f,  -0.331282f,  0.0372328f,  -0.90298f,   -0.0608646f,
+  -2.18756f,   0.170384f,   -0.258357f,  0.106287f,   -0.161684f,  -0.103799f,
+  -0.127774f,  -0.156313f,  0.0705286f,  -0.977908f,  -0.281191f,  -0.056757f,
+  -0.309474f,  0.050476f,   -9.78198f,   -2.42795f,   -0.289626f,  -1.07579f,
+  -0.439256f,  -1.09948f,   -0.564671f,  0.0913182f,  -0.417216f,  -1.19909f,
+  0.287063f,   0.402315f,   -0.17646f,   0.540488f,   0.00840239f, 0.397492f,
+  0.702393f,   -0.10566f,   0.655296f,   -0.0443876f, 0.154918f,   -0.760479f,
+  -0.0523153f, -0.366199f,  -1.08212f,   -0.398556f,  -0.415203f,  -1.10488f,
+  0.208349f,   0.27079f,    0.101546f,   -0.205752f,  -13.7923f,   -0.218637f,
+  -1.10077f,   0.355735f,   -0.306196f,  0.627434f,   -0.473101f,  -0.308027f,
+  -1.12724f,   0.301597f,   0.660785f,   0.0576217f,  -0.155925f,  -0.56107f,
+  -0.223537f,  0.114299f,   -0.53803f,   -0.252674f,  -2.66103f,   -0.185245f,
+  -0.314673f,  0.403337f,   0.679821f,   -0.69231f,   0.506264f,   -0.999705f,
+  -0.549097f,  0.353745f,   0.188249f,   0.414484f,   -0.615853f,  0.525681f,
+  -5.23065f,   -3.05174f,   1.02074f,    -0.965499f,  -0.158947f,  0.0436088f,
+  -0.485824f,  0.0375094f,  -1.39985f,   -0.481392f,  0.485785f,   -0.24874f,
+  -0.359633f,  0.668108f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_bias_64[] = {
+  0.0735592f, -0.045064f, -0.0114103f, 1.39246f,    -0.683467f,  0.155765f,
+  -0.667652f, -0.202425f, -0.585433f,  -0.146752f,  -0.0812931f, 0.580642f,
+  0.578542f,  -0.831916f, 0.610063f,   0.0101856f,  -0.235863f,  0.538141f,
+  -2.91334f,  -1.71887f,  0.126616f,   0.582497f,   -0.438879f,  0.221833f,
+  0.850773f,  -0.280886f, 0.443233f,   -0.0964873f, -0.216161f,  0.34413f,
+  0.656818f,  0.0169274f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_kernel_64[] = {
+  -0.310947f,   -0.232675f,    0.0171092f,    0.0834474f,   0.373977f,
+  0.300429f,    0.215072f,     -0.454074f,    0.187565f,    0.282742f,
+  0.562562f,    -0.0419322f,   0.000978486f,  -0.298267f,   0.216934f,
+  -0.388722f,   -0.146866f,    -0.275946f,    0.202361f,    0.225847f,
+  1.42868f,     0.473127f,     -0.145747f,    -0.104986f,   0.153459f,
+  0.69382f,     0.162266f,     0.0207715f,    -0.45095f,    -0.412071f,
+  -0.235109f,   -0.130199f,    0.231741f,     0.460193f,    0.0378202f,
+  0.429516f,    0.387691f,     -0.272479f,    0.0723884f,   -0.453914f,
+  -0.150618f,   -0.10745f,     -0.258615f,    0.0838312f,   -0.00554958f,
+  0.105377f,    -0.0415479f,   0.13228f,      1.09044f,     -0.73053f,
+  -0.422553f,   -0.435842f,    0.211416f,     0.420332f,    0.0181353f,
+  -0.030891f,   0.522788f,     0.613526f,     0.374032f,    0.287986f,
+  -0.403118f,   -0.287362f,    -1.11523f,     -0.577713f,   -0.020228f,
+  0.86465f,     -0.0590579f,   0.341274f,     -0.0115644f,  -0.260236f,
+  0.192123f,    -0.0849825f,   0.0501709f,    0.444382f,    0.0762727f,
+  0.0926596f,   -0.101157f,    -0.142787f,    0.40861f,     0.555805f,
+  -0.00614654f, -0.122846f,    0.203163f,     0.234266f,    0.409795f,
+  -0.0206245f,  -0.224679f,    0.025081f,     0.518044f,    -0.287186f,
+  0.016494f,    -0.0886331f,   0.236438f,     -1.01032f,    0.118332f,
+  0.364217f,    0.061438f,     0.0381303f,    0.128418f,    0.0257077f,
+  -0.975751f,   -0.694894f,    0.00351914f,   0.278179f,    0.29363f,
+  0.525576f,    0.0604849f,    0.531734f,     0.406643f,    0.812497f,
+  -0.403196f,   -0.16664f,     -0.620887f,    -0.428194f,   0.275401f,
+  0.432063f,    -0.00378342f,  0.295758f,     0.105615f,    -0.00683626f,
+  0.00396146f,  0.00598654f,   -0.0131701f,   -0.0115787f,  0.00386643f,
+  -0.69686f,    -0.139623f,    -0.440817f,    0.0542873f,   0.217962f,
+  0.527035f,    -0.0201046f,   0.0471354f,    0.0271858f,   -0.0775197f,
+  -0.309797f,   0.184879f,     -0.232854f,    -0.407081f,   0.706227f,
+  -0.0877534f,  0.306843f,     0.455075f,     -0.333961f,   0.0759148f,
+  0.0444791f,   -0.0693626f,   -0.0850289f,   -0.513063f,   -0.643971f,
+  -0.630279f,   -0.153889f,    0.123315f,     0.00548238f,  0.170707f,
+  0.734339f,    -0.176988f,    0.322519f,     0.178365f,    0.183519f,
+  -0.698683f,   -0.12043f,     -0.349914f,    -0.0696762f,  -0.53986f,
+  -0.104738f,   1.05264f,      0.983568f,     -0.109035f,   0.0113748f,
+  0.0815189f,   -0.0628812f,   0.0769389f,    0.010261f,    0.146573f,
+  -0.433194f,   -0.211572f,    -0.000397392f, 0.445325f,    0.145091f,
+  -0.0625902f,  0.29394f,      0.302315f,     0.0892226f,   -0.209504f,
+  -0.0150374f,  0.242608f,     0.216223f,     0.366857f,    0.209829f,
+  -0.540035f,   0.117599f,     -0.329315f,    0.0471133f,   -0.0115449f,
+  -0.0638235f,  0.0527461f,    0.348149f,     0.360802f,    1.06624f,
+  -0.615991f,   -0.341396f,    0.18972f,      0.0709888f,   -0.0414466f,
+  -0.0193809f,  0.0938933f,    0.209058f,     0.575042f,    0.483608f,
+  -0.285875f,   -0.115905f,    -0.363637f,    0.375425f,    0.336217f,
+  0.0336358f,   -0.00265618f,  -0.406854f,    -0.792959f,   -0.219354f,
+  0.0331615f,   0.0298859f,    -0.211446f,    -0.00280773f, -0.194011f,
+  0.262109f,    0.548076f,     0.120183f,     -0.661603f,   0.241855f,
+  -0.501428f,   0.00102718f,   -0.347331f,    -0.58306f,    0.0977254f,
+  0.117491f,    0.0840667f,    0.00693675f,   0.000600294f, 0.649569f,
+  -0.0553811f,  -0.197198f,    0.397236f,     -0.523737f,   -0.564192f,
+  -0.374679f,   -0.249344f,    0.00861428f,   0.00393439f,  -0.0834608f,
+  0.124389f,    -0.0393049f,   0.0425391f,    -0.153383f,   -0.182346f,
+  0.420953f,    0.464221f,     0.288984f,     0.570921f,    -0.239965f,
+  0.247239f,    -0.083434f,    0.714418f,     0.986323f,    -0.460244f,
+  -0.260993f,   -0.947743f,    -1.0789f,      -0.0391231f,  0.612407f,
+  -0.0306767f,  0.281419f,     0.0072426f,    -0.37623f,    0.188744f,
+  0.221666f,    -0.424914f,    0.29703f,      0.261715f,    0.277809f,
+  -0.0617616f,  -0.000611999f, -0.0547053f,   -0.0901018f,  -0.347669f,
+  0.856072f,    0.596675f,     -0.467639f,    -1.09324f,    -0.184224f,
+  -0.56051f,    -0.0144704f,   0.102894f,     -0.122982f,   -0.0020749f,
+  -0.0423487f,  0.0328702f,    -0.0154263f,   0.0349021f,   -0.00315595f,
+  0.0254802f,   -0.729191f,    0.207296f,     -0.0212349f,  -0.207078f,
+  0.20636f,     -0.156883f,    0.429765f,     -0.42672f,    0.138775f,
+  -0.0267343f,  0.631528f,     0.300646f,     -0.4793f,     -0.273833f,
+  -0.0135367f,  -0.530819f,    -0.534881f,    0.830896f,    0.0266992f,
+  0.473744f,    0.210334f,     0.0234739f,    0.255394f,    0.123531f,
+  -0.489341f,   -0.796627f,    0.372617f,     0.190136f,    0.275342f,
+  0.739505f,    0.402354f,     0.782806f,     0.437374f,    1.04948f,
+  -0.55963f,    0.382704f,     -0.698321f,    0.0817868f,   -0.440108f,
+  -0.0635004f,  -0.277851f,    -0.524194f,    0.286157f,    -0.01097f,
+  -0.0293145f,  -0.0405071f,   -0.035662f,    -0.012871f,   -0.0516409f,
+  -0.406671f,   0.709259f,     -0.525177f,    0.521123f,    -0.44813f,
+  0.48412f,     -0.0546513f,   0.305253f,     -0.468328f,   0.316453f,
+  -0.36307f,    0.497515f,     -0.0606276f,   0.315764f,    -0.422066f,
+  0.554025f,    -0.679183f,    0.616914f,     0.00283324f,  -0.000643824f,
+  0.0639999f,   0.0488285f,    -0.141031f,    0.068003f,    -0.0792678f,
+  -0.425307f,   -0.152235f,    0.269917f,     -0.352327f,   0.44792f,
+  -0.116514f,   -0.465868f,    0.154287f,     0.0161028f,   -0.16848f,
+  -0.255487f,   0.189832f,     0.254883f,     0.0240822f,   0.432638f,
+  -0.136564f,   0.137036f,     0.0375734f,    0.989246f,    -0.126287f,
+  0.111416f,    -0.0271002f,   0.718755f,     -0.0412969f,  0.00645681f,
+  0.253811f,    -0.0186998f,   0.691971f,     -0.282042f,   -0.0783915f,
+  0.274592f,    -0.358449f,    0.34155f,      -0.186374f,   -0.136907f,
+  -0.192334f,   -0.251168f,    -0.100874f,    -0.166578f,   -0.336507f,
+  0.402373f,    0.173695f,     0.108788f,     0.00885581f,  -0.310063f,
+  1.05545f,     0.0295867f,    0.180785f,     -0.173469f,   -0.469924f,
+  -0.224155f,   0.665862f,     -0.126546f,    0.240691f,    -0.0415301f,
+  -0.598534f,   0.0012723f,    -0.122297f,    -0.558947f,   0.268844f,
+  0.241193f,    0.0524422f,    -0.1683f,      0.575588f,    -0.139012f,
+  0.0636691f,   -0.446709f,    -0.094532f,    0.883809f,    -0.112981f,
+  -0.224047f,   0.0811193f,    -0.140571f,    -0.09683f,    -0.0796143f,
+  -0.102246f,   -0.863392f,    -0.0755124f,   0.23125f,     -0.0301361f,
+  -0.153029f,   -0.172238f,    -0.0286382f,   -0.338495f,   -0.317216f,
+  -0.146629f,   -0.242264f,    -0.702306f,    -0.285052f,   0.0623479f,
+  0.265735f,    0.00674475f,   0.666196f,     0.883586f,    0.278416f,
+  -0.341692f,   -0.509931f,    -0.156263f,    0.635885f,    -0.544143f,
+  -0.572632f,   -0.213285f,    0.443396f,     -0.268329f,   0.0638439f,
+  -0.185397f,   0.071126f,     0.386503f,     -0.402212f,   -0.140784f,
+  -0.411661f,   0.049398f,     -0.0672907f,   -0.267034f,   -0.0560875f,
+  0.0607937f,   0.0445484f,    -0.547651f,    0.574718f,    0.417189f,
+  -0.0610166f,  0.0632293f,    0.391619f,     -0.00671215f, -0.136883f,
+  -0.339346f,   0.0356183f,    0.511993f,     0.178676f,    0.286998f,
+  0.136511f,    -0.00796929f,  0.203985f,     0.0423532f,   -0.175196f,
+  0.378534f,    0.770417f,     0.593778f,     0.0256067f,   -0.82394f,
+  -0.500691f,   -0.425725f,    -0.623708f,    -0.0406241f,  -0.00226464f,
+  0.0207836f,   0.30732f,      -0.00784268f,  0.0065445f,   -0.0991039f,
+  -0.20871f,    -0.206835f,    0.281219f,     0.119361f,    0.259346f,
+  -0.102713f,   0.186488f,     -0.034455f,    -0.00198392f, -0.279107f,
+  -0.638993f,   -0.374404f,    -0.48601f,     -0.262345f,   0.624532f,
+  0.620632f,    -0.227014f,    0.433579f,     -0.0455096f,  1.22123f,
+  -0.429156f,   0.12396f,      0.0815152f,    -0.0837355f,  0.0282623f,
+  -0.407475f,   0.787321f,     -0.434974f,    0.312904f,    -0.230805f,
+  0.213042f,    -0.250929f,    0.302997f,     -0.354709f,   0.0504905f,
+  -0.561706f,   0.595558f,     0.374951f,     0.802969f,    -0.674902f,
+  0.33136f,     0.156606f,     0.0218968f,    -0.694188f,   -0.0221949f,
+  -0.00639123f, 0.0146536f,    0.0104145f,    0.021635f,    -0.0499428f,
+  -0.575116f,   -0.239035f,    -0.0588276f,   0.599722f,    0.541932f,
+  0.437433f,    0.716268f,     0.193207f,     0.548351f,    0.326951f,
+  -0.197124f,   0.0355353f,    -0.0952009f,   -0.217265f,   -0.389789f,
+  0.0528124f,   -0.21334f,     -0.190296f,    -1.17367f,    0.108905f,
+  0.109397f,    -0.0192577f,   0.0343813f,    0.085004f,    -0.0556737f,
+  -0.0411158f,  -0.534989f,    0.0361896f,    0.124415f,    0.291603f,
+  -0.0311974f,  -0.326726f,    0.343131f,     0.0276456f,   -0.231827f,
+  -0.373894f,   -0.208898f,    -0.273011f,    0.061323f,    -0.0910538f,
+  -0.30746f,    -0.108644f,    -0.190736f,    1.58048f,     -0.0739711f,
+  -0.0623489f,  -0.137967f,    -0.0601359f,   -0.133004f,   -0.0857153f,
+  0.00955987f,  -0.365561f,    -0.0329051f,   0.463463f,    0.14758f,
+  -0.512256f,   -0.227463f,    -0.26008f,     -0.567777f,   0.0646234f,
+  1.02161f,     0.66157f,      -0.16733f,     0.264921f,    -0.242036f,
+  0.214622f,    0.0712054f,    -0.260377f,    0.0849665f,   0.735094f,
+  0.11001f,     0.297301f,     -0.333342f,    0.066978f,    -0.123625f,
+  1.07596f,     0.401263f,     0.0800875f,    -0.340862f,   -0.115587f,
+  -0.32692f,    -0.300842f,    0.0277397f,    0.0630788f,   -0.261198f,
+  0.428695f,    -0.0544757f,   -0.124511f,    0.036992f,    0.126322f,
+  0.0317603f,   0.0820762f,    0.117277f,     -1.14594f,    -0.108076f,
+  -0.0258198f,  -0.00337525f,  -0.00512531f,  0.1274f,      -0.0660535f,
+  -0.640733f,   0.197142f,     0.147278f,     0.489271f,    0.226507f,
+  -0.0668414f,  0.0946318f,    0.0994164f,    -0.820516f,   0.512939f,
+  -0.305172f,   -0.715187f,    -0.195125f,    0.279346f,    0.462144f,
+  0.913882f,    -0.453879f,    0.0582033f,    -0.462866f,   0.0538736f,
+  0.0115737f,   0.00626993f,   -0.0185185f,   0.0114601f,   -0.0181164f,
+  0.41588f,     -0.0447331f,   0.611756f,     0.43385f,     0.834465f,
+  0.122019f,    -0.352983f,    0.340429f,     -0.245425f,   -0.365328f,
+  -0.521825f,   0.0371057f,    0.172188f,     -0.387949f,   0.221054f,
+  0.0126359f,   0.422958f,     0.584198f,     -0.581498f,   -0.019466f,
+  -0.0271737f,  -0.0740885f,   0.00540879f,   0.186086f,    -0.0324402f,
+  -0.563462f,   -0.458759f,    -0.425296f,    -0.0118862f,  -0.641508f,
+  0.0132084f,   0.0581128f,    0.0231444f,    0.468587f,    0.258838f,
+  0.0296665f,   0.0562801f,    0.630014f,     0.381816f,    -0.269761f,
+  -0.135515f,   0.046186f,     1.07632f,      -0.050616f,   0.104987f,
+  0.29991f,     0.119316f,     0.117248f,     0.0795009f,   0.242573f,
+  0.0416634f,   -0.0577639f,   -0.0974078f,   0.106255f,    -0.13098f,
+  0.0141486f,   -0.00418257f,  0.144848f,     -0.463934f,   0.0452591f,
+  0.252617f,    0.205222f,     -0.189843f,    0.0652245f,   -0.135386f,
+  0.0500646f,   -0.200368f,    -0.0142312f,   -0.0286832f,  -0.254355f,
+  -1.02752f,    -0.73549f,     0.0364518f,    0.0416227f,   -0.13185f,
+  -0.0886515f,  -0.502314f,    -0.102916f,    0.410911f,    -0.355655f,
+  0.400416f,    -0.340217f,    0.208829f,     0.245972f,    0.149739f,
+  -0.49458f,    0.589482f,     0.550827f,     0.912709f,    -0.351275f,
+  -0.128076f,   -0.285172f,    -0.672752f,    0.090583f,    -0.245286f,
+  -0.737297f,   -0.201515f,    -0.025122f,    -0.109854f,   0.36738f
+};
+
+static const float av1_simple_motion_search_prune_part_logits_bias_64[] = {
+  0.346819f,  0.442965f,  -0.0216032f,  0.0229235f, -0.402797f,
+  -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_64 = {
+  NUM_FEATURES_64,
+  NUM_LOGITS_64,
+  NUM_HIDDEN_LAYERS_64,
+  {
+      NUM_LAYER_0_UNITS_64,
+  },
+  {
+      av1_simple_motion_search_prune_part_layer_0_kernel_64,
+      av1_simple_motion_search_prune_part_logits_kernel_64,
+  },
+  {
+      av1_simple_motion_search_prune_part_layer_0_bias_64,
+      av1_simple_motion_search_prune_part_logits_bias_64,
+  },
+};
+
+#undef NUM_HIDDEN_LAYERS_64
+#undef NUM_FEATURES_64
+#undef NUM_LAYER_0_UNITS_64
+#undef NUM_LOGITS_64
+
+#define NUM_HIDDEN_LAYERS_32 1
+#define NUM_FEATURES_32 25
+#define NUM_LAYER_0_UNITS_32 28
+#define NUM_LOGITS_32 10
+
+static const float av1_simple_motion_search_prune_part_logits_kernel_32[] = {
+  0.486581f,    0.340847f,   -0.109226f,   0.467224f,   -0.541561f,
+  0.0943619f,   -0.429442f,  -0.207442f,   0.959963f,   0.618666f,
+  -0.0636751f,  0.144508f,   -0.0278289f,  0.332293f,   -0.751493f,
+  0.245438f,    -0.917758f,  0.612128f,    -0.32648f,   0.534618f,
+  -0.615239f,   2.71641f,    0.233759f,    0.820558f,   -0.249758f,
+  -0.427783f,   -0.359361f,  0.0375732f,   0.806973f,   0.352512f,
+  -0.0532192f,  0.0576861f,  -0.464178f,   -0.334877f,  -0.697042f,
+  0.0538218f,   0.0919659f,  -0.00765812f, 0.0603847f,  -0.460315f,
+  0.37979f,     -0.0867612f, -0.670683f,   -0.188619f,  -0.570586f,
+  0.233418f,    0.153581f,   0.290905f,    -0.624885f,  -0.557842f,
+  -0.555567f,   0.463773f,   -0.123909f,   -0.277731f,  0.0374468f,
+  0.409903f,    0.287638f,   -0.593066f,   -0.223434f,  0.154263f,
+  -0.250464f,   -0.077696f,  0.229652f,    -0.304174f,  0.308053f,
+  0.33155f,     -0.502825f,  0.361216f,    -0.499294f,  0.00595444f,
+  -0.307201f,   0.5766f,     -0.438384f,   -0.093701f,  -0.118586f,
+  0.202337f,    -0.486623f,  0.261552f,    0.139756f,   -0.655642f,
+  -0.0627001f,  -0.213053f,  -0.243037f,   0.205918f,   0.0718368f,
+  0.188041f,    0.141529f,   -0.132239f,   0.425827f,   -0.218353f,
+  0.153114f,    0.33268f,    0.0226116f,   0.167394f,   0.269854f,
+  -0.457001f,   0.1973f,     -0.526087f,   0.467528f,   0.290934f,
+  1.16267f,     0.0823663f,  -0.754389f,   -0.83716f,   0.270157f,
+  -1.41229f,    0.148511f,   -0.286832f,   0.664796f,   0.492254f,
+  0.360567f,    -0.533993f,  0.0435672f,   -0.103001f,  0.220668f,
+  0.594621f,    -0.0213356f, -0.347638f,   -0.694457f,  0.0759505f,
+  0.161358f,    -0.389384f,  -0.0455192f,  -0.61252f,   -0.174173f,
+  -0.00788878f, -1.22487f,   0.332233f,    -0.0457021f, -0.225918f,
+  -0.197657f,   -0.115408f,  -0.240589f,   -2.05681f,   0.00914629f,
+  -1.92213f,    0.0268578f,  -0.49076f,    -0.0120123f, 0.291157f,
+  0.267116f,    -0.0775724f, 0.181115f,    -0.392441f,  -0.488114f,
+  -0.28842f,    -0.115465f,  0.128974f,    -0.0829899f, -0.14096f,
+  -0.140145f,   -0.700281f,  0.0368945f,   -0.437598f,  0.243485f,
+  -1.00301f,    0.332324f,   0.125014f,    -0.0604481f, -0.0652028f,
+  -0.207295f,   -1.0209f,    -0.341525f,   0.191326f,   -0.147578f,
+  0.0878327f,   0.129827f,   -0.0848319f,  0.187381f,   -1.28663f,
+  0.00537885f,  -0.134277f,  -0.0411126f,  -0.3434f,    -0.0456494f,
+  0.37861f,     0.409095f,   0.237177f,    -0.396855f,  -0.205418f,
+  -1.31701f,    -0.319032f,  -0.123404f,   -0.240005f,  -0.305206f,
+  -0.0258176f,  -0.26367f,   -0.142396f,   0.191672f,   -1.44061f,
+  0.0554776f,   -0.571839f,  -0.284789f,   -0.425677f,  -0.0307376f,
+  0.20275f,     -0.223146f,  0.144612f,    0.0212636f,  0.0238303f,
+  -0.253802f,   -0.188922f,  -0.0637066f,  -0.340836f,  0.124774f,
+  0.130474f,    -0.154099f,  -0.0292733f,  0.158148f,   -0.246989f,
+  -0.259059f,   0.220224f,   0.228449f,    -0.41956f,   -0.321848f,
+  -0.2396f,     -0.316449f,  -1.3363f,     0.0264099f,  -1.46865f,
+  0.113073f,    0.0722885f,  -0.166986f,   -0.164877f,  0.0360911f,
+  0.534472f,    -0.551152f,  -0.328501f,   0.0781121f,  -0.378112f,
+  -0.459502f,   0.28015f,    -0.212302f,   -0.521641f,  0.618993f,
+  -0.347709f,   0.266253f,   -0.0280894f,  0.348511f,   -0.0155031f,
+  -0.100693f,   0.0447673f,  0.277519f,    -0.233998f,  -0.0796738f,
+  -1.73644f,    -0.160776f,  0.53092f,     -0.180406f,  0.056447f,
+  0.385356f,    -0.262337f,  -0.241479f,   -0.271426f,  -0.457354f,
+  -0.266788f,   0.367371f,   -0.103065f,   0.47783f,    -0.188327f,
+  -0.159636f,   0.00142907f, -0.409756f,   0.454889f,   -0.24566f,
+  -0.0760084f,  0.286355f,   0.462102f,    0.0431695f,  -0.127395f,
+  -0.200476f,   -0.350557f,  0.217275f,    -0.23975f,   0.255148f,
+  -0.280626f,   0.42476f,    0.157411f,    0.0358675f,  -0.192591f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_bias_32[] = {
+  0.940498f,  0.15602f,   -0.234831f, 0.0268585f, 0.144769f,  0.243081f,
+  0.611406f,  0.366093f,  0.361868f,  0.39668f,   0.401479f,  0.369467f,
+  0.0909503f, 0.710595f,  0.032786f,  0.525891f,  -1.0232f,   0.732557f,
+  -0.064425f, 0.865222f,  -0.042917f, -0.237191f, -0.527006f, -0.0172101f,
+  0.59681f,   -0.472405f, 0.0969218f, -0.250624f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_kernel_32[] = {
+  0.355607f,    0.126701f,    -0.0825159f,  0.200675f,     -0.011308f,
+  -0.280057f,   0.559816f,    0.142689f,    0.0422419f,    -0.151692f,
+  -0.0275637f,  -0.283101f,   -0.20822f,    -0.200394f,    0.465427f,
+  0.344491f,    -0.525319f,   -0.358813f,   -0.39767f,     0.0974486f,
+  0.00559058f,  -0.00546089f, 0.0506486f,   0.114475f,     -0.0436463f,
+  -0.574152f,   -0.376294f,   0.16563f,     -0.0967032f,   0.00579838f,
+  0.0639909f,   -0.037129f,   0.407574f,    -0.231428f,    0.489326f,
+  -0.221566f,   -0.270382f,   -0.784628f,   -0.155502f,    0.481698f,
+  -0.0296057f,  0.431855f,    0.840807f,    0.112291f,     0.773874f,
+  -0.0610936f,  -0.012892f,   0.365154f,    0.0267687f,    -0.0751114f,
+  0.25043f,     0.516472f,    -0.186133f,   -0.12762f,     -0.168804f,
+  -0.146309f,   0.139314f,    -0.367113f,   -0.601079f,    0.0559856f,
+  0.176081f,    0.22397f,     0.434113f,    0.0363256f,    0.313051f,
+  0.0143976f,   0.190076f,    0.474607f,    -0.681134f,    -0.0709097f,
+  -0.253289f,   -0.216277f,   -0.0593789f,  -0.107795f,    -0.194842f,
+  0.513945f,    0.239171f,    -0.720561f,   0.0136723f,    -0.391147f,
+  -0.272043f,   -0.164766f,   0.124248f,    0.147178f,     -0.35497f,
+  0.397725f,    -0.117603f,   0.262937f,    -0.331964f,    0.182418f,
+  0.315671f,    -0.0385649f,  0.488769f,    -0.334568f,    0.00596018f,
+  0.0661557f,   -0.0446985f,  -0.0928255f,  -0.0221032f,   -0.019045f,
+  -0.20881f,    0.197907f,    -0.381881f,   0.0598071f,    -0.0434551f,
+  0.159283f,    -0.110631f,   0.266996f,    -0.0265494f,   0.135199f,
+  -0.00833162f, 0.804482f,    -0.114698f,   -0.15066f,     -0.479553f,
+  0.448407f,    -0.344069f,   -0.0280952f,  -0.208211f,    -0.102269f,
+  -0.679066f,   -0.37476f,    -0.0228875f,  0.0535049f,    0.111015f,
+  -0.18125f,    -0.167584f,   0.0110497f,   0.262723f,     -0.413839f,
+  -0.0611238f,  0.358499f,    0.0807514f,   0.208254f,     0.214499f,
+  0.11137f,     -0.14262f,    -0.0513973f,  0.243718f,     -0.373716f,
+  -0.00413366f, 0.216501f,    -0.164149f,   -0.064935f,    -0.0840282f,
+  0.0566148f,   0.0377686f,   0.289835f,    0.769388f,     0.891198f,
+  -0.592739f,   0.40744f,     -0.153095f,   0.657311f,     0.140737f,
+  0.28209f,     0.158344f,    0.353546f,    0.0868246f,    0.116887f,
+  0.402004f,    0.437184f,    0.589219f,    0.760594f,     -0.575419f,
+  -0.754308f,   -0.709219f,   -0.297814f,   -0.418609f,    -0.0262104f,
+  0.0411959f,   0.0597708f,   -0.143728f,   -0.136642f,    0.099614f,
+  -0.257601f,   -0.2404f,     0.305893f,    0.254009f,     -0.0301398f,
+  -0.0653091f,  -0.459002f,   -0.163404f,   0.123152f,     -0.0284252f,
+  -0.457272f,   0.00788622f,  -0.828399f,   -0.0534199f,   0.586877f,
+  0.982728f,    0.424581f,    0.0891856f,   0.383182f,     -0.122053f,
+  0.0808408f,   -0.00384914f, -0.0560201f,  -0.0524772f,   -0.263444f,
+  -0.239287f,   -0.882777f,   0.0180592f,   -0.0948711f,   -0.177946f,
+  0.0296473f,   0.096082f,    0.0455604f,   -0.108608f,    0.00777951f,
+  -0.140896f,   0.117187f,    -0.342467f,   -0.0691604f,   0.0761611f,
+  -0.0892053f,  0.111386f,    -0.167456f,   1.40616f,      -0.00478793f,
+  0.00547665f,  -0.0441829f,  0.0151323f,   -0.0674099f,   -0.0380578f,
+  0.16072f,     0.31882f,     0.245486f,    -0.424318f,    0.101845f,
+  -0.203343f,   -0.197402f,   -0.163025f,   -0.0771961f,   -0.264435f,
+  0.319429f,    0.250076f,    0.782726f,    0.386003f,     0.00700673f,
+  -0.375715f,   0.151453f,    -0.296265f,   -0.560183f,    -0.00767249f,
+  -0.109593f,   -0.119419f,   -0.0161516f,  0.0380283f,    -0.156417f,
+  0.131708f,    0.396268f,    -0.221796f,   0.232099f,     0.128852f,
+  0.0567268f,   0.297297f,    0.173269f,    0.213411f,     0.0384426f,
+  -0.290985f,   -0.0426841f,  -0.488292f,   -0.087101f,    -0.311582f,
+  0.83009f,     -0.153163f,   0.903335f,    -1.15644f,     -0.0378635f,
+  -0.0552129f,  -0.126362f,   -0.176945f,   0.0653115f,    0.0989368f,
+  -0.333543f,   -0.330586f,   0.29775f,     -0.103535f,    0.210824f,
+  -0.00300509f, 0.317105f,    0.216852f,    0.479718f,     0.0485808f,
+  -0.15662f,    0.718199f,    0.327513f,    0.115169f,     -0.423598f,
+  -0.456633f,   -0.575814f,   -0.494454f,   0.304411f,     0.0493055f,
+  -0.381171f,   0.467251f,    -0.122872f,   -0.167441f,    0.017253f,
+  -0.0583646f,  -0.1586f,     0.214046f,    -0.0284424f,   -0.217112f,
+  0.606567f,    -0.107533f,   0.36615f,     -0.0709227f,   0.604761f,
+  -0.244657f,   -0.296651f,   -0.595611f,   -0.156629f,    -0.693468f,
+  -0.310603f,   0.499272f,    0.282941f,    0.295043f,     -0.178704f,
+  0.281186f,    0.014329f,    -0.120819f,   0.154234f,     0.0131325f,
+  -0.472231f,   -0.631281f,   0.422955f,    0.711432f,     -0.118025f,
+  0.0864996f,   0.343971f,    -0.301477f,   -0.246638f,    0.165068f,
+  0.218044f,    0.224236f,    -0.0848522f,  0.00671216f,   0.401141f,
+  -0.218857f,   -0.0298495f,  -0.135725f,   -0.377618f,    0.022473f,
+  0.106955f,    -0.0582005f,  0.0468484f,   -0.0217442f,   0.130911f,
+  -0.0926905f,  0.383007f,    -0.159353f,   -0.222711f,    -0.0286419f,
+  0.372315f,    -0.469095f,   0.797571f,    -0.301315f,    0.239327f,
+  -0.997507f,   -0.363409f,   0.353717f,    0.676686f,     -0.0500028f,
+  0.0638539f,   -0.431927f,   0.243852f,    0.000884826f,  -0.00166585f,
+  0.0613292f,   -0.029558f,   -0.0248432f,  -0.0125607f,   -0.0309674f,
+  -0.743308f,   0.0409806f,   0.0921015f,   0.167816f,     0.406849f,
+  0.095677f,    0.0308913f,   0.139956f,    -0.400472f,    0.396617f,
+  0.936517f,    0.355057f,    -0.423816f,   -0.232472f,    -0.220188f,
+  -0.399746f,   -0.409623f,   -0.158797f,   0.361153f,     0.0327019f,
+  0.0690844f,   -0.032197f,   0.0248558f,   0.00438518f,   0.0222724f,
+  -0.326832f,   -0.314295f,   0.156563f,    0.0562703f,    0.332694f,
+  0.299424f,    0.228206f,    0.322038f,    0.0136098f,    0.0060297f,
+  -0.165851f,   -0.306512f,   0.0796508f,   -0.37158f,     0.239395f,
+  -0.349442f,   0.198515f,    -0.253854f,   -1.13694f,     0.0202873f,
+  -0.0504009f,  -0.130528f,   -0.017126f,   -0.0370001f,   -0.087458f,
+  -0.119952f,   -0.130404f,   0.0333733f,   -0.184736f,    0.182162f,
+  0.227776f,    -0.166563f,   -0.156162f,   0.118215f,     -0.220183f,
+  0.00474779f,  -0.107792f,   0.260493f,    0.11884f,      0.156587f,
+  0.303936f,    -0.131788f,   -0.314774f,   0.310606f,     0.0935523f,
+  0.790767f,    0.26461f,     0.0236426f,   0.0629469f,    0.0344072f,
+  -0.151513f,   0.211498f,    0.0245435f,   0.0629973f,    0.052019f,
+  -0.03308f,    0.123487f,    0.0885027f,   0.159172f,     -0.0510615f,
+  0.0298033f,   -0.130515f,   -0.121799f,   -0.104915f,    0.208822f,
+  -0.310496f,   -0.314106f,   0.303307f,    -0.0196736f,   0.0420045f,
+  0.461777f,    -0.433699f,   0.00345407f,  0.703139f,     -0.655637f,
+  -0.210767f,   -0.201278f,   0.163694f,    -0.236534f,    0.300877f,
+  0.0769982f,   -0.282453f,   0.149721f,    -0.0303466f,   -0.191473f,
+  -0.406056f,   -0.213472f,   0.1619f,      -0.245953f,    0.00544399f,
+  -0.121434f,   0.193012f,    -0.307165f,   1.45431f,      -0.161468f,
+  -0.12444f,    -0.146129f,   -0.0528212f,  -0.0925165f,   -0.134528f,
+  -0.479475f,   0.315525f,    0.133845f,    0.382158f,     -0.0799693f,
+  -0.151041f,   0.255772f,    0.409536f,    -0.240663f,    -0.323741f,
+  -0.205876f,   0.03699f,     -0.217541f,   0.108511f,     0.640628f,
+  0.705993f,    -0.423899f,   -0.78314f,    -0.100733f,    -0.00859087f,
+  0.0251879f,   0.0458335f,   0.00210128f,  -0.047576f,    -0.0560518f,
+  -1.23869f,    -0.829914f,   0.0346551f,   0.350505f,     0.193688f,
+  0.459154f,    0.137898f,    0.503818f,    0.260867f,     0.649539f,
+  0.0150802f,   0.0239274f,   -0.276069f,   -0.0621478f,   -0.193106f,
+  -0.0375665f,  -0.654529f,   0.189493f,    0.446625f,     -0.0208265f,
+  0.019838f,    -0.0201955f,  0.00180428f,  -0.0110678f,   -0.0172414f,
+  0.0276489f,   -0.252882f,   -0.0351807f,  -0.0518874f,   0.279098f,
+  -0.245122f,   0.101287f,    -0.114202f,   -0.0812187f,   0.572429f,
+  -0.0821731f,  0.564183f,    0.0222552f,   0.190111f,     -0.0417497f,
+  -0.00385925f, -0.182995f,   -0.240482f,   -0.291572f,    -0.0450444f,
+  0.0962974f,   -0.165973f,   -0.0954637f,  -0.163841f,    -0.833405f,
+  -1.31541f,    -0.336473f,   -0.0920702f,  0.816105f,     0.393377f,
+  0.0340241f,   -0.0844545f,  0.61729f,     -0.17596f,     0.241149f,
+  -0.42825f,    -0.59091f,    -0.290702f,   0.0796465f,    0.0982819f,
+  0.466934f,    0.261666f,    0.0373333f,   0.332509f,     -0.0266694f,
+  -0.0476951f,  -0.00642167f, -0.0132542f,  -0.000320841f, 0.00475532f,
+  0.000502778f, 0.296534f,    -0.13297f,    -0.113082f,    -0.327923f,
+  0.35901f,     -0.302246f,   0.189799f,    -0.37994f,     0.16107f,
+  -0.20414f,    0.548575f,    -0.460821f,   0.591878f,     -0.213113f,
+  -0.169373f,   -0.07332f,    0.228841f,    0.682302f,     -0.0665316f,
+  -0.142456f,   -0.0873117f,  0.00607451f,  0.0376443f,    0.0536673f,
+  -0.0109536f,  -0.400279f,   0.550058f,    0.820871f,     -0.666373f,
+  -0.471962f,   -0.315925f,   -0.313142f,   0.952742f,     0.473928f,
+  -0.119006f,   0.153241f,    -0.0383078f,  0.631869f,     -0.343423f,
+  -0.233473f,   -0.218195f,   -0.077688f,   -0.728291f,    0.0382408f,
+  -0.00662886f, -0.0419666f,  0.0309776f,   -0.0281592f,   0.0154229f,
+  -0.198534f,   0.0206324f,   0.0152272f,   -0.235067f,    0.0330486f,
+  0.139198f,    -0.0612118f,  0.133154f,    -0.258675f,    0.0900275f,
+  -0.127771f,   0.157322f,    -0.00767807f, -0.329258f,    0.327458f,
+  0.0528581f,   -0.181125f,   0.409995f,    -0.162979f,    -0.0193475f,
+  0.186009f,    0.0519501f,   0.651877f,    -0.37821f,     -1.10341f,
+  -0.189776f,   -0.0922788f,  0.460256f,    0.168011f,     0.440295f,
+  0.478135f,    0.374573f,    0.384048f,    0.116953f,     0.68886f,
+  -0.427727f,   -0.36676f,    -0.500013f,   -0.228685f,    -0.218859f,
+  0.208396f,    -0.0173765f,  -0.0680241f,  -0.00538013f,  -0.0674409f,
+  -0.092764f,   0.0295707f,   -0.0462887f,  -0.00636006f,  0.0334169f
+};
+
+static const float av1_simple_motion_search_prune_part_logits_bias_32[] = {
+  0.176459f,  0.154405f, 0.281821f,  0.375264f,  -0.882863f,
+  -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_32 = {
+  NUM_FEATURES_32,
+  NUM_LOGITS_32,
+  NUM_HIDDEN_LAYERS_32,
+  {
+      NUM_LAYER_0_UNITS_32,
+  },
+  {
+      av1_simple_motion_search_prune_part_layer_0_kernel_32,
+      av1_simple_motion_search_prune_part_logits_kernel_32,
+  },
+  {
+      av1_simple_motion_search_prune_part_layer_0_bias_32,
+      av1_simple_motion_search_prune_part_logits_bias_32,
+  },
+};
+
+#undef NUM_HIDDEN_LAYERS_32
+#undef NUM_FEATURES_32
+#undef NUM_LAYER_0_UNITS_32
+#undef NUM_LOGITS_32
+
+#define NUM_HIDDEN_LAYERS_16 1
+#define NUM_FEATURES_16 25
+#define NUM_LAYER_0_UNITS_16 32
+#define NUM_LOGITS_16 10
+
+static const float av1_simple_motion_search_prune_part_logits_kernel_16[] = {
+  -0.520913f,   0.395611f,    0.0369091f,   -0.318591f,  -0.463252f,
+  0.134992f,    -0.43154f,    -0.0739112f,  -0.118817f,  0.476373f,
+  -0.281406f,   0.3413f,      0.456255f,    0.33307f,    0.2942f,
+  0.1317f,      0.498113f,    1.95406f,     -0.165726f,  -0.219306f,
+  -0.302656f,   -1.31157f,    -0.433662f,   0.151716f,   -0.214817f,
+  0.504523f,    -0.710049f,   0.359616f,    -0.412695f,  -0.103193f,
+  0.341912f,    0.351378f,    -0.181486f,   0.573862f,   -0.0396254f,
+  -0.17855f,    -0.276163f,   0.0367465f,   -0.353905f,  -0.204689f,
+  0.309581f,    -0.0439686f,  -0.147855f,   0.152745f,   0.290871f,
+  0.131049f,    -0.27808f,    -0.142997f,   0.207843f,   -1.23074f,
+  -0.267714f,   -0.336923f,   0.313781f,    -0.61488f,   -0.161984f,
+  0.238059f,    -0.0879942f,  -0.085543f,   -0.260156f,  -0.13614f,
+  -0.242196f,   0.201216f,    -0.248691f,   0.0936671f,  -0.350522f,
+  -0.35002f,    -0.156583f,   -0.00579001f, 0.300578f,   -0.341269f,
+  -0.290712f,   0.354802f,    -0.31629f,    0.509107f,   -0.236953f,
+  -0.0923519f,  0.544509f,    -0.280991f,   -0.017437f,  -0.202721f,
+  -0.116388f,   -0.7191f,     0.324586f,    0.254249f,   0.125505f,
+  0.00658697f,  -0.333322f,   -0.126537f,   -0.140004f,  -0.0241202f,
+  -0.172466f,   0.210035f,    -0.270833f,   0.0579044f,  0.0950352f,
+  -0.120382f,   0.063292f,    -0.394925f,   0.482165f,   0.147753f,
+  0.331465f,    -0.187444f,   0.1083f,      0.414028f,   0.279238f,
+  -0.486889f,   -0.674349f,   -0.313656f,   -0.131186f,  -0.100662f,
+  0.238191f,    -1.19083f,    -0.30667f,    -2.4324f,    0.235311f,
+  0.108605f,    1.67197f,     0.476157f,    0.30055f,    0.0839538f,
+  0.408469f,    -0.473517f,   0.560283f,    -0.0188136f, 0.273824f,
+  -0.43707f,    -0.0346978f,  -0.438315f,   -0.0196275f, -0.0567921f,
+  -0.220166f,   0.216175f,    -0.0180461f,  0.0116429f,  -0.0096949f,
+  -0.32613f,    0.176829f,    -0.243563f,   -0.240972f,  -0.621819f,
+  -0.00619648f, -0.145525f,   0.124324f,    -0.0306925f, 0.172208f,
+  -2.04631f,    -0.200087f,   -0.594135f,   -0.352303f,  -0.309826f,
+  0.0922786f,   -0.698371f,   -0.0366823f,  0.0244036f,  0.338775f,
+  -0.115947f,   0.144971f,    -0.0607037f,  -0.762412f,  0.0125584f,
+  -0.262427f,   -0.0830273f,  -0.291252f,   -0.176059f,  -0.203983f,
+  0.0871455f,   -0.0894925f,  0.0426263f,   -0.060001f,  -0.542355f,
+  -0.407837f,   -0.0419273f,  0.226608f,    -0.114844f,  0.158733f,
+  -0.187237f,   0.113163f,    -1.86337f,    -0.367544f,  -0.547048f,
+  -0.24192f,    -0.226764f,   0.090912f,    0.819604f,   0.433766f,
+  -0.841657f,   0.446987f,    -0.622761f,   -0.0296385f, -0.130176f,
+  -0.0518136f,  -0.640326f,   -0.330107f,   -0.137832f,  -0.0119033f,
+  0.39401f,     0.111331f,    -0.141367f,   -0.230289f,  0.171054f,
+  -0.924059f,   -0.107317f,   -0.347983f,   0.0261109f,  0.423002f,
+  -0.305817f,   0.247696f,    0.0436002f,   0.0305862f,  -1.52448f,
+  -0.595587f,   -0.155552f,   -1.11949f,    -0.513937f,  0.138347f,
+  -0.301487f,   0.352144f,    -0.615801f,   0.0326701f,  -0.215322f,
+  -0.0608176f,  -0.416557f,   -0.306073f,   -0.441512f,  -0.0569277f,
+  -0.709768f,   -0.602527f,   -0.311134f,   0.152471f,   -0.255299f,
+  0.354505f,    0.194464f,    0.0144251f,   0.110732f,   -0.4452f,
+  -0.804814f,   0.205325f,    -0.0957486f,  0.502684f,   0.09112f,
+  -0.533087f,   -1.77979f,    0.556992f,    -0.176157f,  -0.642633f,
+  0.11553f,     -0.232561f,   0.161277f,    -0.0631125f, -0.20759f,
+  0.489253f,    -0.067533f,   0.0231024f,   -0.179831f,  -0.272985f,
+  -0.390059f,   0.3089f,      0.185733f,    -0.257065f,  -0.508838f,
+  -0.550028f,   0.0665621f,   -0.138288f,   -0.413188f,  0.191193f,
+  -1.32969f,    -0.431025f,   0.270242f,    -0.340062f,  0.0817257f,
+  0.0376051f,   -0.18633f,    0.0828274f,   0.00670051f, -0.431295f,
+  -0.450316f,   -0.173042f,   -0.322248f,   0.370628f,   0.10019f,
+  0.317293f,    -0.266613f,   0.0752441f,   -0.425656f,  -0.112223f,
+  0.557991f,    -0.324368f,   -0.195261f,   -0.0526129f, -0.807472f,
+  -0.387466f,   0.192186f,    0.353213f,    -0.120238f,  0.107686f,
+  0.200678f,    -0.75363f,    0.466857f,    -0.282345f,  -0.0849236f,
+  -0.0490695f,  -0.00643182f, 0.123047f,    -0.207805f,  -0.130456f,
+  -1.09455f,    0.340973f,    0.334784f,    0.0706643f,  -1.65681f,
+  -0.319952f,   -0.198514f,   -0.0787972f,  0.089524f,   0.0531034f,
+  -0.202705f,   -0.0852339f,  -0.62572f,    -0.0734234f, -0.838088f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_bias_16[] = {
+  -0.0616197f, 0.939947f, 0.521161f,  0.213886f,  0.130324f,  -0.127443f,
+  -0.0538715f, 0.708746f, 0.445031f,  0.418781f,  -0.114539f, 0.521941f,
+  1.13719f,    0.606545f, -0.32193f,  -0.150788f, 0.158487f,  -0.224005f,
+  0.654715f,   0.115729f, -0.286506f, -2.06223f,  0.0117697f, 0.503905f,
+  -0.102339f,  0.653256f, -0.813561f, 0.905235f,  -0.417269f, -0.206265f,
+  0.661496f,   0.95533f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_kernel_16[] = {
+  -0.203489f,   0.00686229f,  -0.161414f,   0.0637276f,   0.27516f,
+  0.512219f,    0.164205f,    0.00326062f,  -0.41914f,    -0.400334f,
+  0.554419f,    0.715772f,    -0.295569f,   -0.703503f,   0.0137744f,
+  -0.0934259f,  0.174234f,    -0.148618f,   -0.0360558f,  -0.0986598f,
+  -0.138502f,   -0.0770713f,  0.122922f,    -0.00784415f, 0.0953234f,
+  -0.255754f,   -0.310967f,   0.185306f,    0.464554f,    0.147338f,
+  -0.0612304f,  0.164783f,    0.301097f,    0.161364f,    -0.12723f,
+  -0.0265984f,  -0.471361f,   0.0578776f,   -0.362865f,   0.425789f,
+  0.402758f,    -0.190235f,   0.00549738f,  -0.570908f,   1.27206f,
+  0.048868f,    -0.0097675f,  0.0708324f,   0.0456103f,   0.0149062f,
+  -0.563032f,   -0.420573f,   0.107278f,    0.0938258f,   0.142712f,
+  -0.00251036f, -0.250583f,   0.522272f,    0.0113175f,   0.126751f,
+  -0.433028f,   -0.035542f,   -0.536686f,   -0.0668722f,  0.253094f,
+  0.254007f,    -0.435505f,   0.343001f,    0.0531542f,   -0.361914f,
+  -0.102664f,   0.0404874f,   0.132686f,    0.0762298f,   0.0236971f,
+  -0.419454f,   0.230877f,    -0.223714f,   0.037813f,    0.0818604f,
+  0.383705f,    -0.235028f,   -0.0554801f,  0.429851f,    0.0845829f,
+  0.166295f,    0.355111f,    -0.421197f,   0.298949f,    0.0218224f,
+  0.445705f,    -0.392217f,   -0.429578f,   -0.076276f,   -0.0963531f,
+  -0.631425f,   -0.225977f,   8.06349e-06f, 0.0676679f,   0.0779651f,
+  0.0706891f,   0.101377f,    0.517103f,    0.0945502f,   -0.52522f,
+  -0.312022f,   0.0358089f,   0.616509f,    -0.0507444f,  -0.465814f,
+  -0.0326024f,  0.591298f,    0.188544f,    -0.0633316f,  -0.199987f,
+  0.403118f,    -0.511281f,   -0.696263f,   0.112996f,    0.103875f,
+  0.0495595f,   -0.0107449f,  0.521539f,    -0.0123823f,  -0.0642751f,
+  0.08548f,     -0.0679207f,  0.526558f,    0.0651114f,   -0.342643f,
+  -0.349934f,   0.307437f,    0.368763f,    -0.194851f,   -0.134117f,
+  0.102448f,    -0.0520666f,  0.0415824f,   -0.175085f,   0.272685f,
+  0.0675856f,   0.120627f,    0.391408f,    -0.135249f,   -0.357024f,
+  0.019666f,    -0.0622677f,  0.407427f,    0.22655f,     -0.129432f,
+  -0.165327f,   0.004893f,    0.5479f,      0.0613981f,   -0.479682f,
+  -0.144228f,   -0.130106f,   0.206458f,    -0.342086f,   0.12691f,
+  -0.113554f,   0.231164f,    -0.051419f,   0.0401286f,   -0.560429f,
+  -0.070609f,   0.420232f,    0.442465f,    -0.237501f,   -0.000293732f,
+  -1.017f,      -0.210222f,   0.0157063f,   0.0488178f,   0.0734721f,
+  -0.52626f,    -0.276441f,   -0.521579f,   0.443532f,    -0.0819051f,
+  -0.0732633f,  -0.17999f,    0.258525f,    -0.0374872f,  0.150115f,
+  0.0510939f,   0.168116f,    0.473372f,    0.824489f,    0.302195f,
+  -0.348613f,   0.238569f,    0.176444f,    -0.633945f,   -0.0567195f,
+  -0.0305827f,  -0.0551851f,  0.85822f,     -0.0628099f,  0.0364294f,
+  -0.234823f,   0.179067f,    0.143208f,    -0.0511014f,  -0.404191f,
+  0.428035f,    0.0235506f,   0.371991f,    -0.312909f,   0.550933f,
+  -0.389265f,   -0.271813f,   -0.293461f,   -0.583752f,   0.179991f,
+  0.191698f,    0.659094f,    1.07941f,     -0.509555f,   -0.100638f,
+  0.079988f,    -0.0519107f,  -0.112723f,   -0.0663326f,  0.0353569f,
+  -0.795055f,   -0.465999f,   0.283579f,    0.340913f,    0.152738f,
+  0.294664f,    0.527839f,    0.187735f,    0.359461f,    0.164629f,
+  0.107512f,    0.390402f,    0.236702f,    0.114674f,    -0.525655f,
+  -0.555476f,   -0.6589f,     -0.266601f,   -0.0946547f,  0.6306f,
+  0.0248513f,   0.038497f,    0.432706f,    -0.0715465f,  0.0410172f,
+  -0.115313f,   -0.428684f,   0.136283f,    0.0913185f,   0.11277f,
+  0.0968689f,   -0.00437052f, 0.0888981f,   0.10304f,     0.02442f,
+  -0.211315f,   0.00981596f,  -0.0974827f,  0.208611f,    0.140644f,
+  0.0315567f,   0.350332f,    -0.291049f,   -0.0715449f,  -0.352992f,
+  -0.858004f,   0.828658f,    0.439092f,    0.0151291f,   0.0503828f,
+  0.0656112f,   -0.710749f,   -0.0951757f,  0.193908f,    0.00908018f,
+  0.141486f,    -0.0657711f,  0.099791f,    0.153729f,    -0.419576f,
+  -0.892636f,   -0.0449268f,  -0.170786f,   -0.156564f,   0.384511f,
+  0.296565f,    0.0569815f,   -0.103938f,   1.27479f,     -0.0406475f,
+  0.154083f,    -0.186442f,   0.0282588f,   0.0312102f,   -0.188994f,
+  0.284243f,    -0.564693f,   0.425525f,    -0.00924596f, 0.810003f,
+  0.233812f,    -0.0180273f,  0.121082f,    -0.209096f,   0.151437f,
+  0.286921f,    -0.348095f,   0.174813f,    -0.413798f,   0.108994f,
+  -0.34266f,    -0.0337981f,  -0.459f,      -0.409812f,   -0.0890104f,
+  0.0834802f,   -0.00259191f, -0.105914f,   -0.164207f,   0.0697689f,
+  -0.312098f,   -0.00650536f, -0.486758f,   -0.248486f,   0.24314f,
+  -0.0857144f,  0.0884781f,   -0.65615f,    -0.121744f,   0.0709335f,
+  -0.0237193f,  0.10764f,     -0.0409452f,  -0.0824305f,  0.42329f,
+  0.138258f,    0.502607f,    0.228545f,    0.0687789f,   0.0361586f,
+  0.39074f,     0.0722654f,   -0.0133148f,  0.283278f,    0.0743384f,
+  0.310292f,    -0.297675f,   -0.359935f,   0.521021f,    -0.10082f,
+  -0.272333f,   0.0120283f,   0.138118f,    -0.123711f,   -0.0711386f,
+  0.0170747f,   0.831039f,    0.0509626f,   0.790608f,    -0.0863406f,
+  -0.31962f,    0.0631013f,   0.0873453f,   -0.472331f,   -0.0826027f,
+  -0.241722f,   0.148835f,    -0.131611f,   0.000195347f, -0.0615804f,
+  -0.838663f,   -0.586979f,   0.247713f,    0.362254f,    0.492727f,
+  -0.132163f,   0.0516545f,   0.477838f,    -0.0395182f,  0.0124993f,
+  -0.771514f,   0.0386912f,   -0.118525f,   -0.346172f,   -0.265905f,
+  -0.175257f,   -0.406287f,   0.393837f,    0.409096f,    -0.408501f,
+  -0.0207146f,  0.0487809f,   0.0636982f,   0.0276368f,   0.0878249f,
+  0.0425889f,   0.0868633f,   0.17423f,     -0.128217f,   -0.477068f,
+  -0.321294f,   0.0393771f,   0.00812823f,  -0.350529f,   -0.129012f,
+  0.439953f,    0.396662f,    0.410475f,    -0.123129f,   -0.565966f,
+  0.0298635f,   -0.614611f,   -0.477514f,   0.453651f,    0.0617068f,
+  0.0530563f,   0.0479074f,   0.213551f,    0.039034f,    0.0449095f,
+  -1.06868f,    -1.2654f,     -0.175482f,   0.595068f,    -0.230095f,
+  0.719838f,    -0.272148f,   0.696564f,    0.0485396f,   0.468584f,
+  0.0695439f,   -0.0842122f,  -0.228978f,   0.161397f,    -0.000441421f,
+  -0.0297514f,  -0.250599f,   0.196656f,    0.608423f,    -0.0112096f,
+  0.0236881f,   -0.00167311f, 0.0040709f,   0.015495f,    0.00757698f,
+  -0.165886f,   0.359767f,    -0.0214696f,  0.377208f,    0.0303547f,
+  0.0657094f,   0.140775f,    0.21867f,     -0.203922f,   0.263878f,
+  -0.0529099f,  0.202438f,    -0.243226f,   0.156659f,    -0.627056f,
+  -0.845036f,   -0.500873f,   0.172588f,    0.402972f,    -0.147734f,
+  0.151792f,    -0.075579f,   0.443519f,    0.0311335f,   -0.0328222f,
+  -0.0299781f,  0.435956f,    -0.0987376f,  0.288402f,    0.135902f,
+  -0.173584f,   -0.186255f,   0.224524f,    -0.249645f,   0.123702f,
+  -0.0846244f,  0.491317f,    0.544846f,    0.338677f,    -0.258885f,
+  -0.617434f,   -0.629003f,   -0.347233f,   0.181262f,    -0.0606015f,
+  -0.537766f,   0.215089f,    -0.334527f,   0.0488534f,   0.0577997f,
+  -1.12431f,    -0.932292f,   -0.11559f,    0.573715f,    0.151128f,
+  0.693818f,    -0.16956f,    0.802591f,    -0.231531f,   1.04318f,
+  -0.476417f,   0.293452f,    -0.610136f,   0.27506f,     -0.384012f,
+  0.305366f,    -0.0540464f,  -0.337583f,   -0.174285f,   0.157248f,
+  0.0477345f,   -0.0229535f,  0.0475766f,   -0.00603319f, 0.00856119f,
+  -0.702893f,   -0.0579673f,  0.183024f,    -0.166222f,   0.109763f,
+  -0.148019f,   -0.258873f,   -0.0820157f,  -0.186716f,   -0.449265f,
+  -0.0534138f,  0.15732f,     0.46357f,     0.00502591f,  -0.0282085f,
+  0.152277f,    -0.855199f,   -0.357115f,   0.0366159f,   0.0131101f,
+  -0.0407758f,  0.0462835f,   0.146309f,    -0.00276278f, -0.0591814f,
+  -0.109437f,   0.506764f,    -0.044421f,   0.465907f,    0.114444f,
+  -0.241053f,   -0.362649f,   -0.432615f,   0.199989f,    -0.00635866f,
+  -0.521886f,   0.0958924f,   -0.485725f,   0.0430527f,   0.069746f,
+  0.681091f,    -0.288144f,   0.505671f,    0.0489065f,   -0.0373836f,
+  0.266079f,    0.145173f,    -0.011481f,   -0.225074f,   -0.754501f,
+  -0.122939f,   -0.294213f,   0.334738f,    0.281561f,    0.558977f,
+  -0.21551f,    -0.346507f,   -0.0625635f,  0.0782034f,   -0.236999f,
+  -0.803783f,   -0.601117f,   0.091192f,    0.636122f,    -0.250626f,
+  0.0354961f,   0.103915f,    0.508571f,    0.329911f,    -0.0425999f,
+  -0.0867587f,  -0.0385824f,  1.13914f,     -0.0261992f,  0.00484478f,
+  0.124603f,    -0.012173f,   -0.377358f,   -0.243563f,   0.236094f,
+  0.145663f,    -0.132752f,   0.347497f,    -0.529315f,   0.271632f,
+  -0.372805f,   0.0261836f,   0.126169f,    0.0941008f,   0.283773f,
+  0.765701f,    -0.226477f,   -0.181549f,   -0.306896f,   0.110165f,
+  -0.0784234f,  -0.0827892f,  -0.0374252f,  -0.0950872f,  -0.451015f,
+  -0.995793f,   -0.452663f,   0.293338f,    -0.380865f,   0.032683f,
+  0.0178248f,   0.0699194f,   -0.0811722f,  -0.0866096f,  0.139289f,
+  0.296604f,    0.192293f,    -0.0589607f,  -0.179878f,   0.00360266f,
+  -0.0905794f,  0.136744f,    -0.191555f,   1.31877f,     -0.0592033f,
+  -0.158766f,   0.0214746f,   -0.190113f,   -0.116671f,   0.0449292f,
+  -0.109533f,   -0.709307f,   0.386424f,    0.40201f,     0.262211f,
+  -0.155244f,   0.233988f,    -0.0166317f,  0.462665f,    0.0484462f,
+  0.210902f,    -0.352798f,   0.38698f,     -0.228261f,   -0.084309f,
+  -0.220751f,   -0.170879f,   -0.352617f,   -1.24277f,    0.266004f,
+  -0.0125749f,  -0.0380073f,  0.101838f,    -0.0483024f,  -0.0629178f,
+  -0.0695577f,  -0.103439f,   0.242131f,    -0.0796858f,  0.349718f,
+  -0.332045f,   0.0138352f,   -0.380235f,   -0.28717f,    -0.176276f,
+  0.865903f,    0.36593f,     0.243925f,    -0.422289f,   -0.117327f,
+  0.21876f,     0.245393f,    -0.426134f,   -0.186077f,   0.0352515f,
+  -0.123742f,   0.249376f,    1.3281f,      0.0707771f,   0.071415f,
+  -0.286827f,   -0.131691f,   -0.270881f,   -0.434378f,   0.376064f,
+  0.35966f,     0.513374f,    0.439378f,    -0.222716f,   -0.5874f,
+  0.487997f,    -0.293271f,   -0.184245f,   -0.037256f,   0.17723f,
+  -0.438651f,   0.428184f,    0.112983f,    -0.449287f,   -0.0451963f,
+  0.0854929f,   0.0735442f,   -0.0148642f,  -0.0586782f,  -0.176455f,
+  -0.438979f,   -0.127109f,   0.211478f,    0.388035f,    -0.0372021f,
+  0.220575f,    0.382144f,    0.302121f,    0.0857121f,   0.193445f,
+  -0.488858f,   -0.195288f,   -0.316184f,   -0.314026f,   -0.111956f,
+  0.0744768f,   0.292709f,    0.30187f,     -0.285506f,   -0.105006f,
+  0.0851402f,   -0.082318f,   0.277518f,    0.725294f,    -0.756304f,
+  0.0155309f,   -0.378542f,   0.293377f,    -0.347252f,   -0.338458f,
+  0.221449f,    -0.176443f,   -0.131972f,   0.0129163f,   -0.290649f,
+  0.198596f,    -0.0721333f,  0.620591f,    0.568736f,    0.174001f,
+  -0.205186f,   -0.265606f,   -0.249155f,   0.299163f,    1.11842f,
+  0.17423f,     0.196417f,    -0.014484f,   0.0735422f,   0.26329f,
+  0.12284f,     -0.750305f,   -0.351337f,   0.121994f,    -0.00542878f,
+  -0.295707f,   -0.094124f,   0.300993f,    0.412408f,    -0.170761f,
+  -0.0676329f,  -0.106638f,   -0.419785f,   -0.43878f,    0.22421f,
+  0.0339903f,   0.619851f,    0.0615381f,   0.514631f,    1.35424f,
+  -0.0679228f,  -0.203457f,   0.131948f,    -0.0041251f,  -0.209054f
+};
+
+static const float av1_simple_motion_search_prune_part_logits_bias_16[] = {
+  0.304025f,  0.131887f, 0.259279f,  -0.561564f, -0.161729f,
+  -0.208036f, 0.102206f, -0.162937f, -1.42311f,  -0.708305f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_16 = {
+  NUM_FEATURES_16,
+  NUM_LOGITS_16,
+  NUM_HIDDEN_LAYERS_16,
+  {
+      NUM_LAYER_0_UNITS_16,
+  },
+  {
+      av1_simple_motion_search_prune_part_layer_0_kernel_16,
+      av1_simple_motion_search_prune_part_logits_kernel_16,
+  },
+  {
+      av1_simple_motion_search_prune_part_layer_0_bias_16,
+      av1_simple_motion_search_prune_part_logits_bias_16,
+  },
+};
+
+#undef NUM_HIDDEN_LAYERS_16
+#undef NUM_FEATURES_16
+#undef NUM_LAYER_0_UNITS_16
+#undef NUM_LOGITS_16
+
+#define NUM_HIDDEN_LAYERS_8 1
+#define NUM_FEATURES_8 25
+#define NUM_LAYER_0_UNITS_8 32
+#define NUM_LOGITS_8 4
+
+static const float av1_simple_motion_search_prune_part_logits_kernel_8[] = {
+  -0.266303f,  -0.387676f,  0.204501f,   -0.120842f,  -0.0752326f, 0.0337739f,
+  0.0243477f,  -0.356748f,  0.0143051f,  -0.16403f,   -0.139013f,  0.175003f,
+  -0.206754f,  0.349059f,   0.181763f,   0.212768f,   -0.313783f,  0.182829f,
+  0.00205376f, -0.939525f,  -0.0992424f, 0.306254f,   0.083329f,   -0.133137f,
+  -0.179022f,  -0.0237902f, 0.0601026f,  -0.216698f,  -0.551149f,  0.081711f,
+  -0.442191f,  0.0680832f,  -0.0353678f, 0.237704f,   0.23155f,    -0.36097f,
+  0.123389f,   -0.288927f,  0.178133f,   -0.152222f,  -0.235648f,  -0.0495293f,
+  -0.316522f,  0.034207f,   0.0463139f,  -0.817825f,  0.417443f,   -0.110984f,
+  -0.402371f,  0.0341694f,  -0.37383f,   0.414532f,   0.093993f,   0.0039505f,
+  0.0803175f,  -0.511859f,  -0.0154802f, 0.0979595f,  0.0909049f,  -0.120938f,
+  -0.577382f,  -0.155041f,  -0.404295f,  0.122223f,   -0.084703f,  0.00415336f,
+  0.149135f,   0.113219f,   0.124236f,   -0.240905f,  0.163909f,   -0.154202f,
+  -0.208917f,  0.00200158f, -0.71796f,   0.105984f,   -0.131996f,  -0.539603f,
+  0.223768f,   -0.0710733f, -0.346679f,  -0.0745909f, 0.171032f,   0.215701f,
+  0.218519f,   0.105981f,   -0.096209f,  -0.166453f,  -0.468894f,  -0.401578f,
+  -0.239222f,  0.111382f,   0.38747f,    -0.164734f,  -0.175955f,  0.336621f,
+  -0.0305501f, -0.0576765f, 0.0672671f,  -0.183692f,  0.412082f,   -0.262951f,
+  -0.153429f,  -0.128589f,  -0.530472f,  0.0936412f,  -1.08296f,   -0.45147f,
+  0.0714904f,  -3.96842f,   0.438125f,   -0.313945f,  0.231104f,   -0.00183851f,
+  -0.0192768f, -0.637531f,  -0.109296f,  0.0531702f,  0.00262162f, -0.615951f,
+  -0.546241f,  -0.635305f,  -0.0762367f, 0.0122019f,  0.423693f,   -0.129142f,
+  -0.112242f,  0.295184f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_bias_8[] = {
+  -2.16023f,  -3.12831f, -0.213206f,  -2.97875f, -1.83791f, -2.84713f,
+  -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f,
+  -0.853224f, -3.29503f, -0.537517f,  0.923106f, -3.18665f, -1.29905f,
+  1.64506f,   -1.99848f, -2.24315f,   0.408613f, 0.503671f, -3.83393f,
+  -2.88388f,  -3.52337f, 1.46818f,    -1.67169f, -3.83253f, 1.52644f,
+  -0.490783f, -0.415782f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_kernel_8[] = {
+  -0.702198f,  -0.102148f,   0.0564545f,   -0.0555548f,  0.16184f,
+  0.0950792f,  0.136974f,    -0.00824146f, 0.05746f,     0.0447542f,
+  0.145978f,   0.0855769f,   -0.041449f,   0.301347f,    -0.0206691f,
+  -0.0662514f, -0.0525079f,  -0.0998387f,  -0.0891438f,  0.110545f,
+  -0.863098f,  -1.83798f,    0.238818f,    0.127797f,    0.116872f,
+  -0.270655f,  -0.21057f,    0.197013f,    -0.123332f,   0.137104f,
+  -0.174766f,  -0.00803025f, 0.0234369f,   -0.0894175f,  -0.0380927f,
+  0.00827928f, -0.134148f,   0.110575f,    -0.250173f,   0.116273f,
+  0.0197749f,  0.270391f,    0.108437f,    0.173197f,    -0.0650348f,
+  0.0884626f,  0.262792f,    0.0649228f,   0.5573f,      -2.81315f,
+  -0.479801f,  -1.15825f,    0.0807932f,   -0.19144f,    0.404016f,
+  -0.211521f,  0.233269f,    -0.391414f,   0.160381f,    -0.277233f,
+  0.426354f,   0.156839f,    0.494315f,    -0.214259f,   -0.0132062f,
+  0.148628f,   -0.0899568f,  0.161845f,    0.467689f,    0.229474f,
+  0.590634f,   -0.705793f,   -0.0486113f,  -0.439088f,   0.994566f,
+  0.679065f,   0.777869f,    -0.225291f,   -0.0303006f,  -0.638782f,
+  -0.0824632f, -0.128561f,   -0.327603f,   0.105624f,    0.567581f,
+  -0.396135f,  -0.471028f,   0.181286f,    0.274604f,    0.180169f,
+  0.0612144f,  -0.865004f,   0.0306804f,   0.142985f,    -0.0914358f,
+  -0.243284f,  0.358359f,    -0.443847f,   -0.371978f,   0.606933f,
+  -0.900408f,  -0.52076f,    0.472118f,    0.0610973f,   0.152526f,
+  -0.550379f,  0.309331f,    -0.141573f,   0.203046f,    -0.231485f,
+  0.505156f,   0.393224f,    0.435487f,    -0.218681f,   0.123707f,
+  -0.270383f,  -0.033565f,   0.210373f,    -2.33967f,    0.367434f,
+  0.0308118f,  -0.205771f,   0.546141f,    0.19837f,     0.035648f,
+  -0.467007f,  -1.50995f,    -0.0314176f,  0.11762f,     -0.15307f,
+  0.618257f,   -0.139502f,   0.303386f,    -0.00758681f, 0.228107f,
+  -0.594499f,  -0.201984f,   -0.239666f,   0.114878f,    -0.922174f,
+  -0.530137f,  -0.379366f,   -0.319582f,   0.0889624f,   -0.00544663f,
+  0.316264f,   -0.204262f,   -0.0959358f,  0.23552f,     0.141369f,
+  -0.207129f,  -1.04067f,    -0.0780501f,  0.226768f,    -0.246752f,
+  0.0823105f,  0.114783f,    0.49315f,     0.0197732f,   0.705433f,
+  0.158076f,   -0.250584f,   -0.157326f,   -0.0439547f,  -0.139047f,
+  0.090531f,   -0.38833f,    0.743143f,    -1.47418f,    -0.155009f,
+  0.511466f,   -0.726716f,   -0.181075f,   0.450133f,    -0.390204f,
+  0.292725f,   0.00811462f,  -0.347738f,   0.613381f,    -0.237124f,
+  0.750748f,   -0.383123f,   0.410309f,    -0.204166f,   0.667199f,
+  -0.313197f,  0.436059f,    -0.607571f,   0.193681f,    0.409399f,
+  0.631747f,   -0.0454149f,  0.198232f,    0.345591f,    -0.0137374f,
+  -0.307014f,  -0.535515f,   0.764678f,    -0.225686f,   -0.451621f,
+  -2.75564f,   -1.52877f,    0.0511933f,   0.905979f,    0.145029f,
+  0.759615f,   0.130166f,    0.83827f,     0.0655081f,   1.07555f,
+  -0.529777f,  0.682967f,    -0.412052f,   0.611947f,    -0.83676f,
+  0.940695f,   -0.465681f,   0.51505f,     -0.883659f,   -0.105524f,
+  -0.0344173f, -0.0683618f,  -0.00698688f, -0.139349f,   0.135741f,
+  -0.294455f,  -0.377834f,   -0.602084f,   -1.00128f,    0.483291f,
+  1.25327f,    0.178987f,    0.75068f,     -0.520731f,   -0.325517f,
+  0.272032f,   0.144144f,    -0.279453f,   0.564907f,    0.144036f,
+  0.297448f,   -0.504243f,   -0.250508f,   -1.26395f,    0.4816f,
+  0.392771f,   -0.389961f,   -0.261585f,   -0.127124f,   -0.202945f,
+  -0.709716f,  -0.174719f,   0.113613f,    0.477753f,    -0.226659f,
+  0.0697828f,  -0.177994f,   0.300726f,    -0.185504f,   0.339424f,
+  -0.316746f,  0.369693f,    -0.339723f,   -0.143886f,   -0.0326589f,
+  -0.268761f,  -0.241094f,   0.284876f,    -0.0270867f,  -0.207397f,
+  -1.42738f,   0.495612f,    -0.0277732f,  0.199675f,    1.48638f,
+  -0.659257f,  -1.28199f,    0.498702f,    0.140695f,    0.571152f,
+  0.416368f,   0.14153f,     0.126876f,    0.521114f,    -0.00150571f,
+  0.375581f,   0.00537624f,  0.1286f,      -0.332227f,   0.417663f,
+  -0.539023f,  0.217124f,    -0.787111f,   -0.0335266f,  1.56751f,
+  0.0640563f,  -0.158791f,   0.118195f,    0.000970493f, -0.0403852f,
+  -0.0572557f, -0.0201181f,  -0.10255f,    0.63237f,     0.156662f,
+  0.418696f,   -0.274802f,   -0.663923f,   -0.375232f,   -0.40846f,
+  0.462092f,   1.2176f,      -0.301532f,   -0.779704f,   -0.112876f,
+  0.0806591f,  -0.0141923f,  0.00960801f,  -0.663557f,   0.0979948f,
+  -0.0575999f, -0.012847f,   0.0403853f,   -0.133666f,   -0.00330217f,
+  -0.931518f,  -0.774599f,   -0.21391f,    0.377601f,    -0.183365f,
+  0.299094f,   0.0238552f,   0.206716f,    -0.18959f,    0.346013f,
+  -0.150991f,  -0.192817f,   -0.293962f,   -0.0537604f,  -0.0648171f,
+  -0.275941f,  -0.144854f,   -0.224092f,   2.43113f,     0.0422494f,
+  -0.047236f,  -0.0262028f,  0.0282119f,   -0.175553f,   0.0888502f,
+  0.580682f,   0.951055f,    -0.284441f,   -0.120133f,   -0.268058f,
+  -0.312083f,  -0.411556f,   0.21431f,     -0.28033f,    0.324851f,
+  -1.02787f,   -0.936816f,   -0.577628f,   0.544743f,    0.295807f,
+  0.406157f,   0.447927f,    0.25369f,     -0.811421f,   -0.0424979f,
+  -0.189867f,  0.00778673f,  -0.113587f,   -0.116175f,   -0.0542222f,
+  -1.80089f,   -1.44175f,    -0.35332f,    0.191314f,    -0.236691f,
+  -0.0261926f, -0.502363f,   0.252278f,    -0.485478f,   0.296495f,
+  0.455612f,   -0.0489631f,  0.227255f,    0.170975f,    0.473487f,
+  0.257812f,   0.178048f,    0.2506f,      2.04637f,     -0.173857f,
+  0.0583379f,  0.00765589f,  -0.025772f,   -0.162666f,   -0.016214f,
+  -0.607486f,  -0.0808025f,  0.0551611f,   -0.0772291f,  0.126421f,
+  0.10869f,    -0.0877463f,  -0.111527f,   -0.0775766f,  0.503886f,
+  -0.002757f,  -0.0421354f,  -0.247857f,   0.140827f,    0.383576f,
+  0.228232f,   -0.157877f,   -0.0927911f,  0.344687f,    0.191181f,
+  0.236533f,   0.00102869f,  -0.0184502f,  -1.4509f,     -1.15945f,
+  -0.521978f,  -0.643225f,   0.133139f,    0.0660321f,   0.0851957f,
+  0.0303648f,  0.0296239f,   0.0455713f,   0.175647f,    0.080532f,
+  0.0445691f,  -0.257356f,   -0.125602f,   -0.138829f,   -0.167057f,
+  -0.0992552f, -0.13944f,    0.507531f,    0.444997f,    0.221452f,
+  -0.308384f,  -0.327554f,   0.13235f,     2.1487f,      -1.15453f,
+  -0.280239f,  -0.363582f,   -0.00358745f, 0.012866f,    0.251088f,
+  0.0676416f,  0.178492f,    -0.136631f,   0.197938f,    -0.078198f,
+  0.812439f,   1.1173f,      0.712113f,    1.10124f,     -0.836503f,
+  -1.22433f,   -1.07894f,    -1.29215f,    0.56057f,     2.23928f,
+  -0.419029f,  0.282178f,    -0.0719266f,  -0.172192f,   0.28034f,
+  -2.99124f,   -2.01481f,    0.0688982f,   0.697466f,    0.00635555f,
+  0.566069f,   0.047534f,    0.507755f,    -0.00690707f, 0.712594f,
+  -0.191467f,  0.355733f,    -0.480016f,   0.664669f,    -0.390619f,
+  0.351199f,   -0.482342f,   0.325005f,    1.9089f,      0.155987f,
+  0.17032f,    0.132729f,    0.0402649f,   0.146991f,    0.0314905f,
+  -0.775316f,  -0.208892f,   -0.105993f,   0.0181653f,   -0.12735f,
+  0.0897852f,  0.0470231f,   0.25807f,     0.127406f,    -0.0893252f,
+  -0.279776f,  0.190844f,    0.110384f,    -0.148833f,   0.025293f,
+  0.239838f,   0.00932245f,  0.35103f,     -0.128268f,   -0.0536754f,
+  0.506899f,   -0.16793f,    0.0955582f,   -2.01108f,    0.721433f,
+  -2.31413f,   -2.08646f,    0.033315f,    0.689828f,    -0.271213f,
+  0.790425f,   -0.114234f,   0.755325f,    -0.211533f,   0.774544f,
+  -0.263268f,  0.795762f,    -0.551455f,   0.953602f,    -0.168454f,
+  0.529055f,   -0.768991f,   0.882371f,    0.29763f,     -0.155017f,
+  0.00464101f, 0.121093f,    0.948271f,    0.113138f,    -0.110332f,
+  -2.0492f,    -1.31322f,    -0.129212f,   0.464778f,    -0.181465f,
+  0.618403f,   0.0627984f,   0.465228f,    0.165729f,    0.278277f,
+  -0.563276f,  -0.358358f,   -0.590638f,   0.0104993f,   0.731206f,
+  0.752569f,   0.631615f,    0.811822f,    0.129804f,    -0.0558327f,
+  0.570081f,   -0.417922f,   -0.168275f,   0.0703671f,   0.269127f,
+  0.240457f,   -0.197159f,   -0.00179261f, 0.220065f,    0.463511f,
+  0.0714626f,  -0.716477f,   -0.441865f,   -0.717028f,   -0.149176f,
+  0.452182f,   0.662699f,    -0.906534f,   -0.817133f,   0.237747f,
+  0.26024f,    -7.7441e-05f, 0.0934616f,   0.824641f,    -0.0404494f,
+  -0.088297f,  -0.157899f,   0.037408f,    0.132435f,    -0.316155f,
+  -0.276785f,  0.0117868f,   0.185008f,    0.32369f,     -0.465855f,
+  -0.302127f,  0.303289f,    0.338597f,    -0.665408f,   -0.507594f,
+  0.526979f,   0.532091f,    0.234395f,    0.754063f,    0.116769f,
+  0.0800309f,  -0.939344f,   -1.51269f,    1.4583f,      0.178444f,
+  0.0106756f,  -0.213468f,   -0.00369439f, 0.071015f,    -0.192798f,
+  -0.0933147f, -0.129901f,   -0.368279f,   -0.246564f,   0.126966f,
+  0.478565f,   -0.476246f,   -0.762863f,   0.168883f,    0.536136f,
+  -0.272969f,  0.2573f,      -0.161577f,   0.311428f,    -0.777994f,
+  -1.29752f,   0.216046f,    0.329016f,    1.57265f,     0.168075f,
+  -0.192518f,  0.0829308f,   -0.073533f,   -0.0202034f,  0.114716f,
+  -0.34888f,   -0.519215f,   0.190809f,    0.0138507f,   0.133635f,
+  0.14194f,    0.410618f,    -0.165106f,   0.214438f,    0.0438265f,
+  -0.8481f,    -1.19182f,    -1.07878f,    -0.882217f,   0.45616f,
+  0.977385f,   0.74929f,     0.918466f,    0.904704f,    0.041938f,
+  0.0362776f,  0.0757255f,   1.14007f,     0.0516825f,   -0.160068f,
+  0.219535f,   0.638634f,    -0.0284544f,  -0.222849f,   -0.0344915f,
+  -0.0350256f, -0.0504452f,  -0.0458416f,  0.146099f,    0.0783083f,
+  0.206579f,   0.241264f,    0.28401f,     0.0425312f,   -0.802049f,
+  -0.746271f,  -0.578969f,   -0.078218f,   0.436176f,    -0.281465f,
+  -2.5539f,    0.237868f,    -0.121796f,   0.0715619f,   0.106992f,
+  -0.621862f,  -0.167142f,   0.153716f,    0.0570912f,   -0.06525f,
+  -0.923773f,  0.130759f,    0.0517066f,   0.0729862f,   -0.873064f,
+  0.0403328f,  -0.186499f,   -0.0831918f,  -0.223723f,   0.144697f,
+  0.212845f,   0.416876f,    0.361598f,    0.138229f,    0.0728777f,
+  -1.95419f,   -0.00382816f, -0.0440387f,  0.433627f,    0.44781f,
+  -1.05229f,   -1.54506f,    0.564827f,    -0.263456f,   0.296105f,
+  -0.158055f,  0.388274f,    -0.366639f,   0.212006f,    -0.245619f,
+  0.593064f,   0.088727f,    0.410632f,    -0.263462f,   0.507075f,
+  -0.0974155f, 0.275268f,    -0.1293f,     0.136679f,    1.98276f,
+  0.411766f,   0.391987f,    0.34283f,     -0.114077f,   0.258462f,
+  -0.302443f,  0.301138f,    -0.00726621f, 0.276441f,    -0.291582f,
+  0.66498f,    -0.321451f,   -0.332805f,   0.0943272f,   0.572253f,
+  -0.45818f,   -0.0219593f,  -0.151679f,   0.402033f,    -1.15502f,
+  -0.882955f,  0.772904f,    0.88126f,     -0.149555f,   0.709525f,
+  0.350116f,   -0.21531f,    0.797893f,    0.0230234f,   0.0203034f,
+  0.2744f,     1.08273f,     0.039349f,    0.503909f,    -0.45892f,
+  -0.579516f,  -0.344058f,   0.390628f,    -0.386941f,   -0.430317f,
+  -0.0807066f, 0.435906f,    0.522996f,    0.724476f,    -0.74371f,
+  -0.05376f,   -0.340898f,   -0.962646f,   -0.0278005f,  0.0981149f,
+  -0.0811161f, 0.00237994f,  0.850042f,    0.0665473f,   0.134413f
+};
+
+static const float av1_simple_motion_search_prune_part_logits_bias_8[] = {
+  1.63404f, -0.715866f, -1.0132f, -2.08745f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_8 = {
   NUM_FEATURES_8,
   NUM_LOGITS_8,
   NUM_HIDDEN_LAYERS_8,
@@ -2883,22 +3967,839 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_8 = {
       NUM_LAYER_0_UNITS_8,
   },
   {
-      full_pixel_motion_search_based_split_layer_0_kernel_8,
-      full_pixel_motion_search_based_split_logits_kernel_8,
+      av1_simple_motion_search_prune_part_layer_0_kernel_8,
+      av1_simple_motion_search_prune_part_logits_kernel_8,
   },
   {
-      full_pixel_motion_search_based_split_layer_0_bias_8,
-      full_pixel_motion_search_based_split_logits_bias_8,
+      av1_simple_motion_search_prune_part_layer_0_bias_8,
+      av1_simple_motion_search_prune_part_logits_bias_8,
   },
 };
 
-static const float full_pixel_motion_search_based_split_thresh_8 = 2.0f;
+#undef NUM_HIDDEN_LAYERS_8
+#undef NUM_FEATURES_8
+#undef NUM_LAYER_0_UNITS_8
+#undef NUM_LOGITS_8
+
+#define FEATURE_SIZE 19
+static const float av1_2pass_split_partition_weights_128[FEATURE_SIZE + 1] = {
+  2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f,
+  0.125296f, -1.134961f, 0.862757f,  -0.418799f, -0.637666f,
+  0.016232f, 0.345013f,  0.018823f,  -0.393394f, -1.130700f,
+  0.695357f, 0.112569f,  -0.341975f, -0.513882f, 5.7488966f,
+};
+
+static const float av1_2pass_split_partition_weights_64[FEATURE_SIZE + 1] = {
+  2.990993f,  0.423273f,  -0.926544f, 0.454646f,  -0.292698f,
+  -1.311632f, -0.284432f, 0.717141f,  -0.419257f, -0.574760f,
+  -0.674444f, 0.669047f,  -0.374255f, 0.380624f,  -0.804036f,
+  0.264021f,  0.004163f,  1.896802f,  0.924287f,  0.13490619f,
+};
+
+static const float av1_2pass_split_partition_weights_32[FEATURE_SIZE + 1] = {
+  2.795181f,  -0.136943f, -0.924842f, 0.405330f,  -0.463505f,
+  -0.584076f, -0.831472f, 0.382985f,  -0.597544f, -0.138915f,
+  -1.354350f, 0.466035f,  -0.553961f, 0.213202f,  -1.166429f,
+  0.010776f,  -0.096236f, 2.335084f,  1.699857f,  -0.58178353f,
+};
+
+static const float av1_2pass_split_partition_weights_16[FEATURE_SIZE + 1] = {
+  1.987888f,  -0.431100f, -1.687703f, 0.262602f,  -0.425298f,
+  -0.463870f, -1.493457f, 0.470917f,  -0.528457f, -0.087700f,
+  -1.815092f, 0.152883f,  -0.337908f, 0.093679f,  -1.548267f,
+  -0.042387f, -0.000861f, 2.556746f,  1.619192f,  0.03643292f,
+};
+
+static const float av1_2pass_split_partition_weights_8[FEATURE_SIZE + 1] = {
+  2.188344f,  -0.817528f, -2.119219f, 0.000000f,  -0.348167f,
+  -0.658074f, -1.960362f, 0.000000f,  -0.403080f, 0.282699f,
+  -2.061088f, 0.000000f,  -0.431919f, -0.127960f, -1.099550f,
+  0.000000f,  0.121622f,  2.017455f,  2.058228f,  -0.15475988f,
+};
+
+static const float av1_2pass_none_partition_weights_128[FEATURE_SIZE + 1] = {
+  -1.006689f, 0.777908f,  4.461072f,  -0.395782f, -0.014610f,
+  -0.853863f, 0.729997f,  -0.420477f, 0.282429f,  -1.194595f,
+  3.181220f,  -0.511416f, 0.117084f,  -1.149348f, 1.507990f,
+  -0.477212f, 0.202963f,  -1.469581f, 0.624461f,  -0.89081228f,
+};
+
+static const float av1_2pass_none_partition_weights_64[FEATURE_SIZE + 1] = {
+  -1.241117f, 0.844878f,  5.638803f,  -0.489780f, -0.108796f,
+  -4.576821f, 1.540624f,  -0.477519f, 0.227791f,  -1.443968f,
+  1.586911f,  -0.505125f, 0.140764f,  -0.464194f, 1.466658f,
+  -0.641166f, 0.195412f,  1.427905f,  2.080007f,  -1.98272777f,
+};
+
+static const float av1_2pass_none_partition_weights_32[FEATURE_SIZE + 1] = {
+  -2.130825f, 0.476023f,  5.907343f,  -0.516002f, -0.097471f,
+  -2.662754f, 0.614858f,  -0.576728f, 0.085261f,  -0.031901f,
+  0.727842f,  -0.600034f, 0.079326f,  0.324328f,  0.504502f,
+  -0.547105f, -0.037670f, 0.304995f,  0.369018f,  -2.66299987f,
+};
+
+static const float av1_2pass_none_partition_weights_16[FEATURE_SIZE + 1] = {
+  -1.626410f, 0.872047f,  5.414965f,  -0.554781f, -0.084514f,
+  -3.020550f, 0.467632f,  -0.382280f, 0.199568f,  0.426220f,
+  0.829426f,  -0.467100f, 0.153098f,  0.662994f,  0.327545f,
+  -0.560106f, -0.141610f, 0.403372f,  0.523991f,  -3.02891231f,
+};
+
+static const float av1_2pass_none_partition_weights_8[FEATURE_SIZE + 1] = {
+  -1.463349f, 0.375376f,  4.751430f, 0.000000f, -0.184451f,
+  -1.655447f, 0.443214f,  0.000000f, 0.127961f, 0.152435f,
+  0.083288f,  0.000000f,  0.143105f, 0.438012f, 0.073238f,
+  0.000000f,  -0.278137f, 0.186134f, 0.073737f, -1.6494962f,
+};
+#undef FEATURE_SIZE
+
+// nn model for predicting max square partition level of a superblock
+#define NUM_HIDDEN_LAYERS 1
+#define NUM_FEATURES 13
+#define NUM_LAYER_0_UNITS 48
+#define NUM_LOGITS 4
+
+static const float av1_max_part_pred_logits_kernel[] = {
+  -0.304561f,   0.0885596f,   -0.988539f,   1.08147f,    0.215213f,
+  0.202965f,    -0.828457f,   -0.233945f,   -0.0866977f, -0.115521f,
+  0.02079f,     0.196491f,    -0.0285075f,  0.05067f,    -0.00872862f,
+  0.00281844f,  -0.238954f,   0.0253801f,   0.0257775f,  0.339269f,
+  0.176174f,    -0.152545f,   -0.0588704f,  -1.62275f,   -0.189329f,
+  0.0808033f,   0.233844f,    -4.53798f,    0.674968f,   -0.0361688f,
+  -0.0754075f,  1.16129f,     -0.0188879f,  0.113255f,   -3.04378f,
+  0.814728f,    -0.568517f,   -0.00179383f, -3.61223f,   -1.67535f,
+  -2.20417f,    -0.197196f,   0.0507745f,   -0.0909394f, -0.0507879f,
+  -1.27999f,    -0.055623f,   0.0318497f,   0.192867f,   0.138726f,
+  0.0443392f,   -0.595075f,   -0.166774f,   0.0882958f,  -0.348161f,
+  0.0214428f,   -0.0599275f,  -0.0995385f,  -0.82358f,   0.141205f,
+  -0.053232f,   0.00508296f,  -1.90872f,    1.15004f,    -0.194219f,
+  0.0229019f,   -0.00354318f, 0.22016f,     0.154101f,   -0.159231f,
+  -0.0446647f,  -0.197503f,   0.0408453f,   0.197659f,   0.797858f,
+  -0.189722f,   0.343653f,    0.124666f,    -1.03083f,   0.603059f,
+  0.101565f,    0.0932993f,   0.462484f,    0.295984f,   1.11198f,
+  0.143709f,    -0.846232f,   -0.464392f,   -1.06058f,   -0.124889f,
+  0.0727475f,   1.18446f,     -0.100302f,   0.0641918f,  -0.101622f,
+  0.10219f,     0.130189f,    0.0915623f,   -0.166904f,  -1.10606f,
+  -0.16726f,    -0.146152f,   0.145443f,    -0.177091f,  -0.0215214f,
+  0.0158506f,   -0.553294f,   0.0784749f,   -0.0416628f, -0.027785f,
+  0.280027f,    0.484898f,    -0.164225f,   0.0238317f,  -0.0345254f,
+  0.0410244f,   0.131529f,    0.0239622f,   -0.0749436f, -0.0224914f,
+  0.128926f,    0.224539f,    0.413297f,    0.0638572f,  0.103308f,
+  0.0913242f,   -0.119274f,   0.0163103f,   0.113828f,   0.119809f,
+  0.297057f,    -0.124889f,   -0.533108f,   -0.181408f,  -0.129896f,
+  0.0221064f,   -0.0773281f,  -0.0386467f,  0.0342961f,  0.126575f,
+  -0.24114f,    0.0735576f,   0.0524791f,   0.246896f,   -0.130674f,
+  -0.03979f,    0.173639f,    1.95193f,     -0.113029f,  -0.0305852f,
+  -0.00671737f, 0.157159f,    -0.00102858f, -0.543688f,  0.566772f,
+  0.124124f,    -0.0294064f,  -0.0699021f,  -0.0704103f, -0.766097f,
+  -0.0625802f,  -0.0906173f,  -0.0520414f,  -0.0272724f, 0.283064f,
+  0.236213f,    -0.127319f,   0.019392f,    0.170042f,   -0.0214542f,
+  0.0740938f,   0.356578f,    -0.236257f,   0.269021f,   0.114759f,
+  -0.641166f,   0.136308f,    -0.0386959f,  -0.112024f,  -0.361209f,
+  0.686095f,    0.183906f,    0.288656f,    0.182007f,   0.337458f,
+  0.058974f,    -0.305512f,   -0.841708f,   -0.243779f,  -0.0614058f,
+  0.208747f,    0.448697f
+};
+
+static const float av1_max_part_pred_layer_0_bias[] = {
+  -0.776544f, -2.0022f,    -0.330294f, 2.47665f,  1.90206f,   -1.61571f,
+  0.536246f,  1.00455f,    5.24561f,   1.55111f,  -0.816399f, -4.88703f,
+  -1.06417f,  -1.15359f,   -0.145289f, 1.91831f,  0.630915f,  -1.94256f,
+  -3.35239f,  -1.05007f,   -1.05186f,  1.36824f,  -5.2878f,   1.10482f,
+  -5.00077f,  -0.0445198f, 3.41427f,   2.3439f,   -0.413306f, -1.88152f,
+  -2.28638f,  8.24783f,    -1.91961f,  -1.49324f, 1.96599f,   -6.32309f,
+  -0.332426f, -0.425506f,  4.06511f,   5.84386f,  4.15747f,   1.22402f,
+  2.8512f,    2.53027f,    0.0170272f, -1.43966f, -0.997785f, 5.43064f
+};
+
+static const float av1_max_part_pred_logits_bias[] = { -4.25432f, 0.144758f,
+                                                       1.96217f, 0.728905f };
+
+static const float av1_max_part_pred_layer_0_kernel[] = {
+  0.992471f,    0.533006f,    0.143743f,     -2.51788f,    -0.468337f,
+  -0.201376f,   -0.151834f,   0.479883f,     1.16061f,     -0.278878f,
+  -0.814954f,   -0.152405f,   -0.0521608f,   0.797104f,    -2.08912f,
+  0.385839f,    -2.22889f,    -0.106858f,    -0.239766f,   -0.951128f,
+  -0.698753f,   0.0831051f,   1.1702f,       0.342834f,    -0.0352795f,
+  -0.0847639f,  -0.802086f,   0.258982f,     1.14174f,     0.645885f,
+  -1.19226f,    -0.592888f,   -0.343659f,    1.1912f,      1.45411f,
+  -1.22927f,    0.152858f,    0.00373585f,   -1.60637f,    0.592611f,
+  0.0857475f,   -0.346147f,   -0.150784f,    -0.0817408f,  -0.189918f,
+  -0.804952f,   -1.33036f,    -1.03307f,     0.0248769f,   0.16607f,
+  -2.896f,      -2.1293f,     0.12293f,      -0.173179f,   -0.212128f,
+  -6.76221f,    0.033188f,    0.0231787f,    0.905957f,    0.0551327f,
+  -0.356276f,   0.0181795f,   0.0977523f,    -0.0352873f,  -0.0396386f,
+  2.3241f,      0.0632874f,   -0.11804f,     -6.32521f,    0.0224659f,
+  -0.00188896f, 0.267992f,    0.272337f,     0.00936963f,  0.659969f,
+  -2.25707f,    -0.0278229f,  -0.0185089f,   -1.14466f,    0.104827f,
+  0.0435885f,   0.558586f,    -0.00697004f,  0.0312611f,   0.540574f,
+  -0.568625f,   0.218608f,    0.378911f,     -0.0289192f,  -0.0734742f,
+  -1.08782f,    -2.42069f,    -0.0127239f,   0.0493651f,   -1.15837f,
+  0.261831f,    0.401824f,    -1.04545f,     0.284173f,    0.784972f,
+  -0.511243f,   -0.982599f,   -0.106134f,    -0.325964f,   -1.44107f,
+  -1.42434f,    -1.02402f,    -1.52034f,     0.0737116f,   0.0462242f,
+  0.628722f,    -1.0405f,     -0.113718f,    2.20573f,     -4.33951f,
+  -0.0192695f,  -0.0229314f,  -1.89156f,     0.645942f,    0.375708f,
+  -1.97447f,    -0.267014f,   0.0989443f,    -0.450534f,   -1.01737f,
+  -0.642416f,   -0.0897288f,  -2.08724f,     -0.190965f,   -0.279135f,
+  -0.830178f,   0.808754f,    -0.139091f,    1.11004f,     -0.454439f,
+  -0.479238f,   -1.44001f,    0.0888059f,    0.885689f,    -0.642505f,
+  -0.00773651f, -0.0265721f,  -0.906346f,    1.68504f,     0.084257f,
+  -0.951101f,   -8.06495f,    0.19231f,      0.16389f,     -0.193678f,
+  0.729837f,    -1.98392f,    -5.98513f,     3.32638f,     -0.0658378f,
+  -0.0910426f,  -0.666567f,   -0.315339f,    0.123124f,    -2.66375f,
+  -0.714852f,   -0.136176f,   -0.460166f,    -0.567551f,   -1.06193f,
+  -1.21389f,    -0.83865f,    0.00280695f,   -0.199519f,   -0.534704f,
+  0.419311f,    -0.149008f,   -3.68707f,     0.00285113f,  -0.0718198f,
+  -1.41026f,    -1.34155f,    -0.538687f,    -0.623666f,   -2.56462f,
+  -0.0183333f,  -0.323532f,   -1.27141f,     -0.0212039f,  0.198633f,
+  0.459554f,    -4.65103f,    -1.01293f,     -1.39512f,    -0.289026f,
+  0.208724f,    -0.665226f,   1.13369f,      -1.96734f,    -1.45442f,
+  -3.46172f,    0.810681f,    -0.603973f,    0.842764f,    -3.90371f,
+  -0.394561f,   -3.61363f,    -2.88085f,     0.031645f,    -0.23125f,
+  -2.63898f,    -1.35314f,    -0.46726f,     1.33145f,     1.20269f,
+  1.38682f,     -0.331637f,   0.069021f,     0.149523f,    -1.24957f,
+  -0.878857f,   -0.200368f,   0.465744f,     1.01365f,     -0.0122221f,
+  -0.550586f,   -1.12581f,    -0.422132f,    -0.0744868f,  -2.4804f,
+  -1.07072f,    -0.479006f,   0.101817f,     -0.118947f,   0.341576f,
+  -1.0538f,     -0.812346f,   -1.13727f,     -0.00939806f, 10.1571f,
+  -0.0441302f,  0.00280407f,  -21.5044f,     0.0181152f,   -0.0143246f,
+  3.23462f,     -1.38624f,    -1.80416f,     4.89763f,     -2.67364f,
+  2.31771e-05f, 0.000393989f, 0.352204f,     -0.193455f,   0.531455f,
+  0.488757f,    -0.442555f,   -0.518528f,    0.431482f,    -2.67727f,
+  -2.00626f,    -0.39729f,    -0.221494f,    -0.0188888f,  -0.0377649f,
+  -1.80169f,    0.0810332f,   -0.0408335f,   -1.28675f,    -0.0353824f,
+  -0.666723f,   -1.07281f,    0.252912f,     -1.24547f,    -1.7831f,
+  -1.14354f,    -0.137662f,   0.00230182f,   0.736862f,    0.175872f,
+  -0.187556f,   0.43963f,     -0.796524f,    0.056219f,    -0.387874f,
+  0.0710224f,   -0.16548f,    -0.100993f,    0.931481f,    -3.20738f,
+  -0.0197576f,  0.266148f,    -0.173909f,    -0.337795f,   -0.0682381f,
+  0.176844f,    0.140286f,    1.12033f,      0.429064f,    -2.24192f,
+  -1.54682f,    2.23646f,     -0.0371138f,   -0.0475339f,  -3.21766f,
+  0.0412858f,   0.387811f,    6.6711f,       0.140649f,    0.0559547f,
+  -0.802839f,   0.599977f,    0.64552f,      -2.08103f,    -0.503401f,
+  -0.0407036f,  -0.0299199f,  0.0849445f,    -0.111657f,   -1.63462f,
+  3.33762f,     0.0441394f,   0.0466889f,    -0.951806f,   0.0723954f,
+  0.00348661f,  -1.36903f,    2.24625f,      -0.0348915f,  -0.0508893f,
+  -0.240891f,   -0.120143f,   -0.17991f,     -2.09137f,    0.0150871f,
+  0.0480333f,   1.72012f,     0.0309551f,    -0.0370507f,  -0.377075f,
+  0.103916f,    -0.0169255f,  -0.0145395f,   -4.02144f,    0.83193f,
+  -0.316502f,   6.3832f,      -1.70038f,     -1.97215f,    -1.94501f,
+  1.45479f,     0.711725f,    -0.348496f,    -0.279056f,   -1.13396f,
+  -1.51744f,    -0.853307f,   1.53131f,      -0.0032358f,  1.41808f,
+  -1.32989f,    -0.245221f,   -0.161614f,    -0.500845f,   -0.449252f,
+  0.0724151f,   -0.116333f,   -0.0946182f,   -2.0945f,     0.0564572f,
+  0.393261f,    -1.06861f,    -0.111458f,    -0.839943f,   -0.0880348f,
+  0.0365742f,   0.415339f,    -1.57494f,     -0.713697f,   1.02349f,
+  -0.221371f,   -0.0446281f,  1.89223f,      -0.0811754f,  -0.402773f,
+  -0.930987f,   0.0243194f,   0.0678332f,    -0.0233014f,  0.165372f,
+  -0.44083f,    -1.2404f,     0.35675f,      -0.040916f,   -0.0512548f,
+  -2.9071f,     0.861174f,    -0.778133f,    2.14436f,     -0.688427f,
+  -0.480371f,   -1.69032f,    0.706687f,     -0.281982f,   -2.30451f,
+  1.61541f,     -0.0213638f,  -0.740509f,    -0.266677f,   0.0268434f,
+  -0.0116908f,  -3.17595f,    0.0114825f,    0.0196997f,   -0.144005f,
+  0.0550181f,   -0.851459f,   -0.000285073f, -0.538441f,   -0.0254868f,
+  -0.0104454f,  -0.0661998f,  -0.196469f,    -0.346372f,   -5.52892f,
+  -0.643683f,   -0.622224f,   -0.31463f,     -0.555956f,   -0.520132f,
+  -0.843166f,   -2.59479f,    -0.750195f,    0.00635995f,  -0.338615f,
+  -0.216676f,   -0.391544f,   -1.62185f,     -0.718471f,   -0.475406f,
+  -0.782041f,   -0.608824f,   -1.09633f,     -1.27308f,    -0.560719f,
+  -0.207539f,   -0.0196445f,  -1.05519f,     -0.575249f,   -1.0642f,
+  1.01615f,     -0.873633f,   -0.417953f,    -0.428051f,   0.350259f,
+  -2.53833f,    -2.72203f,    0.672846f,     -0.503094f,   -1.1374f,
+  0.214291f,    0.013305f,    0.0112064f,    1.10532f,     0.030455f,
+  0.0239614f,   0.628072f,    0.0539135f,    -0.472441f,   -0.688439f,
+  -0.32044f,    -0.0234867f,  -0.0158436f,   -0.949314f,   -0.0453161f,
+  -1.18306f,    0.626845f,    -0.426925f,    -0.688371f,   0.415062f,
+  0.0640985f,   -0.638387f,   -2.01399f,     -0.209744f,   -0.762892f,
+  -0.0753296f,  -0.879315f,   -0.520433f,    -0.111375f,   0.389742f,
+  -0.398862f,   -0.643227f,   -0.246396f,    0.0317051f,   1.06973f,
+  0.413617f,    0.180506f,    -0.0507897f,   -0.00650435f, 0.620892f,
+  0.046312f,    0.475032f,    0.906993f,     -0.0388061f,  -0.256271f,
+  -1.03323f,    0.0125266f,   -0.31116f,     -0.377611f,   -0.0386407f,
+  -0.0232745f,  -0.353644f,   -2.27289f,     0.0571779f,   -0.00865006f,
+  1.65101f,     0.0175711f,   0.0184585f,    0.558458f,    0.2213f,
+  -0.285089f,   0.433445f,    -0.427177f,    -0.0103682f,  -0.0101273f,
+  0.214085f,    -0.0459885f,  0.00761981f,   0.836381f,    0.0175293f,
+  0.02508f,     -1.51778f,    0.0143956f,    -0.162589f,   0.595418f,
+  0.21445f,     -0.0335848f,  -0.0136684f,   -0.16686f,    -0.14612f,
+  0.0816238f,   0.499636f,    0.12458f,      -2.41673f,    -0.261721f,
+  -0.676805f,   -1.88366f,    0.730462f,     0.69196f,     -0.0288489f,
+  -2.38272f,    0.329876f,    0.014517f,     -0.115145f,   -3.48151f,
+  -0.00209072f, -0.0732377f,  0.820443f,     -0.0118701f,  0.112145f,
+  0.272315f,    0.137531f,    -0.0200997f,   -0.0397883f,  -2.19458f,
+  0.183554f,    -0.639716f,   0.481605f,     -0.621639f,   -0.0980299f,
+  -0.710534f,   -0.143105f,   -6.77626f,     -1.65139f,    -2.37718f,
+  -0.533127f,   -1.12574f,    3.34182f,      -0.0758663f,  0.0334238f,
+  -9.48647f,    0.0674974f,   0.0507665f,    0.523007f,    -0.0668f,
+  0.5736f,      -0.589761f,   -1.1692f,      -0.0236497f,  -0.00828928f,
+  -0.265823f,   1.15284f,     0.307927f,     -0.695308f,   0.13725f,
+  -0.20394f,    -0.363965f,   -0.331159f,    -1.50927f,    -1.20051f,
+  -0.0205825f,  -0.0381859f,  -0.0579876f,   -1.6913f,     -1.94626f,
+  3.4214f,      3.3922f,      -2.13798f,     -0.679848f,   -0.890735f,
+  0.235017f,    -0.253202f,   -1.0571f,      1.40354f,     0.00719052f,
+  -1.54365f,    -0.7289f,     -1.05492f,     0.0238169f,   -0.00543592f,
+  -0.0510353f,  -0.175386f,   -0.724207f,    -0.788936f,   0.039976f,
+  1.36966f,     0.869475f,    -0.0302774f,   -0.0537556f
+};
+
+static const NN_CONFIG av1_max_part_pred_nn_config = {
+  NUM_FEATURES,
+  NUM_LOGITS,
+  NUM_HIDDEN_LAYERS,
+  {
+      NUM_LAYER_0_UNITS,
+  },
+  {
+      av1_max_part_pred_layer_0_kernel,
+      av1_max_part_pred_logits_kernel,
+  },
+  {
+      av1_max_part_pred_layer_0_bias,
+      av1_max_part_pred_logits_bias,
+  },
+};
+
+#undef NUM_HIDDEN_LAYERS
+#undef NUM_FEATURES
+#undef NUM_LAYER_0_UNITS
+#undef NUM_LOGITS
+
+// Early termination in second pass
+static const float av1_simple_motion_search_term_none_mean_128[28] = {
+  12.661922f, 12.638062f, 10.896497f, 10.865719f, 10.978963f, 10.940105f,
+  11.012235f, 10.972760f, 11.069924f, 11.018533f, 11.773865f, 11.747426f,
+  11.891315f, 11.858107f, 11.793916f, 11.766356f, 11.874997f, 11.840164f,
+  5.940535f,  0.770746f,  4.292692f,  4.309581f,  0.848423f,  4.292334f,
+  4.298179f,  8.514713f,  14.911736f, 19.825352f,
+};
+
+static const float av1_simple_motion_search_term_none_std_128[28] = {
+  1.796731f, 1.797056f, 1.898383f, 1.900753f, 1.846624f, 1.846953f, 1.906632f,
+  1.908089f, 1.836533f, 1.835967f, 1.840262f, 1.840671f, 1.816836f, 1.817103f,
+  1.879846f, 1.881333f, 1.803102f, 1.802654f, 2.263402f, 0.420354f, 1.117165f,
+  1.083779f, 0.358611f, 1.101183f, 1.084938f, 2.462638f, 1.577009f, 1.574711f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_64[28] = {
+  10.904455f, 10.853546f, 9.247903f,  9.184479f,  9.251985f,  9.186686f,
+  9.253490f,  9.190190f,  9.270079f,  9.204357f,  10.086511f, 10.031060f,
+  10.100875f, 10.045429f, 10.069688f, 10.013173f, 10.082980f, 10.024640f,
+  4.888378f,  0.878113f,  3.598450f,  3.628491f,  0.925833f,  3.560971f,
+  3.573322f,  8.807137f,  13.348477f, 18.269117f,
+};
+
+static const float av1_simple_motion_search_term_none_std_64[28] = {
+  1.789300f, 1.787061f, 1.823519f, 1.820226f, 1.794643f, 1.788620f, 1.797194f,
+  1.795135f, 1.777795f, 1.773634f, 1.794000f, 1.790377f, 1.772197f, 1.769692f,
+  1.819050f, 1.817139f, 1.793577f, 1.789333f, 1.998251f, 0.327156f, 0.885748f,
+  0.853767f, 0.262043f, 0.902435f, 0.860033f, 1.224865f, 1.603411f, 1.589296f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_32[28] = {
+  9.818970f, 9.751199f, 8.015079f, 7.927318f, 8.029113f, 7.938330f,  8.012570f,
+  7.923719f, 8.033508f, 7.941911f, 8.933057f, 8.857422f, 8.935639f,  8.859187f,
+  8.905495f, 8.829741f, 8.929428f, 8.851351f, 4.114069f, 0.954752f,  2.645082f,
+  2.709703f, 0.964678f, 2.652077f, 2.673393f, 9.430499f, 11.922798f, 16.942251f,
+};
+
+static const float av1_simple_motion_search_term_none_std_32[28] = {
+  1.737107f, 1.734327f, 1.727923f, 1.720244f, 1.721570f, 1.712775f, 1.718028f,
+  1.710370f, 1.711612f, 1.702596f, 1.754856f, 1.748855f, 1.741871f, 1.736304f,
+  1.722428f, 1.717380f, 1.713563f, 1.707582f, 1.761170f, 0.207847f, 0.900058f,
+  0.862356f, 0.184593f, 0.903822f, 0.856120f, 1.529199f, 1.412085f, 1.453153f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_16[28] = {
+  8.998877f, 8.912468f, 7.085255f, 6.953476f, 7.086386f, 6.954091f,  7.088727f,
+  6.955747f, 7.093955f, 6.960635f, 8.065050f, 7.961432f, 8.071631f,  7.967233f,
+  8.041699f, 7.937715f, 8.046791f, 7.942183f, 3.833521f, 0.978421f,  1.901347f,
+  1.950124f, 0.979418f, 1.928000f, 1.936727f, 9.773951f, 10.735227f, 15.949769f,
+};
+
+static const float av1_simple_motion_search_term_none_std_16[28] = {
+  1.641193f, 1.640172f, 1.614794f, 1.608906f, 1.609571f, 1.603580f, 1.606928f,
+  1.601246f, 1.599230f, 1.593529f, 1.633747f, 1.630219f, 1.625695f, 1.622547f,
+  1.633827f, 1.630182f, 1.626607f, 1.622777f, 1.548838f, 0.145303f, 0.744550f,
+  0.736552f, 0.141980f, 0.742979f, 0.736977f, 1.366255f, 1.258794f, 1.294309f,
+};
+
+static const float av1_simple_motion_search_term_none_model_128[] = {
+  -0.6106842357f, -1.0402954455f, 0.6054417656f,  -0.2116623578f,
+  0.2447714930f,  0.3782256209f,  0.5095592479f,  -0.3275620904f,
+  0.3886188013f,  0.2629499420f,  -0.1979599415f, -0.5389565605f,
+  0.1209207902f,  -0.4913347466f, 0.3798542731f,  -0.2812861709f,
+  -0.1049824167f, -0.1088672020f, 0.4059596517f,  -0.1347896613f,
+  0.2276868621f,  0.0506386970f,  0.0071088411f,  0.0467952100f,
+  0.2091247458f,  -0.7371964736f, 0.1368935545f,  0.3175247786f,
+  -0.5493146094f,
+};
+
+static const float av1_simple_motion_search_term_none_model_64[] = {
+  -0.4150046575f, -0.3954358561f, 0.1997997444f,  0.3395826831f,
+  0.2827215753f,  0.3395683652f,  0.2483140395f,  0.2722216476f,
+  0.2610308009f,  0.3724974359f,  -0.0551479654f, -0.1721616359f,
+  -0.3459358629f, -0.0952524186f, -0.1428993840f, -0.0415654914f,
+  -0.3169539902f, -0.0269429900f, 0.9891530919f,  -0.0125084982f,
+  0.0972182377f,  0.0008889801f,  0.0205418050f,  0.0057237854f,
+  0.1005222691f,  -0.2851321920f, -1.5150336445f, 0.1893942436f,
+  -0.4337360901f,
+};
+
+static const float av1_simple_motion_search_term_none_model_32[] = {
+  -0.4667392852f, -0.3893302767f, 0.1603498635f,  0.2304974726f,
+  0.1404975592f,  0.2505516225f,  0.1423053884f,  0.2189318406f,
+  0.1379765409f,  0.2638241296f,  -0.1342865463f, -0.0549054345f,
+  -0.1925223436f, -0.1142702769f, 0.0127811659f,  0.0868639997f,
+  -0.0643197251f, 0.0279496470f,  0.9904395769f,  -0.0095178685f,
+  0.1179410649f,  -0.0013411972f, 0.0095060660f,  0.0195730400f,
+  0.0779717771f,  -0.2498860763f, -0.8168817125f, -0.4798397348f,
+  -0.6609679881f,
+};
+
+static const float av1_simple_motion_search_term_none_model_16[] = {
+  -0.3021081992f, -0.4620153673f, 0.0448577479f,  0.1738455035f,
+  0.0663209177f,  0.1629614573f,  0.0555168744f,  0.1631870212f,
+  0.0425805150f,  0.1688564954f,  0.0434083772f,  -0.0046603915f,
+  -0.0271580056f, -0.0183879127f, 0.1073730471f,  0.0314201476f,
+  0.0576891756f,  0.0119723753f,  0.9084332022f,  -0.0188429077f,
+  0.0755089811f,  -0.0172550234f, 0.0037663075f,  0.0022094472f,
+  0.0500247894f,  -0.2944572004f, -0.8908521199f, -0.2555515792f,
+  -0.5396254205f,
+};
+
+// Early termination in firstpass
+static const float av1_fp_simple_motion_search_term_none_mean_32[20] = {
+  10.216787f, 10.167575f, 8.405353f, 8.340786f,  8.436503f,
+  8.373259f,  8.444113f,  8.379074f, 8.448215f,  8.384669f,
+  4.107491f,  0.923902f,  2.702687f, 2.712742f,  0.953166f,
+  2.703244f,  2.707070f,  9.549801f, 12.013671f, 17.059454f,
+};
+
+static const float av1_fp_simple_motion_search_term_none_std_32[20] = {
+  1.886182f, 1.886638f, 1.884324f, 1.883410f, 1.851800f, 1.851652f, 1.847129f,
+  1.848014f, 1.832187f, 1.832360f, 1.758185f, 0.265155f, 0.939592f, 0.932395f,
+  0.211284f, 0.950024f, 0.945295f, 1.846744f, 1.453674f, 1.505994f,
+};
+
+static const float av1_fp_simple_motion_search_term_none_mean_16[20] = {
+  9.131485f, 9.065489f, 7.254479f, 7.158092f, 7.274240f,  7.178158f,  7.278780f,
+  7.182110f, 7.278793f, 7.182714f, 3.981902f, 0.964040f,  2.080875f,  2.087185f,
+  0.973397f, 2.088189f, 2.090166f, 9.386505f, 10.826546f, 15.985614f,
+};
+
+static const float av1_fp_simple_motion_search_term_none_std_16[20] = {
+  1.681172f, 1.688587f, 1.710854f, 1.717533f, 1.684010f, 1.691476f, 1.683537f,
+  1.691523f, 1.674699f, 1.682130f, 1.639731f, 0.186191f, 0.796448f, 0.795075f,
+  0.160921f, 0.791005f, 0.790048f, 1.430960f, 1.337976f, 1.370498f,
+};
+
+static const float av1_fp_simple_motion_search_term_none_mean_8[20] = {
+  7.821461f, 7.714526f, 5.799360f, 5.606948f, 5.805885f, 5.614357f,  5.794252f,
+  5.599669f, 5.798780f, 5.605399f, 4.069016f, 0.977720f, 1.577513f,  1.581266f,
+  0.983371f, 1.524603f, 1.524952f, 9.221803f, 9.508886f, 14.972815f,
+};
+
+static const float av1_fp_simple_motion_search_term_none_std_8[20] = {
+  1.618036f, 1.634415f, 1.652861f, 1.672006f, 1.646337f, 1.664935f, 1.650876f,
+  1.670476f, 1.645141f, 1.664301f, 1.502258f, 0.147592f, 0.760353f, 0.762547f,
+  0.127879f, 0.741096f, 0.742186f, 1.042003f, 1.292524f, 1.250398f,
+};
+
+#define NUM_HIDDEN_LAYERS_32 1
+#define NUM_FEATURES_32 20
+#define NUM_LAYER_0_UNITS_32 20
+#define NUM_LOGITS_32 1
+
+static const float
+    av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_32[] = {
+      -0.293987f,   0.796773f,     -0.0888487f, -0.00796495f, -0.343768f,
+      0.0783252f,   0.0596814f,    -0.235432f,  -0.0780005f,  -0.409017f,
+      -0.256821f,   -0.281654f,    1.00889f,    0.701893f,    -0.0181661f,
+      0.119718f,    0.0956582f,    0.76792f,    0.235693f,    0.351628f,
+      -1.28111f,    -1.45847f,     0.387732f,   0.476054f,    0.384561f,
+      0.427465f,    0.11875f,      -0.0176598f, -0.0528453f,  0.395589f,
+      -0.331994f,   0.0442108f,    0.195171f,   -0.0377402f,  -0.0736457f,
+      -0.0490903f,  0.116165f,     -0.549512f,  0.12968f,     0.641055f,
+      -1.03066f,    -0.601979f,    0.351981f,   -0.122019f,   0.00869275f,
+      0.399222f,    -0.343995f,    -0.444257f,  -0.160805f,   -0.537537f,
+      0.261478f,    -0.163785f,    0.218916f,   0.106506f,    -0.103819f,
+      0.0121841f,   0.284757f,     -0.362989f,  1.10793f,     0.477236f,
+      -0.424117f,   -0.884156f,    -0.468291f,  -0.510531f,   0.791441f,
+      0.75243f,     0.839871f,     0.604127f,   -0.182956f,   -0.246703f,
+      -1.25861f,    0.0546303f,    0.0811323f,  0.00655988f,  0.0286305f,
+      -0.00938366f, -0.0291418f,   -0.231632f,  -0.331077f,   1.12479f,
+      -0.635514f,   -0.146066f,    0.853122f,   0.923699f,    0.180011f,
+      -0.252973f,   0.1474f,       -0.454344f,  0.354736f,    0.576872f,
+      -1.43275f,    0.0327868f,    0.140849f,   -0.102523f,   0.0524867f,
+      0.007091f,    -0.00232578f,  -0.536116f,  -0.700144f,   0.166646f,
+      0.0636548f,   0.44645f,      -0.346062f,  -0.685779f,   -1.0792f,
+      -0.999219f,   0.442744f,     0.371198f,   0.777914f,    0.719409f,
+      -0.417984f,   0.0602868f,    0.0225539f,  0.0457407f,   0.0249501f,
+      0.0126021f,   0.00450792f,   0.0485095f,  0.203485f,    0.584116f,
+      -0.599426f,   -0.244633f,    0.168231f,   -0.00134934f, -0.106987f,
+      -0.0490239f,  -0.22029f,     0.138017f,   0.373674f,    0.00638684f,
+      -2.08003f,    0.106453f,     0.124456f,   -0.0286108f,  0.0422698f,
+      0.013734f,    0.0780971f,    -0.40173f,   0.473453f,    1.16836f,
+      -0.251035f,   0.0119074f,    0.319241f,   0.0422023f,   -0.730454f,
+      -0.745948f,   0.796709f,     0.277634f,   0.09711f,     -0.212224f,
+      0.825348f,    0.0208521f,    -0.0238098f, 0.00929265f,  0.0516351f,
+      -0.02329f,    0.0983163f,    -0.180721f,  0.0122096f,   -0.246159f,
+      0.61468f,     0.923765f,     0.240435f,   -0.294845f,   -0.495317f,
+      -0.0563837f,  -0.417936f,    0.154874f,   -0.604407f,   -0.0681337f,
+      -0.65738f,    -0.0270073f,   0.0920023f,  -0.0742724f,  0.820862f,
+      -0.602758f,   -1.20617f,     -0.201707f,  0.869499f,    -0.0539076f,
+      0.403097f,    0.429168f,     -0.938227f,  -0.830894f,   -0.362462f,
+      -0.0658648f,  0.471469f,     -0.264827f,  0.610275f,    0.367995f,
+      0.735662f,    -0.0473157f,   -0.0380545f, -0.0848067f,  -0.146108f,
+      -0.125875f,   -0.0576117f,   -0.296198f,  -0.100443f,   -0.212971f,
+      0.593524f,    1.23111f,      -0.810009f,  -0.604572f,   0.203021f,
+      0.256285f,    -1.17049f,     -1.19156f,   0.24365f,     0.727876f,
+      -0.466826f,   0.0298762f,    -0.0331735f, -0.0109056f,  0.0114862f,
+      0.00396703f,  0.0385985f,    -0.0587946f, 0.821079f,    0.0582033f,
+      0.349156f,    1.03529f,      -0.407036f,  0.200308f,    -0.265649f,
+      -0.104567f,   0.161149f,     -0.0717528f, -0.0112724f,  0.0681578f,
+      0.103809f,    -0.0807997f,   0.0316814f,  -0.332323f,   0.112254f,
+      -0.163981f,   0.118988f,     -0.777055f,  -1.34047f,    -0.910482f,
+      0.74599f,     -0.59633f,     0.165649f,   -0.594998f,   0.0845802f,
+      0.00440975f,  0.122606f,     -0.463991f,  0.418502f,    -0.339126f,
+      1.41847f,     -0.109594f,    -0.411879f,  -0.444865f,   -0.0404821f,
+      -0.0607352f,  -0.663753f,    -0.724327f,  -0.138642f,   0.834144f,
+      -0.811695f,   -0.930264f,    0.150993f,   -0.325565f,   0.0615853f,
+      -0.473993f,   0.0966587f,    0.315197f,   1.0345f,      0.35441f,
+      0.703234f,    -0.335715f,    0.783153f,   0.467976f,    -0.0234736f,
+      0.549724f,    0.539107f,     -0.510182f,  -0.154442f,   0.0126656f,
+      1.66711f,     0.884555f,     0.118675f,   -0.341705f,   0.195316f,
+      -0.0366564f,  -0.619244f,    -0.634092f,  -0.559951f,   0.0564255f,
+      0.765917f,    0.0510238f,    0.0667615f,  0.0699302f,   -0.0351751f,
+      -0.0484402f,  -0.000792665f, -0.10775f,   -0.337121f,   -0.983947f,
+      0.517793f,    1.34977f,      -0.567602f,  0.129921f,    -0.443722f,
+      -0.276277f,   -0.501404f,    -0.183234f,  -0.553055f,   -0.447434f,
+      -0.35529f,    -0.0444689f,   0.0192031f,  0.0372702f,   -0.195202f,
+      -0.020753f,   -0.0247035f,   0.420298f,   1.39373f,     0.203699f,
+      -0.218818f,   0.250734f,     -0.0282348f, 0.411986f,    -0.262946f,
+      0.526339f,    0.242769f,     -0.159857f,  -0.546788f,   -0.0410147f,
+      0.954238f,    -0.0252765f,   0.639488f,   -0.491367f,   -0.0572638f,
+      0.285763f,    -0.45764f,     0.121657f,   -1.24374f,    -0.372479f,
+      -0.111521f,   0.194134f,     -0.271364f,  0.179678f,    0.121237f,
+      -0.14305f,    -0.205662f,    0.216891f,   0.344568f,    -0.523745f,
+      -1.00908f,    0.180965f,     0.0263031f,  -0.0556144f,  0.0831083f,
+      -0.0623274f,  0.112748f,     0.597137f,   -0.502616f,   -1.10624f,
+      -0.0487462f,  -1.10744f,     -0.125653f,  0.277049f,    -0.141329f,
+      -0.00457003f, -0.161038f,    0.588462f,   0.323317f,    0.49762f,
+      0.477561f,    0.901705f,     -0.264511f,  0.256557f,    0.076023f,
+      -0.0460696f,  0.0830666f,    -0.0651269f, -0.881245f,   -0.285999f,
+      0.53127f,     0.914533f,     0.0505795f,  -0.3054f,     -0.0988696f,
+      -0.0658403f,  0.15979f,      -0.453316f,  -0.824834f,   -0.280222f,
+      -0.686952f,   -0.0768344f,   -1.12235f,   -0.815408f,   0.0202134f,
+      -0.111892f,   0.0847659f,    -0.18763f,   0.597782f,    0.364016f
+    };
+
+static const float
+    av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_32[] = {
+      -1.541f,     -0.00935641f, -1.50754f, -0.638648f, -0.679403f,
+      -0.0387804f, -0.714791f,   -1.69522f, 0.435677f,  -1.5846f,
+      0.108788f,   0.614982f,    0.111048f, -0.465826f, -0.611358f,
+      0.637197f,   0.929621f,    -1.20889f, 0.954558f,  0.716529f
+    };
+
+static const float av1_fp_simple_motion_search_term_none_logits_kernel_32[] = {
+  0.396195f,   -0.791364f,  -0.881893f, 1.0542069f, 0.772562f,
+  0.60815647f, 1.117405f,   -1.272638f, 0.483183f,  -0.917147f,
+  0.690799f,   -0.601466f,  -0.545536f, -0.416353f, -0.927874f,
+  0.972198f,   -0.3770457f, 0.542694f,  -0.591889f, 0.464565f
+};
+
+static const float av1_fp_simple_motion_search_term_none_logits_bias_32[] = {
+  -0.590318f
+};
+
+static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_32 = {
+  NUM_FEATURES_32,
+  NUM_LOGITS_32,
+  NUM_HIDDEN_LAYERS_32,
+  {
+      NUM_LAYER_0_UNITS_32,
+  },
+  {
+      av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_32,
+      av1_fp_simple_motion_search_term_none_logits_kernel_32,
+  },
+  {
+      av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_32,
+      av1_fp_simple_motion_search_term_none_logits_bias_32,
+  },
+};
+
+#undef NUM_HIDDEN_LAYERS_32
+#undef NUM_FEATURES_32
+#undef NUM_LAYER_0_UNITS_32
+#undef NUM_LOGITS_32
+
+#define NUM_HIDDEN_LAYERS_16 1
+#define NUM_FEATURES_16 20
+#define NUM_LAYER_0_UNITS_16 24
+#define NUM_LOGITS_16 1
+
+static const float
+    av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_16[] = {
+      -0.315922f,   0.74455f,     -0.0196939f,  0.238336f,    0.288554f,
+      0.0845902f,   -0.0121831f,  0.455303f,    0.0235902f,   0.218997f,
+      -0.0445164f,  0.0752211f,   0.0539915f,   -0.0439682f,  -0.397139f,
+      -0.0030004f,  -0.106365f,   0.845384f,    0.684638f,    -0.965702f,
+      0.307643f,    -0.0433377f,  -0.0644826f,  -0.214946f,   -0.44467f,
+      0.142967f,    0.0109982f,   -0.344458f,   -0.42947f,    0.269175f,
+      -0.88534f,    -0.28077f,    -1.36018f,    -0.33725f,    -0.0885953f,
+      -0.123887f,   0.218107f,    -0.0759977f,  0.739124f,    0.684048f,
+      0.577964f,    -0.328481f,   -0.247837f,   0.00546713f,  0.191895f,
+      -0.145274f,   0.320121f,    -0.482379f,   0.534585f,    -0.1582f,
+      0.944784f,    0.944665f,    0.0494451f,   -0.0399724f,  -0.170375f,
+      -0.0869746f,  0.106216f,    -0.120556f,   -1.57849f,    -0.752895f,
+      0.424454f,    -0.0269515f,  0.00398589f,  0.214165f,    -0.142986f,
+      0.199223f,    0.049624f,    -0.116783f,   -0.648119f,   -0.311599f,
+      0.122629f,    -0.0338422f,  0.345092f,    -0.408254f,   0.601037f,
+      -0.00146985f, 0.00133926f,  0.0392668f,   -0.931156f,   0.31429f,
+      -0.150243f,   0.0755763f,   -0.32177f,    0.258521f,    -0.104078f,
+      -0.144506f,   0.0199566f,   -0.454723f,   -0.292959f,   -0.0953681f,
+      -1.24843f,    0.446814f,    -0.311363f,   0.0590878f,   -0.0568717f,
+      -0.421585f,   0.179852f,    0.668763f,    0.48914f,     0.290584f,
+      -1.14053f,    -1.37576f,    0.420112f,    -0.158582f,   0.268231f,
+      0.252999f,    0.276423f,    0.529033f,    0.141127f,    0.702762f,
+      0.181407f,    -0.0279289f,  -0.0194757f,  0.0752152f,   -0.136963f,
+      0.00902489f,  0.125334f,    0.0680212f,   -0.370449f,   0.438003f,
+      -0.600869f,   0.154209f,    -0.36306f,    -0.484209f,   0.140093f,
+      0.0743079f,   -0.143317f,   0.0442872f,   0.272089f,    0.601531f,
+      1.20687f,     -0.280695f,   0.222235f,    -0.0106747f,  -0.017026f,
+      0.204008f,    -0.0316111f,  -0.64679f,    -0.866749f,   -0.774231f,
+      0.306231f,    -0.0940114f,  -0.56555f,    -0.34399f,    0.425142f,
+      0.424064f,    -0.50189f,    -0.146558f,   0.544899f,    0.141728f,
+      1.14592f,     -0.0124826f,  0.111613f,    -0.0862228f,  0.0211737f,
+      0.0614017f,   0.0245077f,   -0.454523f,   -0.0766391f,  -0.436808f,
+      0.251409f,    -0.13354f,    -0.242447f,   -0.311807f,   -0.844505f,
+      -0.671486f,   0.0946297f,   0.241702f,    0.856521f,    0.529763f,
+      -0.869772f,   -0.0016341f,  0.14511f,     0.0136254f,   -0.0359721f,
+      -0.0454713f,  0.00664495f,  0.0373555f,   0.653991f,    -0.075867f,
+      -0.102728f,   -0.947685f,   -0.119479f,   -0.145413f,   0.148364f,
+      0.310885f,    -0.266837f,   0.354087f,    0.299469f,    0.603911f,
+      0.257161f,    0.0190527f,   0.152862f,    -0.0987196f,  -0.293369f,
+      0.139026f,    -0.128421f,   0.0505933f,   -0.703803f,   1.08628f,
+      -0.562294f,   -0.818943f,   0.102178f,    0.727399f,    -0.228433f,
+      0.484057f,    0.0595919f,   -0.0559087f,  -0.549447f,   0.176168f,
+      1.41744f,     -0.126284f,   0.0987251f,   -0.00123073f, 0.00510827f,
+      0.105209f,    0.0671775f,   -0.438525f,   0.211028f,    -0.782459f,
+      0.286411f,    -0.459887f,   0.0633669f,   0.329958f,    -0.0736945f,
+      0.45188f,     -0.2447f,     0.676601f,    0.600321f,    -0.0336198f,
+      0.108531f,    0.0452834f,   -0.0848577f,  0.0731281f,   1.32381f,
+      -0.118349f,   0.129497f,    -0.840938f,   -1.45444f,    -0.559047f,
+      -0.248109f,   -0.491559f,   -0.139812f,   0.175964f,    0.168687f,
+      0.123031f,    0.201625f,    0.422849f,    0.34436f,     0.0426694f,
+      0.558045f,    -0.246772f,   0.679483f,    -0.0959578f,  -0.102879f,
+      0.391029f,    0.280906f,    0.0867408f,   -1.10932f,    0.402526f,
+      -0.227285f,   0.336087f,    -0.237765f,   0.185619f,    -0.309732f,
+      0.0781132f,   -0.0234955f,  0.0828806f,   0.19966f,     -0.241288f,
+      -0.224634f,   0.0638918f,   -0.143521f,   -0.0206692f,  -0.27131f,
+      0.973051f,    1.12031f,     0.262846f,    0.471585f,    0.105231f,
+      -0.386434f,   -0.355846f,   0.7359f,      0.567308f,    0.130768f,
+      0.242369f,    -0.0272523f,  -0.118436f,   0.374145f,    0.24802f,
+      -1.00186f,    -0.0241195f,  0.0140446f,   0.0202831f,   0.163197f,
+      0.0399298f,   -0.00912791f, -0.280572f,   -0.309893f,   -0.644495f,
+      0.243838f,    0.731391f,    0.0725078f,   0.350308f,    -0.136691f,
+      0.208814f,    0.0218567f,   -0.0805393f,  -0.18681f,    -0.214638f,
+      0.273354f,    -0.355047f,   0.242748f,    0.472951f,    -0.202705f,
+      0.405247f,    0.161622f,    -0.284883f,   -1.31181f,    -0.661056f,
+      -0.248219f,   -0.827307f,   0.289221f,    0.660529f,    0.48563f,
+      0.407366f,    0.0327303f,   -0.0610309f,  -0.647064f,   0.0899991f,
+      0.376267f,    1.27555f,     0.0264175f,   0.153931f,    1.07345f,
+      0.0715052f,   0.174473f,    0.01322f,     -0.715723f,   0.113909f,
+      0.100968f,    -0.457287f,   -0.672022f,   -0.20532f,    0.895176f,
+      0.357034f,    0.5413f,      0.918393f,    -0.455f,      -0.499617f,
+      -1.21799f,    0.0634338f,   0.144944f,    -0.106715f,   0.0227713f,
+      -0.0203213f,  0.030851f,    -0.0726756f,  0.589192f,    -0.060841f,
+      -0.198521f,   0.497179f,    -0.0591156f,  -0.135466f,   -0.132638f,
+      -0.181333f,   -0.332358f,   0.0349959f,   0.212885f,    -0.536206f,
+      -0.425009f,   -0.035525f,   0.0384449f,   0.0360549f,   -0.0383953f,
+      -0.0263281f,  -0.0228435f,  1.11771f,     0.928061f,    -0.163923f,
+      -0.327868f,   -0.894518f,   0.00448907f,  0.0805977f,   0.329559f,
+      0.157429f,    0.292729f,    0.497688f,    0.188659f,    0.203724f,
+      -1.26001f,    -0.0392533f,  -0.0566088f,  0.000859925f, 0.125254f,
+      0.054261f,    0.0357295f,   -0.393813f,   -0.275944f,   0.299657f,
+      -0.211421f,   0.038172f,    -0.439829f,   -0.913949f,   0.35642f,
+      0.865473f,    -0.472033f,   -0.752376f,   0.995255f,    0.417965f,
+      -0.680645f,   0.0622027f,   0.128878f,    -0.0357859f,  0.0793577f,
+      0.203629f,    -0.0600867f,  0.0512268f,   0.528584f,    0.23889f,
+      0.38255f,     -0.216407f,   -0.0338828f,  0.0328103f,   -0.885678f,
+      -0.716634f,   0.438663f,    0.320841f,    -0.119656f,   0.626092f,
+      0.8526f,      -0.0325005f,  -0.0275416f,  -0.171131f,   0.0260563f,
+      -0.0162027f,  0.0879367f,   -0.340473f,   0.0220265f,   -0.1731f,
+      0.512539f,    0.587822f,    -0.175619f,   0.177215f,    -0.35458f,
+      -0.159059f,   -0.423754f,   0.0198413f,   -0.336208f,   -0.359052f,
+      -1.50819f,    0.0628184f,   0.054506f,    0.0048834f,   0.361657f,
+      0.00986886f,  -0.0721521f,  -0.256765f,   1.41173f,     0.376196f,
+      -0.0783331f,  0.174803f,    -0.00240091f, -0.306571f,   -0.304654f,
+      -0.0348377f,  0.115569f,    -0.20359f,    -0.162341f,   -0.0443526f,
+      -0.848317f,   -0.228167f,   0.699534f,    0.482092f,    -0.0921484f,
+      -0.172425f,   -0.0610094f,  -0.188327f,   0.836209f,    0.541725f
+    };
+
+static const float
+    av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_16[] = {
+      -0.388147f, -0.0868767f, 0.702129f,  0.376659f, -0.709988f, 0.496603f,
+      -0.238442f, -1.35761f,   -0.391887f, 0.235468f, -0.327982f, 0.731842f,
+      1.0949f,    -0.789218f,  -0.881452f, 0.514341f, 0.727894f,  -0.494498f,
+      -1.32304f,  -1.22643f,   -0.294287f, -1.3974f,  -0.128148f, -0.0956137f
+    };
+
+static const float av1_fp_simple_motion_search_term_none_logits_kernel_16[] = {
+  0.456147f,   0.248707f,  -0.5205241f, -0.1506567f, 0.388359f,   -0.6074409f,
+  -0.4719775f, -0.733864f, 0.5588447f,  -0.4021345f, -1.140733f,  -0.73399f,
+  -0.4299591f, 0.450688f,  0.817564f,   -0.265486f,  -0.3525806f, 0.55188314f,
+  1.365457f,   1.180764f,  0.587772f,   -0.870683f,  0.818839f,   0.318488f
+};
+
+static const float av1_fp_simple_motion_search_term_none_logits_bias_16[] = {
+  -0.1046478f
+};
+
+static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_16 = {
+  NUM_FEATURES_16,
+  NUM_LOGITS_16,
+  NUM_HIDDEN_LAYERS_16,
+  {
+      NUM_LAYER_0_UNITS_16,
+  },
+  {
+      av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_16,
+      av1_fp_simple_motion_search_term_none_logits_kernel_16,
+  },
+  {
+      av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_16,
+      av1_fp_simple_motion_search_term_none_logits_bias_16,
+  },
+};
+
+#undef NUM_HIDDEN_LAYERS_16
+#undef NUM_FEATURES_16
+#undef NUM_LAYER_0_UNITS_16
+#undef NUM_LOGITS_16
+
+#define NUM_HIDDEN_LAYERS_8 1
+#define NUM_FEATURES_8 20
+#define NUM_LAYER_0_UNITS_8 16
+#define NUM_LOGITS_8 1
+
+static const float
+    av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_8[] = {
+      -1.11024f,    -0.530449f,    -0.164768f,  0.675431f,   0.456155f,
+      0.711099f,    -0.248095f,    0.112132f,   -0.131481f,  0.234457f,
+      0.128073f,    0.306214f,     0.175471f,   0.220189f,   -0.270533f,
+      0.293534f,    -0.0795547f,   0.234901f,   -0.191754f,  0.101171f,
+      -0.108621f,   0.395477f,     -0.529459f,  -0.354854f,  -0.941334f,
+      -0.237689f,   0.39357f,      0.527129f,   0.174333f,   -0.00520422f,
+      1.22219f,     -0.21815f,     0.0866816f,  -0.29591f,   -0.212968f,
+      0.00431436f,  -0.295382f,    -0.582317f,  -0.284654f,  0.486427f,
+      -0.202448f,   -0.0421883f,   -0.116346f,  -0.345832f,  -0.0471637f,
+      -0.149954f,   -0.0969526f,   -0.59491f,   0.594364f,   0.298285f,
+      -1.33301f,    0.149562f,     0.097433f,   0.157641f,   -0.231132f,
+      -0.0191656f,  0.149396f,     0.811553f,   1.07336f,    0.140674f,
+      1.02134f,     0.455909f,     -0.0548795f, 0.0459996f,  -0.0589837f,
+      -0.116328f,   -0.607502f,    -0.232595f,  -0.517977f,  -0.325901f,
+      1.35047f,     -0.148698f,    0.0313182f,  0.181634f,   0.06539f,
+      0.00820322f,  0.0522113f,    -1.06071f,   -0.817999f,  -0.527422f,
+      -1.39175f,    -0.110088f,    0.0858626f,  -0.247541f,  0.29043f,
+      1.13767f,     0.185834f,     0.390613f,   -0.501175f,  -0.214176f,
+      -0.256376f,   0.496687f,     0.240471f,   0.218852f,   0.513543f,
+      0.400559f,    -0.249168f,    -0.752987f,  0.430491f,   -0.72299f,
+      0.339754f,    0.396623f,     -0.0638322f, 0.353122f,   0.355662f,
+      -0.0704821f,  0.195448f,     0.179396f,   0.486533f,   0.0815535f,
+      -0.503726f,   -0.000321223f, 0.501591f,   -0.117849f,  0.217667f,
+      -0.123391f,   -0.4026f,      0.149756f,   -0.0359276f, -0.0990213f,
+      -0.215278f,   -0.293649f,    0.301629f,   -0.11081f,   -0.206725f,
+      -0.00147108f, 0.363644f,     -0.430092f,  0.169524f,   0.116091f,
+      -0.583605f,   -0.0974948f,   0.253256f,   0.22648f,    0.136902f,
+      -0.882541f,   -0.75078f,     -0.0629343f, 0.411035f,   0.265742f,
+      -0.360904f,   -0.899324f,    0.605871f,   0.0318372f,  0.0735312f,
+      -0.00960722f, 0.691249f,     0.127449f,   -0.133021f,  -0.0793589f,
+      0.665591f,    -0.0682262f,   -0.0437626f, 0.0783621f,  2.25727f,
+      0.126529f,    -0.0320763f,   -0.261759f,  -1.19987f,   0.216295f,
+      -0.253886f,   -0.642908f,    0.1865f,     0.00299179f, 0.0246782f,
+      -0.00750628f, 0.566367f,     0.99916f,    -0.0209625f, 0.273254f,
+      1.09724f,     0.30026f,      0.21585f,    -0.0276715f, 0.338996f,
+      0.129884f,    -0.00628438f,  0.0461783f,  -1.36378f,   -0.394756f,
+      -0.395261f,   0.215928f,     0.252803f,   -0.207108f,  -0.0506214f,
+      -0.0138889f,  0.124197f,     -0.0522996f, 0.533803f,   -0.25729f,
+      -0.463514f,   0.128322f,     -1.04751f,   -0.605498f,  -0.107235f,
+      -0.00813289f, 0.539742f,     -0.0524178f, 0.272101f,   0.151935f,
+      0.607511f,    -0.0608427f,   0.36342f,    0.0999134f,  0.69712f,
+      -0.152471f,   0.364244f,     0.410644f,   0.312606f,   0.405679f,
+      -0.371656f,   -0.0492209f,   -0.148911f,  0.214996f,   -0.274749f,
+      -0.0372888f,  0.079023f,     -0.429136f,  -1.30393f,   -0.833824f,
+      -1.31373f,    -0.445343f,    0.526917f,   1.30569f,    -0.0626746f,
+      0.282353f,    -0.28552f,     0.28084f,    -0.234934f,  0.227076f,
+      1.09919f,     0.33248f,      -0.114933f,  0.40629f,    0.331031f,
+      0.245334f,    -0.0318782f,   0.00735305f, -1.58715f,   0.126443f,
+      -0.09472f,    -0.182152f,    0.311673f,   -0.186136f,  0.817743f,
+      0.928961f,    0.117334f,     -0.373644f,  -0.0797864f, 0.205565f,
+      0.0789797f,   0.0757131f,    -0.152409f,  0.30301f,    -0.0170824f,
+      -0.194496f,   0.485547f,     0.370124f,   -0.802044f,  -0.789671f,
+      0.669258f,    0.55082f,      -0.438853f,  0.0597597f,  -0.0148101f,
+      -0.41603f,    0.0486339f,    -0.464523f,  -0.413725f,  0.00907629f,
+      0.70351f,     -0.136422f,    -0.145957f,  -0.0626726f, -0.115773f,
+      -0.333937f,   0.135474f,     -0.379598f,  -0.134422f,  0.227595f,
+      0.908927f,    0.759504f,     -0.0088258f, -0.349333f,  0.122667f,
+      -0.682175f,   0.2201f,       -0.332003f,  -0.44433f,   -0.620308f,
+      -1.36716f,    -0.0167907f,   -0.538969f,  0.256824f,   -0.0706724f,
+      -0.0392471f,  -0.156312f,    0.153699f,   1.41967f,    0.0434739f,
+      0.428178f,    -0.0714879f,   0.0912104f,  0.00687985f, 0.341789f,
+      0.217381f,    0.128288f,     0.0286751f,  0.527344f,   -0.428139f,
+      0.60908f,     1.02074f,      -0.0977894f, 0.158067f,   0.28958f,
+      -0.065152f,   0.120616f,     -0.882976f,  -1.10413f,   -1.37497f
+    };
+
+static const float
+    av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_8[] = {
+      1.37086f,  -1.61858f, -1.32395f,  0.276031f, -0.124696f, -1.71489f,
+      -1.68429f, 1.79103f,  -0.335306f, -1.81523f, 0.841083f,  -0.542628f,
+      -1.82168f, 0.459829f, 0.0949306f, 0.918486f
+    };
+
+static const float av1_fp_simple_motion_search_term_none_logits_kernel_8[] = {
+  -0.283418f, -0.444453f, 0.4977782f, -0.4138758f, 0.41890771f, 0.22149438f,
+  0.545079f,  -0.729164f, 0.619389f,  0.5169534f,  -0.4236282f, 0.7304213f,
+  0.531938f,  -0.14828f,  0.75119f,   -0.464074f
+};
+
+static const float av1_fp_simple_motion_search_term_none_logits_bias_8[] = {
+  -2.22338f
+};
+
+static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_8 = {
+  NUM_FEATURES_8,
+  NUM_LOGITS_8,
+  NUM_HIDDEN_LAYERS_8,
+  {
+      NUM_LAYER_0_UNITS_8,
+  },
+  {
+      av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_8,
+      av1_fp_simple_motion_search_term_none_logits_kernel_8,
+  },
+  {
+      av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_8,
+      av1_fp_simple_motion_search_term_none_logits_bias_8,
+  },
+};
 
 #undef NUM_HIDDEN_LAYERS_8
 #undef NUM_FEATURES_8
 #undef NUM_LAYER_0_UNITS_8
 #undef NUM_LOGITS_8
-#endif
+
+static const float av1_fp_simple_motion_search_term_none_thresh_32 =
+    -2.2884985045792563f;
+static const float av1_fp_simple_motion_search_term_none_thresh_16 =
+    -1.6656874577527165f;
+static const float av1_fp_simple_motion_search_term_none_thresh_8 =
+    -3.608804354309157f;
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/partition_strategy.c b/libaom/av1/encoder/partition_strategy.c
new file mode 100644
index 0000000..e8270b3
--- /dev/null
+++ b/libaom/av1/encoder/partition_strategy.c
@@ -0,0 +1,727 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/partition_model_weights.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/rdopt.h"
+
+// Performs a simple_motion_search with a single reference frame and extract
+// the variance of residues. Here features is assumed to be a length 6 array.
+// After this function is called, we will store the following in to features:
+// features[0] = log(1 + dc_q**2/256)
+// features[1] = log(1 + variance_of_residue)
+// for i in [2, 3, 4, 5]:
+//  features[i] = log(1 + variance_of_residue_in_block[i]/variance_of_residue)
+static void get_res_var_features(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
+                                 int mi_col, BLOCK_SIZE bsize,
+                                 float *features) {
+  // TODO(chiyotsai@google.com): The data this model trained on did not also use
+  // SIMPLE_TRANSLATION to build the inter_predictor. Retraining and tuning the
+  // model with the correct data should give better performance.
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  // Perform a single motion search in Y_PLANE to make a prediction
+  const int use_subpixel = 0;
+
+  // Start getting the features
+  int f_idx = 0;
+
+  // Q_INDEX
+  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+  aom_clear_system_state();
+  features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+
+  // VARIANCE
+  unsigned int sse = 0;
+  unsigned int var = 0;
+  const MV ref_mv_full = { .row = 0, .col = 0 };
+  av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full,
+                            use_subpixel, &sse, &var);
+  aom_clear_system_state();
+  features[f_idx++] = logf(1.0f + (float)var);
+
+  // Regional
+  const uint8_t *src = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *dst = xd->plane[0].dst.buf;
+  const int dst_stride = xd->plane[0].dst.stride;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  int r_idx = 0;
+  for (r_idx = 0; r_idx < 4; r_idx++) {
+    const int x_idx = (r_idx & 1) * bw / 2;
+    const int y_idx = (r_idx >> 1) * bh / 2;
+    const int src_offset = y_idx * src_stride + x_idx;
+    const int dst_offset = y_idx * dst_stride + x_idx;
+    const unsigned int sub_var = cpi->fn_ptr[subsize].vf(
+        src + src_offset, src_stride, dst + dst_offset, dst_stride, &sse);
+    aom_clear_system_state();
+    const float var_ratio = (1.0f + (float)sub_var) / (4.0f + (float)var);
+    features[f_idx++] = var_ratio;
+  }
+}
+
+void av1_simple_motion_search_based_split(
+    AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col,
+    BLOCK_SIZE bsize, int *partition_none_allowed, int *partition_horz_allowed,
+    int *partition_vert_allowed, int *do_rectangular_split,
+    int *do_square_split) {
+  const NN_CONFIG *nn_config = NULL;
+  float split_only_thresh = 0.0f;
+  if (bsize == BLOCK_128X128) {
+    nn_config = &av1_simple_motion_search_based_split_nn_config_128;
+    split_only_thresh = av1_simple_motion_search_based_split_thresh_128;
+  } else if (bsize == BLOCK_64X64) {
+    nn_config = &av1_simple_motion_search_based_split_nn_config_64;
+    split_only_thresh = av1_simple_motion_search_based_split_thresh_64;
+  } else if (bsize == BLOCK_32X32) {
+    nn_config = &av1_simple_motion_search_based_split_nn_config_32;
+    split_only_thresh = av1_simple_motion_search_based_split_thresh_32;
+  } else if (bsize == BLOCK_16X16) {
+    nn_config = &av1_simple_motion_search_based_split_nn_config_16;
+    split_only_thresh = av1_simple_motion_search_based_split_thresh_16;
+  } else if (bsize == BLOCK_8X8) {
+    // Disable BLOCK_8X8 for now
+#if !CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8
+    nn_config = &av1_simple_motion_search_based_split_nn_config_8;
+    split_only_thresh = av1_simple_motion_search_based_split_thresh_8;
+#endif
+  } else {
+    assert(0 && "Unexpected block size in simple_motion_based_split");
+  }
+  if (nn_config) {
+    float features[6] = { 0 };
+    float score = 0;
+    get_res_var_features(cpi, x, mi_row, mi_col, bsize, features);
+    av1_nn_predict(features, nn_config, &score);
+
+    if (score > split_only_thresh) {
+      *partition_none_allowed = 0;
+      *partition_horz_allowed = 0;
+      *partition_vert_allowed = 0;
+      *do_rectangular_split = 0;
+    }
+    if (cpi->sf.simple_motion_search_split_only >= 2) {
+      if (score < -split_only_thresh) *do_square_split = 0;
+      // For larger scores (>split_only_thresh), none and rectangular partitions
+      // are skipped. As score reduces, possibility of split decreases. Hence
+      // for near larger scores (.875 * split_only_thresh to split_only_thresh)
+      // none partition is disabled, but rectangular partitions are evaluated
+      // additionally.
+      if (score > (split_only_thresh * 0.875)) *partition_none_allowed = 0;
+    }
+  }
+}
+
+// Given a list of ref frames in refs, performs simple_motion_search on each of
+// the refs and returns the ref with the smallest sse. Returns -1 if none of the
+// ref in the list is available. Also stores the best sse and var in best_sse,
+// best_var, respectively. If save_mv_code is -1, don't update mv_ref_fulls in
+// pc_tree. If save_mv_code is between 0 and 3, update mv_ref_fulls under
+// pc_tree->split[i]. If save_mv_code is 4, update mv_ref_fulls under pc_tree.
+static int simple_motion_search_get_best_ref(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, const int *const refs, int num_refs,
+    int use_subpixel, int save_mv_code, unsigned int *best_sse,
+    unsigned int *best_var) {
+  // TODO(chiyotsai@google.com): The calculation of variance currently uses
+  // bsize, so we might take area outside of the image into account. We need to
+  // modify the SIMD functions to fix this later.
+  const AV1_COMMON *const cm = &cpi->common;
+  int best_ref = -1;
+
+  if (mi_col >= cm->mi_cols || mi_row >= cm->mi_rows) {
+    // If the whole block is outside of the image, set the var and sse to 0.
+    *best_var = 0;
+    *best_sse = 0;
+
+    return best_ref;
+  }
+
+  // Otherwise do loop through the reference frames and find the one with the
+  // minimum SSE
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MV *mv_ref_fulls = pc_tree->mv_ref_fulls;
+
+  const int num_planes = 1;
+
+  *best_sse = INT_MAX;
+
+  for (int ref_idx = 0; ref_idx < num_refs; ref_idx++) {
+    const int ref = refs[ref_idx];
+
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
+      unsigned int curr_sse = 0, curr_var = 0;
+      av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
+                               mv_ref_fulls[ref], num_planes, use_subpixel);
+      curr_var = cpi->fn_ptr[bsize].vf(
+          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
+          xd->plane[0].dst.stride, &curr_sse);
+      if (curr_sse < *best_sse) {
+        *best_sse = curr_sse;
+        *best_var = curr_var;
+        best_ref = ref;
+      }
+
+      const int new_mv_row = x->best_mv.as_mv.row / 8;
+      const int new_mv_col = x->best_mv.as_mv.col / 8;
+      if (save_mv_code == 4) {
+        pc_tree->mv_ref_fulls[ref].row = new_mv_row;
+        pc_tree->mv_ref_fulls[ref].col = new_mv_col;
+      } else if (save_mv_code >= 0 && save_mv_code < 4) {
+        // Propagate the new motion vectors to a lower level
+        pc_tree->split[save_mv_code]->mv_ref_fulls[ref].row = new_mv_row;
+        pc_tree->split[save_mv_code]->mv_ref_fulls[ref].col = new_mv_col;
+      } else {
+        assert(save_mv_code == -1 &&
+               "Unknown code in simple_motion_search_get_best_ref.");
+      }
+    }
+  }
+
+  return best_ref;
+}
+
+// Performs fullpixel simple_motion_search with LAST_FRAME and ALTREF_FRAME on
+// each subblock and extract the variance and sse of residues. Then store the
+// var and sse from each partition subblock to features. The DC qindex is also
+// stored in features.
+// Here features is assumed to be a length 19 array.
+// After this function is called, we will store the following to features:
+// features[0:17] = var and sse from subblocks
+// features[18] = DC q_index
+static void simple_motion_search_prune_part_features(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, float *features) {
+  // TODO(chiyotsai@google.com): Cache the result of the motion search from the
+  // larger bsize.
+  const int w_mi = mi_size_wide[bsize];
+  const int h_mi = mi_size_high[bsize];
+  int f_idx = 0;
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
+         cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+
+  // Setting up motion search
+  const int ref_list[] = { LAST_FRAME, ALTREF_FRAME };
+  const int num_refs = 2;
+  const int use_subpixel = 1;
+
+  unsigned int int_features[FEATURE_SIZE_SMS_PRUNE_PART - 1];
+
+  // Doing whole block first to update the mv
+  simple_motion_search_get_best_ref(
+      cpi, x, pc_tree, mi_row, mi_col, bsize, ref_list, num_refs, use_subpixel,
+      4, &int_features[f_idx], &int_features[f_idx + 1]);
+  f_idx += 2;
+
+  // Split subblocks
+  BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  int r_idx = 0;
+  for (r_idx = 0; r_idx < 4; r_idx++) {
+    const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
+    const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
+
+    simple_motion_search_get_best_ref(
+        cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+        use_subpixel, r_idx, &int_features[f_idx], &int_features[f_idx + 1]);
+    f_idx += 2;
+  }
+
+  // Horz subblocks
+  subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+  for (r_idx = 0; r_idx < 2; r_idx++) {
+    const int sub_mi_col = mi_col + 0;
+    const int sub_mi_row = mi_row + r_idx * h_mi / 2;
+
+    simple_motion_search_get_best_ref(
+        cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+        use_subpixel, -1, &int_features[f_idx], &int_features[f_idx + 1]);
+
+    f_idx += 2;
+  }
+
+  // Vert subblock
+  subsize = get_partition_subsize(bsize, PARTITION_VERT);
+  for (r_idx = 0; r_idx < 2; r_idx++) {
+    const int sub_mi_col = mi_col + r_idx * w_mi / 2;
+    const int sub_mi_row = mi_row + 0;
+
+    simple_motion_search_get_best_ref(
+        cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+        use_subpixel, -1, &int_features[f_idx], &int_features[f_idx + 1]);
+
+    f_idx += 2;
+  }
+
+  aom_clear_system_state();
+  for (int idx = 0; idx < f_idx; idx++) {
+    features[idx] = logf(1.0f + (float)int_features[idx]);
+  }
+
+  const MACROBLOCKD *xd = &x->e_mbd;
+  set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+  // Q_INDEX
+  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+  features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+
+  // Neighbor stuff
+  const int has_above = !!xd->above_mbmi;
+  const int has_left = !!xd->left_mbmi;
+  const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->sb_type : bsize;
+  const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->sb_type : bsize;
+  features[f_idx++] = (float)has_above;
+  features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
+  features[f_idx++] = (float)mi_size_high_log2[above_bsize];
+  features[f_idx++] = (float)has_left;
+  features[f_idx++] = (float)mi_size_wide_log2[left_bsize];
+  features[f_idx++] = (float)mi_size_high_log2[left_bsize];
+
+  assert(f_idx == FEATURE_SIZE_SMS_PRUNE_PART);
+}
+
+void av1_simple_motion_search_prune_part(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
+    int *partition_horz_allowed, int *partition_vert_allowed,
+    int *do_square_split, int *do_rectangular_split, int *prune_horz,
+    int *prune_vert, float *features, int *valid) {
+  const AV1_COMMON *const cm = &cpi->common;
+  // Get model parameters
+  const NN_CONFIG *nn_config = NULL;
+  const float *prune_thresh = NULL, *only_thresh = NULL;
+  const float *ml_mean = NULL, *ml_std = NULL;
+  float normalized_features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
+
+  if (bsize == BLOCK_128X128) {
+    nn_config = &av1_simple_motion_search_prune_part_nn_config_128;
+    ml_mean = av1_simple_motion_search_prune_part_mean_128;
+    ml_std = av1_simple_motion_search_prune_part_std_128;
+    prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_128;
+    only_thresh = av1_simple_motion_search_prune_part_only_thresh_128;
+  } else if (bsize == BLOCK_64X64) {
+    nn_config = &av1_simple_motion_search_prune_part_nn_config_64;
+    ml_mean = av1_simple_motion_search_prune_part_mean_64;
+    ml_std = av1_simple_motion_search_prune_part_std_64;
+    prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_64;
+    only_thresh = av1_simple_motion_search_prune_part_only_thresh_64;
+  } else if (bsize == BLOCK_32X32) {
+    nn_config = &av1_simple_motion_search_prune_part_nn_config_32;
+    ml_mean = av1_simple_motion_search_prune_part_mean_32;
+    ml_std = av1_simple_motion_search_prune_part_std_32;
+    prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_32;
+    only_thresh = av1_simple_motion_search_prune_part_only_thresh_32;
+  } else if (bsize == BLOCK_16X16) {
+    nn_config = &av1_simple_motion_search_prune_part_nn_config_16;
+    ml_mean = av1_simple_motion_search_prune_part_mean_16;
+    ml_std = av1_simple_motion_search_prune_part_std_16;
+    prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_16;
+    only_thresh = av1_simple_motion_search_prune_part_only_thresh_16;
+  } else if (bsize == BLOCK_8X8) {
+    nn_config = &av1_simple_motion_search_prune_part_nn_config_8;
+    ml_mean = av1_simple_motion_search_prune_part_mean_8;
+    ml_std = av1_simple_motion_search_prune_part_std_8;
+    prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_8;
+    only_thresh = av1_simple_motion_search_prune_part_only_thresh_8;
+  } else {
+    assert(0 && "Unexpected block size in simple_motion_prune_part");
+  }
+
+  // If there is no valid threshold, return immediately.
+  if (!nn_config || (prune_thresh[PARTITION_HORZ] == 0.0f &&
+                     prune_thresh[PARTITION_VERT] == 0.0f)) {
+    return;
+  }
+  if (bsize < BLOCK_8X8) {
+    return;
+  }
+
+  // Get features
+  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+                                           bsize, features);
+  *valid = 1;
+  for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) {
+    normalized_features[f_idx] =
+        (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+  }
+
+  // Get probabilities
+  float scores[EXT_PARTITION_TYPES] = { 0.0f },
+        probs[EXT_PARTITION_TYPES] = { 0.0f };
+  const int num_classes = (bsize == BLOCK_128X128 || bsize == BLOCK_8X8)
+                              ? PARTITION_TYPES
+                              : EXT_PARTITION_TYPES;
+
+  av1_nn_predict(normalized_features, nn_config, scores);
+  aom_clear_system_state();
+
+  av1_nn_softmax(scores, probs, num_classes);
+
+  // Determine if we should prune rectangular partitions.
+  if (cpi->sf.simple_motion_search_prune_rect && !frame_is_intra_only(cm) &&
+      (*partition_horz_allowed || *partition_vert_allowed) &&
+      bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
+    *prune_horz = probs[PARTITION_HORZ] <= prune_thresh[PARTITION_HORZ];
+    *prune_vert = probs[PARTITION_VERT] <= prune_thresh[PARTITION_VERT];
+  }
+
+  // Silence compiler warnings
+  (void)only_thresh;
+  (void)partition_none_allowed;
+  (void)do_square_split;
+  (void)do_rectangular_split;
+}
+
+// Early terminates PARTITION_NONE using simple_motion_search features and the
+// rate, distortion, and rdcost of PARTITION_NONE. This is only called when:
+//  - The frame is a show frame
+//  - The frame is not intra only
+//  - The current bsize is > BLOCK_8X8
+//  - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc,
+    int *early_terminate, float *simple_motion_features,
+    int *simple_motion_features_are_valid) {
+  // TODO(chiyotsai@google.com): There are other features we can extract from
+  // PARTITION_NONE. Play with this later.
+  int f_idx = 0;
+  if (!*simple_motion_features_are_valid) {
+    simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+                                             bsize, simple_motion_features);
+    *simple_motion_features_are_valid = 1;
+  }
+  f_idx = 25;
+
+  simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->rate);
+  simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->dist);
+  simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->rdcost);
+
+  assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE);
+
+  const float *ml_mean = NULL;
+  const float *ml_std = NULL;
+  const float *ml_model = NULL;
+
+  if (bsize == BLOCK_128X128) {
+    ml_mean = av1_simple_motion_search_term_none_mean_128;
+    ml_std = av1_simple_motion_search_term_none_std_128;
+    ml_model = av1_simple_motion_search_term_none_model_128;
+  } else if (bsize == BLOCK_64X64) {
+    ml_mean = av1_simple_motion_search_term_none_mean_64;
+    ml_std = av1_simple_motion_search_term_none_std_64;
+    ml_model = av1_simple_motion_search_term_none_model_64;
+  } else if (bsize == BLOCK_32X32) {
+    ml_mean = av1_simple_motion_search_term_none_mean_32;
+    ml_std = av1_simple_motion_search_term_none_std_32;
+    ml_model = av1_simple_motion_search_term_none_model_32;
+  } else if (bsize == BLOCK_16X16) {
+    ml_mean = av1_simple_motion_search_term_none_mean_16;
+    ml_std = av1_simple_motion_search_term_none_std_16;
+    ml_model = av1_simple_motion_search_term_none_model_16;
+  } else {
+    assert(0 && "Unexpected block size in simple_motion_term_none");
+  }
+
+  if (ml_model) {
+    float score = 0.0f;
+    for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) {
+      score += ml_model[f_idx] *
+               (simple_motion_features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+    }
+    score += ml_model[FEATURE_SIZE_SMS_TERM_NONE];
+
+    if (score >= 0.0f) {
+      *early_terminate = 1;
+    }
+  }
+}
+
+static void firstpass_simple_motion_search_features(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, float *features) {
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
+         cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+
+  // Setting up motion search
+  const int ref_list[] = { LAST_FRAME, ALTREF_FRAME };
+  const int num_refs = 2;
+  const int use_subpixel = 0;
+
+  unsigned int int_features[10] = { 0 };
+
+  int f_idx = 0;
+  // Doing whole block first to update the mv
+  simple_motion_search_get_best_ref(
+      cpi, x, pc_tree, mi_row, mi_col, bsize, ref_list, num_refs, use_subpixel,
+      4, &int_features[f_idx], &int_features[f_idx + 1]);
+  f_idx += 2;
+
+  // Split subblocks
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  const int w_mi = mi_size_wide[bsize];
+  const int h_mi = mi_size_high[bsize];
+  for (int r_idx = 0; r_idx < 4; r_idx++) {
+    const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
+    const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
+
+    simple_motion_search_get_best_ref(
+        cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+        use_subpixel, r_idx, &int_features[f_idx], &int_features[f_idx + 1]);
+    f_idx += 2;
+  }
+
+  aom_clear_system_state();
+  for (int idx = 0; idx < f_idx; idx++) {
+    features[idx] = logf(1.0f + (float)int_features[idx]);
+  }
+
+  const MACROBLOCKD *xd = &x->e_mbd;
+  set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+  // Q_INDEX
+  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+  features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+
+  // Neighbor stuff
+  const int has_above = !!xd->above_mbmi;
+  const int has_left = !!xd->left_mbmi;
+  const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->sb_type : bsize;
+  const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->sb_type : bsize;
+  features[f_idx++] = (float)has_above;
+  features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
+  features[f_idx++] = (float)mi_size_high_log2[above_bsize];
+  features[f_idx++] = (float)has_left;
+  features[f_idx++] = (float)mi_size_wide_log2[left_bsize];
+  features[f_idx++] = (float)mi_size_high_log2[left_bsize];
+}
+
+void av1_firstpass_simple_motion_search_early_term(AV1_COMP *const cpi,
+                                                   MACROBLOCK *x,
+                                                   PC_TREE *pc_tree, int mi_row,
+                                                   int mi_col, BLOCK_SIZE bsize,
+                                                   const RD_STATS *none_rdc,
+                                                   int *do_square_split) {
+  const NN_CONFIG *nn_config = NULL;
+  float thresh = 0.0f;
+  const float *ml_mean = NULL, *ml_std = NULL;
+  if (bsize == BLOCK_32X32) {
+    nn_config = &av1_fp_simple_motion_search_term_none_nn_config_32;
+    ml_mean = av1_fp_simple_motion_search_term_none_mean_32;
+    ml_std = av1_fp_simple_motion_search_term_none_std_32;
+    thresh = av1_fp_simple_motion_search_term_none_thresh_32;
+  } else if (bsize == BLOCK_16X16) {
+    nn_config = &av1_fp_simple_motion_search_term_none_nn_config_16;
+    ml_mean = av1_fp_simple_motion_search_term_none_mean_16;
+    ml_std = av1_fp_simple_motion_search_term_none_std_16;
+    thresh = av1_fp_simple_motion_search_term_none_thresh_16;
+  } else if (bsize == BLOCK_8X8) {
+    nn_config = &av1_fp_simple_motion_search_term_none_nn_config_8;
+    ml_mean = av1_fp_simple_motion_search_term_none_mean_8;
+    ml_std = av1_fp_simple_motion_search_term_none_std_8;
+    thresh = av1_fp_simple_motion_search_term_none_thresh_8;
+  } else {
+    assert(0 &&
+           "Unexpected bsize in firstpass_simple_motion_search_early_term");
+    return;
+  }
+
+  float ml_features[FEATURE_SIZE_FP_SMS_TERM_NONE] = { 0.0f };
+
+  firstpass_simple_motion_search_features(cpi, x, pc_tree, mi_row, mi_col,
+                                          bsize, ml_features);
+  int f_idx = 17;
+
+  ml_features[f_idx++] = logf(1.0f + (float)none_rdc->rate);
+  ml_features[f_idx++] = logf(1.0f + (float)none_rdc->dist);
+  ml_features[f_idx++] = logf(1.0f + (float)none_rdc->rdcost);
+
+  for (f_idx = 0; f_idx < 20; f_idx++) {
+    ml_features[f_idx] = (ml_features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+  }
+
+  // Get probabilities
+  float score = 0.0f;
+
+  av1_nn_predict(ml_features, nn_config, &score);
+  aom_clear_system_state();
+
+  // Determine if we should prune square partitions.
+  if (score < thresh) {
+    *do_square_split = 0;
+  }
+}
+
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+                                        int mi_row, int mi_col,
+                                        float *features) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+
+  assert(sb_size == BLOCK_128X128);
+
+  int f_idx = 0;
+
+  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+  aom_clear_system_state();
+  const float log_q_sq = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+
+  // Perform full-pixel single motion search in Y plane of 16x16 mbs in the sb
+  float sum_mv_row_sq = 0;
+  float sum_mv_row = 0;
+  float min_abs_mv_row = FLT_MAX;
+  float max_abs_mv_row = 0;
+
+  float sum_mv_col_sq = 0;
+  float sum_mv_col = 0;
+  float min_abs_mv_col = FLT_MAX;
+  float max_abs_mv_col = 0;
+
+  float sum_log_sse_sq = 0;
+  float sum_log_sse = 0;
+  float min_log_sse = FLT_MAX;
+  float max_log_sse = 0;
+
+  const BLOCK_SIZE mb_size = BLOCK_16X16;
+  const int mb_rows = block_size_high[sb_size] / block_size_high[mb_size];
+  const int mb_cols = block_size_wide[sb_size] / block_size_wide[mb_size];
+  const int mb_in_mi_size_high_log2 = mi_size_high_log2[mb_size];
+  const int mb_in_mi_size_wide_log2 = mi_size_wide_log2[mb_size];
+
+  for (int mb_row = 0; mb_row < mb_rows; mb_row++)
+    for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+      const int this_mi_row = mi_row + (mb_row << mb_in_mi_size_high_log2);
+      const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2);
+      unsigned int sse = 0;
+      unsigned int var = 0;
+      const MV ref_mv_full = { .row = 0, .col = 0 };
+
+      av1_simple_motion_sse_var(cpi, x, this_mi_row, this_mi_col, mb_size,
+                                ref_mv_full, 0, &sse, &var);
+
+      aom_clear_system_state();
+      const float mv_row = (float)(x->best_mv.as_mv.row / 8);
+      const float mv_col = (float)(x->best_mv.as_mv.col / 8);
+      const float log_sse = logf(1.0f + (float)sse);
+      const float abs_mv_row = fabsf(mv_row);
+      const float abs_mv_col = fabsf(mv_col);
+
+      sum_mv_row_sq += mv_row * mv_row;
+      sum_mv_row += mv_row;
+      sum_mv_col_sq += mv_col * mv_col;
+      sum_mv_col += mv_col;
+
+      if (abs_mv_row < min_abs_mv_row) min_abs_mv_row = abs_mv_row;
+      if (abs_mv_row > max_abs_mv_row) max_abs_mv_row = abs_mv_row;
+      if (abs_mv_col < min_abs_mv_col) min_abs_mv_col = abs_mv_col;
+      if (abs_mv_col > max_abs_mv_col) max_abs_mv_col = abs_mv_col;
+
+      sum_log_sse_sq += log_sse * log_sse;
+      sum_log_sse += log_sse;
+      if (log_sse < min_log_sse) min_log_sse = log_sse;
+      if (log_sse > max_log_sse) max_log_sse = log_sse;
+    }
+  aom_clear_system_state();
+  const float avg_mv_row = sum_mv_row / 64.0f;
+  const float var_mv_row = sum_mv_row_sq / 64.0f - avg_mv_row * avg_mv_row;
+
+  const float avg_mv_col = sum_mv_col / 64.0f;
+  const float var_mv_col = sum_mv_col_sq / 64.0f - avg_mv_col * avg_mv_col;
+
+  const float avg_log_sse = sum_log_sse / 64.0f;
+  const float var_log_sse = sum_log_sse_sq / 64.0f - avg_log_sse * avg_log_sse;
+
+  features[f_idx++] = avg_log_sse;
+  features[f_idx++] = avg_mv_col;
+  features[f_idx++] = avg_mv_row;
+  features[f_idx++] = log_q_sq;
+  features[f_idx++] = max_abs_mv_col;
+  features[f_idx++] = max_abs_mv_row;
+  features[f_idx++] = max_log_sse;
+  features[f_idx++] = min_abs_mv_col;
+  features[f_idx++] = min_abs_mv_row;
+  features[f_idx++] = min_log_sse;
+  features[f_idx++] = var_log_sse;
+  features[f_idx++] = var_mv_col;
+  features[f_idx++] = var_mv_row;
+
+  assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED);
+}
+
+BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+                                     const float *features) {
+  float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f },
+        probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
+  const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config;
+
+  assert(cpi->sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE);
+
+  aom_clear_system_state();
+  av1_nn_predict(features, nn_config, scores);
+  av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED);
+
+  int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1;
+  if (cpi->sf.auto_max_partition_based_on_simple_motion == DIRECT_PRED) {
+    result = 0;
+    float max_prob = probs[0];
+    for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) {
+      if (probs[i] > max_prob) {
+        max_prob = probs[i];
+        result = i;
+      }
+    }
+  } else if (cpi->sf.auto_max_partition_based_on_simple_motion ==
+             RELAXED_PRED) {
+    for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0;
+         --result) {
+      if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) {
+        probs[result] += probs[result + 1];
+      }
+      if (probs[result] > 0.2) break;
+    }
+  } else if (cpi->sf.auto_max_partition_based_on_simple_motion == ADAPT_PRED) {
+    const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
+    MACROBLOCKD *const xd = &x->e_mbd;
+    // TODO(debargha): x->source_variance is unavailable at this point,
+    // so compute. The redundant recomputation later can be removed.
+    const unsigned int source_variance =
+        is_cur_buf_hbd(xd)
+            ? av1_high_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size,
+                                                 xd->bd)
+            : av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size);
+    if (source_variance > 16) {
+      const double thresh = source_variance < 128 ? 0.05 : 0.1;
+      for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0;
+           --result) {
+        if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) {
+          probs[result] += probs[result + 1];
+        }
+        if (probs[result] > thresh) break;
+      }
+    }
+  }
+
+  return (BLOCK_SIZE)((result + 2) * 3);
+}
diff --git a/libaom/av1/encoder/partition_strategy.h b/libaom/av1/encoder/partition_strategy.h
new file mode 100644
index 0000000..36b1e95
--- /dev/null
+++ b/libaom/av1/encoder/partition_strategy.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+#define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encoder.h"
+
+#define FEATURE_SIZE_SMS_PRUNE_PART 25
+#define FEATURE_SIZE_SMS_TERM_NONE 28
+#define FEATURE_SIZE_FP_SMS_TERM_NONE 20
+#define FEATURE_SIZE_MAX_MIN_PART_PRED 13
+#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
+
+// Performs a simple_motion_search with a single reference frame and extract
+// the variance of residues. Then use the features to determine whether we want
+// to go straight to splitting without trying PARTITION_NONE
+void av1_simple_motion_search_based_split(
+    AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col,
+    BLOCK_SIZE bsize, int *partition_none_allowed, int *partition_horz_allowed,
+    int *partition_vert_allowed, int *do_rectangular_split,
+    int *do_square_split);
+
+// Performs a simple_motion_search with two reference frames and extract
+// the variance of residues. Then use the features to determine whether we want
+// to prune some partitions.
+void av1_simple_motion_search_prune_part(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
+    int *partition_horz_allowed, int *partition_vert_allowed,
+    int *do_square_split, int *do_rectangular_split, int *prune_horz,
+    int *prune_vert, float *features, int *valid);
+
+// Early terminates PARTITION_NONE using simple_motion_search features and the
+// rate, distortion, and rdcost of PARTITION_NONE. This is only called when:
+//  - The frame is a show frame
+//  - The frame is not intra only
+//  - The current bsize is > BLOCK_8X8
+//  - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc,
+    int *early_terminate, float *simple_motion_features,
+    int *simple_motion_features_are_valid);
+
+// Early terminates after PARTITION_NONE in firstpass of two pass partition
+// search.
+void av1_firstpass_simple_motion_search_early_term(AV1_COMP *const cpi,
+                                                   MACROBLOCK *x,
+                                                   PC_TREE *pc_tree, int mi_row,
+                                                   int mi_col, BLOCK_SIZE bsize,
+                                                   const RD_STATS *none_rdc,
+                                                   int *do_square_split);
+
+// Get the features for selecting the max and min partition size. Currently this
+// performs simple_motion_search on 16X16 subblocks of the currnet superblock,
+// and then extract the statistics of sse and motion vectors as features.
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+                                        int mi_row, int mi_col,
+                                        float *features);
+
+// Predict the maximum BLOCK_SIZE to be used to encoder the current superblock.
+BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+                                     const float *features);
+
+// A simplified version of set_offsets meant to be used for
+// simple_motion_search.
+static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi,
+                                                 MACROBLOCK *const x,
+                                                 int mi_row, int mi_col,
+                                                 BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+
+  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+
+  // Set up destination pointers.
+  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes);
+
+  // Set up limit values for MV components.
+  // Mv beyond the range do not produce new/different prediction block.
+  x->mv_limits.row_min =
+      -(((mi_row + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
+  x->mv_limits.col_min = -(((mi_col + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
+  x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND;
+  x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND;
+
+  set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+  // Set up distance of MB to edge of frame in 1/8th pel units.
+  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_bottom_edge = ((cm->mi_rows - mi_height - mi_row) * MI_SIZE) * 8;
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = ((cm->mi_cols - mi_width - mi_col) * MI_SIZE) * 8;
+
+  // Set up source buffers.
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+
+  // R/D setup.
+  x->rdmult = cpi->rd.RDMULT;
+}
+
+static INLINE void init_simple_motion_search_mvs(PC_TREE *pc_tree) {
+  for (int idx = 0; idx < REF_FRAMES; idx++) {
+    pc_tree->mv_ref_fulls[idx].row = 0;
+    pc_tree->mv_ref_fulls[idx].col = 0;
+  }
+  if (pc_tree->block_size >= BLOCK_8X8) {
+    init_simple_motion_search_mvs(pc_tree->split[0]);
+    init_simple_motion_search_mvs(pc_tree->split[1]);
+    init_simple_motion_search_mvs(pc_tree->split[2]);
+    init_simple_motion_search_mvs(pc_tree->split[3]);
+  }
+}
+
+static INLINE int is_full_sb(AV1_COMMON *const cm, int mi_row, int mi_col,
+                             BLOCK_SIZE sb_size) {
+  const int sb_mi_wide = mi_size_wide[sb_size];
+  const int sb_mi_high = mi_size_high[sb_size];
+
+  return (mi_row + sb_mi_high) <= cm->mi_rows &&
+         (mi_col + sb_mi_wide) <= cm->mi_cols;
+}
+
+static INLINE int use_auto_max_partition(AV1_COMP *const cpi,
+                                         BLOCK_SIZE sb_size, int mi_row,
+                                         int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  return !frame_is_intra_only(cm) &&
+         cpi->sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE &&
+         sb_size == BLOCK_128X128 && is_full_sb(cm, mi_row, mi_col, sb_size) &&
+         cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index] !=
+             OVERLAY_UPDATE &&
+         cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index] !=
+             INTNL_OVERLAY_UPDATE;
+}
+
+#endif  // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
diff --git a/libaom/av1/encoder/pass2_strategy.c b/libaom/av1/encoder/pass2_strategy.c
new file mode 100644
index 0000000..ac22b68
--- /dev/null
+++ b/libaom/av1/encoder/pass2_strategy.c
@@ -0,0 +1,1787 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/onyxc_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+double calculate_active_area(const AV1_COMP *cpi,
+                             const FIRSTPASS_STATS *this_frame) {
+  double active_pct;
+
+  active_pct =
+      1.0 -
+      ((this_frame->intra_skip_pct / 2) +
+       ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows));
+  return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Calculate a modified Error used in distributing bits between easier and
+// harder frames.
+#define ACT_AREA_CORRECTION 0.5
+double calculate_modified_err(const AV1_COMP *cpi, const TWO_PASS *twopass,
+                              const AV1EncoderConfig *oxcf,
+                              const FIRSTPASS_STATS *this_frame) {
+  const FIRSTPASS_STATS *const stats = &twopass->total_stats;
+  const double av_weight = stats->weight / stats->count;
+  const double av_err = (stats->coded_error * av_weight) / stats->count;
+  double modified_error =
+      av_err * pow(this_frame->coded_error * this_frame->weight /
+                       DOUBLE_DIVIDE_CHECK(av_err),
+                   oxcf->two_pass_vbrbias / 100.0);
+
+  // Correction for active area. Frames with a reduced active area
+  // (eg due to formatting bars) have a higher error per mb for the
+  // remaining active MBs. The correction here assumes that coding
+  // 0.5N blocks of complexity 2X is a little easier than coding N
+  // blocks of complexity X.
+  modified_error *=
+      pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
+  return fclamp(modified_error, twopass->modified_error_min,
+                twopass->modified_error_max);
+}
+
+// Resets the first pass file to the given position using a relative seek from
+// the current position.
+static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
+  p->stats_in = position;
+}
+
+static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
+  if (p->stats_in >= p->stats_in_end) return EOF;
+
+  *fps = *p->stats_in;
+  ++p->stats_in;
+  return 1;
+}
+
+// Read frame stats at an offset from the current position.
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
+  if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
+      (offset < 0 && p->stats_in + offset < p->stats_in_start)) {
+    return NULL;
+  }
+
+  return &p->stats_in[offset];
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section,
+                           const FIRSTPASS_STATS *frame) {
+  section->frame -= frame->frame;
+  section->weight -= frame->weight;
+  section->intra_error -= frame->intra_error;
+  section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
+  section->coded_error -= frame->coded_error;
+  section->sr_coded_error -= frame->sr_coded_error;
+  section->pcnt_inter -= frame->pcnt_inter;
+  section->pcnt_motion -= frame->pcnt_motion;
+  section->pcnt_second_ref -= frame->pcnt_second_ref;
+  section->pcnt_neutral -= frame->pcnt_neutral;
+  section->intra_skip_pct -= frame->intra_skip_pct;
+  section->inactive_zone_rows -= frame->inactive_zone_rows;
+  section->inactive_zone_cols -= frame->inactive_zone_cols;
+  section->MVr -= frame->MVr;
+  section->mvr_abs -= frame->mvr_abs;
+  section->MVc -= frame->MVc;
+  section->mvc_abs -= frame->mvc_abs;
+  section->MVrv -= frame->MVrv;
+  section->MVcv -= frame->MVcv;
+  section->mv_in_out_count -= frame->mv_in_out_count;
+  section->new_mv_count -= frame->new_mv_count;
+  section->count -= frame->count;
+  section->duration -= frame->duration;
+}
+
+// Calculate the linear size relative to a baseline of 1080P
+#define BASE_SIZE 2073600.0  // 1920x1080
+static double get_linear_size_factor(const AV1_COMP *cpi) {
+  const double this_area = cpi->initial_width * cpi->initial_height;
+  return pow(this_area / BASE_SIZE, 0.5);
+}
+
+// This function returns the maximum target rate per frame.
+static int frame_max_bits(const RATE_CONTROL *rc,
+                          const AV1EncoderConfig *oxcf) {
+  int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
+                      (int64_t)oxcf->two_pass_vbrmax_section) /
+                     100;
+  if (max_bits < 0)
+    max_bits = 0;
+  else if (max_bits > rc->max_frame_bandwidth)
+    max_bits = rc->max_frame_bandwidth;
+
+  return (int)max_bits;
+}
+
+static double calc_correction_factor(double err_per_mb, double err_divisor,
+                                     double pt_low, double pt_high, int q,
+                                     aom_bit_depth_t bit_depth) {
+  const double error_term = err_per_mb / err_divisor;
+
+  // Adjustment based on actual quantizer to power term.
+  const double power_term =
+      AOMMIN(av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
+
+  // Calculate correction factor.
+  if (power_term < 1.0) assert(error_term >= 0.0);
+
+  return fclamp(pow(error_term, power_term), 0.05, 5.0);
+}
+
+#define ERR_DIVISOR 100.0
+#define FACTOR_PT_LOW 0.70
+#define FACTOR_PT_HIGH 0.90
+
+// Similar to find_qindex_by_rate() function in ratectrl.c, but includes
+// calculation of a correction_factor.
+static int find_qindex_by_rate_with_correction(
+    int desired_bits_per_mb, aom_bit_depth_t bit_depth, FRAME_TYPE frame_type,
+    double error_per_mb, double ediv_size_correction,
+    double group_weight_factor, int best_qindex, int worst_qindex) {
+  assert(best_qindex <= worst_qindex);
+  int low = best_qindex;
+  int high = worst_qindex;
+  while (low < high) {
+    const int mid = (low + high) >> 1;
+    const double mid_factor =
+        calc_correction_factor(error_per_mb, ERR_DIVISOR - ediv_size_correction,
+                               FACTOR_PT_LOW, FACTOR_PT_HIGH, mid, bit_depth);
+    const int mid_bits_per_mb = av1_rc_bits_per_mb(
+        frame_type, mid, mid_factor * group_weight_factor, bit_depth);
+    if (mid_bits_per_mb > desired_bits_per_mb) {
+      low = mid + 1;
+    } else {
+      high = mid;
+    }
+  }
+#if CONFIG_DEBUG
+  assert(low == high);
+  const double low_factor =
+      calc_correction_factor(error_per_mb, ERR_DIVISOR - ediv_size_correction,
+                             FACTOR_PT_LOW, FACTOR_PT_HIGH, low, bit_depth);
+  const int low_bits_per_mb = av1_rc_bits_per_mb(
+      frame_type, low, low_factor * group_weight_factor, bit_depth);
+  assert(low_bits_per_mb <= desired_bits_per_mb || low == worst_qindex);
+#endif  // CONFIG_DEBUG
+  return low;
+}
+
+static int get_twopass_worst_quality(const AV1_COMP *cpi,
+                                     const double section_err,
+                                     double inactive_zone,
+                                     int section_target_bandwidth,
+                                     double group_weight_factor) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+
+  if (section_target_bandwidth <= 0) {
+    return rc->worst_quality;  // Highest value allowed
+  } else {
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                            ? cpi->initial_mbs
+                            : cpi->common.MBs;
+    const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+    const double av_err_per_mb = section_err / active_mbs;
+    const int target_norm_bits_per_mb =
+        (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
+        active_mbs;
+
+    // Larger image formats are expected to be a little harder to code
+    // relatively given the same prediction error score. This in part at
+    // least relates to the increased size and hence coding overheads of
+    // motion vectors. Some account of this is made through adjustment of
+    // the error divisor.
+    double ediv_size_correction =
+        AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi)));
+    if (ediv_size_correction < 1.0)
+      ediv_size_correction = -(1.0 / ediv_size_correction);
+    ediv_size_correction *= 4.0;
+
+    // Try and pick a max Q that will be high enough to encode the
+    // content at the given rate.
+    int q = find_qindex_by_rate_with_correction(
+        target_norm_bits_per_mb, cpi->common.seq_params.bit_depth, INTER_FRAME,
+        av_err_per_mb, ediv_size_correction, group_weight_factor,
+        rc->best_quality, rc->worst_quality);
+
+    // Restriction on active max q for constrained quality mode.
+    if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level);
+    return q;
+  }
+}
+
+#define SR_DIFF_PART 0.0015
+#define MOTION_AMP_PART 0.003
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.1
+#define SR_DIFF_MAX 128.0
+#define NCOUNT_FRAME_II_THRESH 5.0
+
+static double get_sr_decay_rate(const AV1_COMP *cpi,
+                                const FIRSTPASS_STATS *frame) {
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+                                                             : cpi->common.MBs;
+  double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
+  double sr_decay = 1.0;
+  double modified_pct_inter;
+  double modified_pcnt_intra;
+  const double motion_amplitude_factor =
+      frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
+
+  modified_pct_inter = frame->pcnt_inter;
+  if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+      (double)NCOUNT_FRAME_II_THRESH) {
+    modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+  }
+  modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+  if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
+    sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX);
+    sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
+               (MOTION_AMP_PART * motion_amplitude_factor) -
+               (INTRA_PART * modified_pcnt_intra);
+  }
+  return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
+}
+
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_zero_motion_factor(const AV1_COMP *cpi,
+                                     const FIRSTPASS_STATS *frame) {
+  const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
+  double sr_decay = get_sr_decay_rate(cpi, frame);
+  return AOMMIN(sr_decay, zero_motion_pct);
+}
+
+#define ZM_POWER_FACTOR 0.75
+
+static double get_prediction_decay_rate(const AV1_COMP *cpi,
+                                        const FIRSTPASS_STATS *next_frame) {
+  const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
+  const double zero_motion_factor =
+      (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
+                  ZM_POWER_FACTOR));
+
+  return AOMMAX(zero_motion_factor,
+                (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
+}
+
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval,
+                                      int still_interval,
+                                      double loop_decay_rate,
+                                      double last_decay_rate) {
+  TWO_PASS *const twopass = &cpi->twopass;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // Break clause to detect very still sections after motion
+  // For example a static image after a fade or other transition
+  // instead of a clean scene cut.
+  if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 &&
+      last_decay_rate < 0.9) {
+    int j;
+
+    // Look ahead a few frames to see if static condition persists...
+    for (j = 0; j < still_interval; ++j) {
+      const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
+      if (stats >= twopass->stats_in_end) break;
+
+      if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+    }
+
+    // Only if it does do we signal a transition to still.
+    return j == still_interval;
+  }
+
+  return 0;
+}
+
+// This function detects a flash through the high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this.
+static int detect_flash(const TWO_PASS *twopass, int offset) {
+  const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
+
+  // What we are looking for here is a situation where there is a
+  // brief break in prediction (such as a flash) but subsequent frames
+  // are reasonably well predicted by an earlier (pre flash) frame.
+  // The recovery after a flash is indicated by a high pcnt_second_ref
+  // compared to pcnt_inter.
+  return next_frame != NULL &&
+         next_frame->pcnt_second_ref > next_frame->pcnt_inter &&
+         next_frame->pcnt_second_ref >= 0.5;
+}
+
+// Update the motion related elements to the GF arf boost calculation.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+                                          double *mv_in_out,
+                                          double *mv_in_out_accumulator,
+                                          double *abs_mv_in_out_accumulator,
+                                          double *mv_ratio_accumulator) {
+  const double pct = stats->pcnt_motion;
+
+  // Accumulate Motion In/Out of frame stats.
+  *mv_in_out = stats->mv_in_out_count * pct;
+  *mv_in_out_accumulator += *mv_in_out;
+  *abs_mv_in_out_accumulator += fabs(*mv_in_out);
+
+  // Accumulate a measure of how uniform (or conversely how random) the motion
+  // field is (a ratio of abs(mv) / mv).
+  if (pct > 0.05) {
+    const double mvr_ratio =
+        fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+    const double mvc_ratio =
+        fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+
+    *mv_ratio_accumulator +=
+        pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs);
+    *mv_ratio_accumulator +=
+        pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs);
+  }
+}
+
+#define BASELINE_ERR_PER_MB 1000.0
+#define BOOST_FACTOR 12.5
+
+static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame,
+                               double this_frame_mv_in_out, double max_boost) {
+  double frame_boost;
+  const double lq = av1_convert_qindex_to_q(
+      cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth);
+  const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
+  int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+                                                       : cpi->common.MBs;
+
+  // Correct for any inactive region in the image
+  num_mbs = (int)AOMMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+
+  // Underlying boost factor is based on inter error ratio.
+  frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+                DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+  frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+
+  // Increase boost for frames where new data coming into frame (e.g. zoom out).
+  // Slightly reduce boost if there is a net balance of motion out of the frame
+  // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0.
+  if (this_frame_mv_in_out > 0.0)
+    frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+  // In the extreme case the boost is halved.
+  else
+    frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+  return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+#define GF_MAX_BOOST 90.0
+#define MIN_ARF_GF_BOOST 240
+#define MIN_DECAY_FACTOR 0.01
+
+static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames,
+                          int *f_boost, int *b_boost) {
+  TWO_PASS *const twopass = &cpi->twopass;
+  int i;
+  double boost_score = 0.0;
+  double mv_ratio_accumulator = 0.0;
+  double decay_accumulator = 1.0;
+  double this_frame_mv_in_out = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+  int arf_boost;
+  int flash_detected = 0;
+
+  // Search forward from the proposed arf/next gf position.
+  for (i = 0; i < f_frames; ++i) {
+    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    if (this_frame == NULL) break;
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(
+        this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
+
+    // Accumulate the effect of prediction quality decay.
+    if (!flash_detected) {
+      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                              ? MIN_DECAY_FACTOR
+                              : decay_accumulator;
+    }
+
+    boost_score +=
+        decay_accumulator *
+        calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+  }
+
+  *f_boost = (int)boost_score;
+
+  // Reset for backward looking loop.
+  boost_score = 0.0;
+  mv_ratio_accumulator = 0.0;
+  decay_accumulator = 1.0;
+  this_frame_mv_in_out = 0.0;
+  mv_in_out_accumulator = 0.0;
+  abs_mv_in_out_accumulator = 0.0;
+
+  // Search backward towards last gf position.
+  for (i = -1; i >= -b_frames; --i) {
+    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    if (this_frame == NULL) break;
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(
+        this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
+
+    // Cumulative effect of prediction quality decay.
+    if (!flash_detected) {
+      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                              ? MIN_DECAY_FACTOR
+                              : decay_accumulator;
+    }
+
+    boost_score +=
+        decay_accumulator *
+        calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+  }
+  *b_boost = (int)boost_score;
+
+  arf_boost = (*f_boost + *b_boost);
+  if (arf_boost < ((b_frames + f_frames) * 20))
+    arf_boost = ((b_frames + f_frames) * 20);
+  arf_boost = AOMMAX(arf_boost, MIN_ARF_GF_BOOST);
+
+  return arf_boost;
+}
+
+// Calculate a section intra ratio used in setting max loop filter.
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+                                         const FIRSTPASS_STATS *end,
+                                         int section_length) {
+  const FIRSTPASS_STATS *s = begin;
+  double intra_error = 0.0;
+  double coded_error = 0.0;
+  int i = 0;
+
+  while (s < end && i < section_length) {
+    intra_error += s->intra_error;
+    coded_error += s->coded_error;
+    ++s;
+    ++i;
+  }
+
+  return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
+}
+
+// Calculate the total bits to allocate in this GF/ARF group.
+static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
+                                             double gf_group_err) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+  int64_t total_group_bits;
+
+  // Calculate the bits to be allocated to the group as a whole.
+  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+    total_group_bits = (int64_t)(twopass->kf_group_bits *
+                                 (gf_group_err / twopass->kf_group_error_left));
+  } else {
+    total_group_bits = 0;
+  }
+
+  // Clamp odd edge cases.
+  total_group_bits = (total_group_bits < 0)
+                         ? 0
+                         : (total_group_bits > twopass->kf_group_bits)
+                               ? twopass->kf_group_bits
+                               : total_group_bits;
+
+  // Clip based on user supplied data rate variability limit.
+  if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
+    total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+
+  return total_group_bits;
+}
+
+// Calculate the number bits extra to assign to boosted frames in a group.
+static int calculate_boost_bits(int frame_count, int boost,
+                                int64_t total_group_bits) {
+  int allocation_chunks;
+
+  // return 0 for invalid inputs (could arise e.g. through rounding errors)
+  if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0;
+
+  allocation_chunks = (frame_count * 100) + boost;
+
+  // Prevent overflow.
+  if (boost > 1023) {
+    int divisor = boost >> 10;
+    boost /= divisor;
+    allocation_chunks /= divisor;
+  }
+
+  // Calculate the number of extra bits for use in the boosted frame or frames.
+  return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks),
+                0);
+}
+
+#define LEAF_REDUCTION_FACTOR 0.75
+static double lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = {
+  { 1.0, 0.0, 0.0 }, { 0.6, 0.4, 0 }, { 0.45, 0.35, 0.20 }
+};
+static void allocate_gf_group_bits(
+    AV1_COMP *cpi, int64_t gf_group_bits, double group_error, int gf_arf_bits,
+    const EncodeFrameParams *const frame_params) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  const int key_frame = (frame_params->frame_type == KEY_FRAME);
+  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+  int64_t total_group_bits = gf_group_bits;
+
+  // Check if GF group has any internal arfs.
+  int has_internal_arfs = 0;
+  for (int i = 0; i < gf_group->size; ++i) {
+    if (gf_group->update_type[i] == INTNL_ARF_UPDATE) {
+      has_internal_arfs = 1;
+      break;
+    }
+  }
+
+  // For key frames the frame target rate is already set and it
+  // is also the golden frame.
+  // === [frame_index == 0] ===
+  int frame_index = 0;
+  if (!key_frame) {
+    if (rc->source_alt_ref_active)
+      gf_group->bit_allocation[frame_index] = 0;
+    else
+      gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+    // Step over the golden frame / overlay frame
+    FIRSTPASS_STATS frame_stats;
+    if (EOF == input_stats(twopass, &frame_stats)) return;
+  }
+
+  // Deduct the boost bits for arf (or gf if it is not a key frame)
+  // from the group total.
+  if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
+
+  frame_index++;
+
+  // Store the bits to spend on the ARF if there is one.
+  // === [frame_index == 1] ===
+  if (rc->source_alt_ref_pending) {
+    gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+    ++frame_index;
+
+    // Skip all the internal ARFs right after ARF at the starting segment of
+    // the current GF group.
+    if (has_internal_arfs) {
+      while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) {
+        ++frame_index;
+      }
+    }
+  }
+
+  // Save.
+  const int tmp_frame_index = frame_index;
+  int budget_reduced_from_leaf_level = 0;
+
+  // Allocate bits to frames other than first frame, which is either a keyframe,
+  // overlay frame or golden frame.
+  const int normal_frames = rc->baseline_gf_interval - 1;
+
+  for (int i = 0; i < normal_frames; ++i) {
+    FIRSTPASS_STATS frame_stats;
+    if (EOF == input_stats(twopass, &frame_stats)) break;
+
+    const double modified_err =
+        calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
+    const double err_fraction =
+        (group_error > 0) ? modified_err / DOUBLE_DIVIDE_CHECK(group_error)
+                          : 0.0;
+    const int target_frame_size =
+        clamp((int)((double)total_group_bits * err_fraction), 0,
+              AOMMIN(max_bits, (int)total_group_bits));
+
+    if (gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
+      assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
+             "non-valid height for a pyramid structure");
+
+      const int arf_pos = gf_group->arf_pos_in_gf[frame_index];
+      gf_group->bit_allocation[frame_index] = 0;
+
+      gf_group->bit_allocation[arf_pos] = target_frame_size;
+      // Note: Boost, if needed, is added in the next loop.
+    } else {
+      assert(gf_group->update_type[frame_index] == LF_UPDATE);
+      gf_group->bit_allocation[frame_index] = target_frame_size;
+      if (has_internal_arfs) {
+        const int this_budget_reduction =
+            (int)(target_frame_size * LEAF_REDUCTION_FACTOR);
+        gf_group->bit_allocation[frame_index] -= this_budget_reduction;
+        budget_reduced_from_leaf_level += this_budget_reduction;
+      }
+    }
+
+    ++frame_index;
+
+    // Skip all the internal ARFs.
+    if (has_internal_arfs) {
+      while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+        ++frame_index;
+    }
+  }
+
+  if (budget_reduced_from_leaf_level > 0) {
+    assert(has_internal_arfs);
+    // Restore.
+    frame_index = tmp_frame_index;
+
+    // Re-distribute this extra budget to overlay frames in the group.
+    for (int i = 0; i < normal_frames; ++i) {
+      if (gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
+        assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
+               "non-valid height for a pyramid structure");
+        const int arf_pos = gf_group->arf_pos_in_gf[frame_index];
+        const int this_lvl = gf_group->pyramid_level[arf_pos];
+        const int dist2top = gf_group->pyramid_height - 1 - this_lvl;
+        const double lvl_boost_factor =
+            lvl_budget_factor[gf_group->pyramid_height - 2][dist2top];
+        const int extra_size =
+            (int)(budget_reduced_from_leaf_level * lvl_boost_factor /
+                  gf_group->pyramid_lvl_nodes[this_lvl]);
+        gf_group->bit_allocation[arf_pos] += extra_size;
+      }
+      ++frame_index;
+
+      // Skip all the internal ARFs.
+      if (has_internal_arfs) {
+        while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) {
+          ++frame_index;
+        }
+      }
+    }
+  }
+}
+
+// Given the maximum allowed height of the pyramid structure, return the fixed
+// GF length to be used.
+static INLINE int get_fixed_gf_length(int max_pyr_height) {
+  (void)max_pyr_height;
+  return MAX_GF_INTERVAL;
+}
+
+// Returns true if KF group and GF group both are almost completely static.
+static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) {
+  return (gf_zero_motion >= 0.995) &&
+         (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+}
+
+#define ARF_ABS_ZOOM_THRESH 4.4
+#define GROUP_ADAPTIVE_MAXQ 1
+#if GROUP_ADAPTIVE_MAXQ
+#define RC_FACTOR_MIN 0.75
+#define RC_FACTOR_MAX 1.75
+#endif  // GROUP_ADAPTIVE_MAXQ
+#define MIN_FWD_KF_INTERVAL 8
+
+// Analyse and define a gf/arf group.
+static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
+                            const EncodeFrameParams *const frame_params) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+  int i;
+
+  double boost_score = 0.0;
+  double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+  double gf_group_raw_error = 0.0;
+#endif
+  double gf_group_skip_pct = 0.0;
+  double gf_group_inactive_zone_rows = 0.0;
+  double gf_first_frame_err = 0.0;
+  double mod_frame_err = 0.0;
+
+  double mv_ratio_accumulator = 0.0;
+  double decay_accumulator = 1.0;
+  double zero_motion_accumulator = 1.0;
+
+  double loop_decay_rate = 1.00;
+  double last_loop_decay_rate = 1.00;
+
+  double this_frame_mv_in_out = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+
+  unsigned int allow_alt_ref = is_altref_enabled(cpi);
+
+  int f_boost = 0;
+  int b_boost = 0;
+  int flash_detected;
+  int64_t gf_group_bits;
+  double gf_group_error_left;
+  int gf_arf_bits;
+  const int is_intra_only = frame_params->frame_type == KEY_FRAME ||
+                            frame_params->frame_type == INTRA_ONLY_FRAME;
+  const int arf_active_or_kf = is_intra_only || rc->source_alt_ref_active;
+
+  cpi->internal_altref_allowed = (oxcf->gf_max_pyr_height > 1);
+
+  // Reset the GF group data structures unless this is a key
+  // frame in which case it will already have been done.
+  if (!is_intra_only) {
+    av1_zero(twopass->gf_group);
+  }
+
+  aom_clear_system_state();
+  av1_zero(next_frame);
+
+  // Load stats for the current frame.
+  mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+  // Note the error of the frame at the start of the group. This will be
+  // the GF frame error if we code a normal gf.
+  gf_first_frame_err = mod_frame_err;
+
+  // If this is a key frame or the overlay from a previous arf then
+  // the error score / cost of this frame has already been accounted for.
+  if (arf_active_or_kf) {
+    gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+    gf_group_raw_error -= this_frame->coded_error;
+#endif
+    gf_group_skip_pct -= this_frame->intra_skip_pct;
+    gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
+  }
+  // Motion breakout threshold for loop below depends on image size.
+  const double mv_ratio_accumulator_thresh =
+      (cpi->initial_height + cpi->initial_width) / 4.0;
+
+  // TODO(urvang): Try logic to vary min and max interval based on q.
+  const int active_min_gf_interval = rc->min_gf_interval;
+  const int active_max_gf_interval =
+      AOMMIN(rc->max_gf_interval, get_fixed_gf_length(oxcf->gf_max_pyr_height));
+
+  double avg_sr_coded_error = 0;
+  double avg_raw_err_stdev = 0;
+  int non_zero_stdev_count = 0;
+
+  i = 0;
+  while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
+    ++i;
+
+    // Accumulate error score of frames in this gf group.
+    mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+    gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+    gf_group_raw_error += this_frame->coded_error;
+#endif
+    gf_group_skip_pct += this_frame->intra_skip_pct;
+    gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
+
+    if (EOF == input_stats(twopass, &next_frame)) break;
+
+    // Test for the case where there is a brief flash but the prediction
+    // quality back to an earlier frame is then restored.
+    flash_detected = detect_flash(twopass, 0);
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(
+        &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+    // sum up the metric values of current gf group
+    avg_sr_coded_error += next_frame.sr_coded_error;
+    if (fabs(next_frame.raw_error_stdev) > 0.000001) {
+      non_zero_stdev_count++;
+      avg_raw_err_stdev += next_frame.raw_error_stdev;
+    }
+
+    // Accumulate the effect of prediction quality decay.
+    if (!flash_detected) {
+      last_loop_decay_rate = loop_decay_rate;
+      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+      decay_accumulator = decay_accumulator * loop_decay_rate;
+
+      // Monitor for static sections.
+      if ((rc->frames_since_key + i - 1) > 1) {
+        zero_motion_accumulator = AOMMIN(
+            zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+      }
+
+      // Break clause to detect very still sections after motion. For example,
+      // a static image after a fade or other transition.
+      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+                                     last_loop_decay_rate)) {
+        allow_alt_ref = 0;
+        break;
+      }
+    }
+
+    // Calculate a boost number for this frame.
+    boost_score +=
+        decay_accumulator *
+        calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+    // If almost totally static, we will not use the the max GF length later,
+    // so we can continue for more frames.
+    if ((i >= active_max_gf_interval + 1) &&
+        !is_almost_static(zero_motion_accumulator,
+                          twopass->kf_zeromotion_pct)) {
+      break;
+    }
+
+    // Some conditions to breakout after min interval.
+    if (i >= active_min_gf_interval &&
+        // If possible don't break very close to a kf
+        (rc->frames_to_key - i >= rc->min_gf_interval) && (i & 0x01) &&
+        !flash_detected &&
+        (mv_ratio_accumulator > mv_ratio_accumulator_thresh ||
+         abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) {
+      break;
+    }
+    *this_frame = next_frame;
+  }
+
+  // Was the group length constrained by the requirement for a new KF?
+  rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+                                                             : cpi->common.MBs;
+  assert(num_mbs > 0);
+  if (i) avg_sr_coded_error /= i;
+
+  if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
+
+  // Disable internal ARFs for "still" gf groups.
+  //   zero_motion_accumulator: minimum percentage of (0,0) motion;
+  //   avg_sr_coded_error:      average of the SSE per pixel of each frame;
+  //   avg_raw_err_stdev:       average of the standard deviation of (0,0)
+  //                            motion error per block of each frame.
+  if (zero_motion_accumulator > MIN_ZERO_MOTION &&
+      avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+      avg_raw_err_stdev < MAX_RAW_ERR_VAR) {
+    cpi->internal_altref_allowed = 0;
+  }
+
+  const int use_alt_ref =
+      !is_almost_static(zero_motion_accumulator, twopass->kf_zeromotion_pct) &&
+      allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
+      (i >= rc->min_gf_interval) &&
+      (cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+  int alt_offset = 0;
+  // The length reduction strategy is tweaked for certain cases, and doesn't
+  // work well for certain other cases.
+  const int allow_gf_length_reduction =
+      ((cpi->oxcf.rc_mode == AOM_Q && cpi->oxcf.cq_level <= 128) ||
+       !cpi->internal_altref_allowed) &&
+      !is_lossless_requested(&cpi->oxcf);
+
+  if (allow_gf_length_reduction && use_alt_ref) {
+    // adjust length of this gf group if one of the following condition met
+    // 1: only one overlay frame left and this gf is too long
+    // 2: next gf group is too short to have arf compared to the current gf
+
+    // maximum length of next gf group
+    const int next_gf_len = rc->frames_to_key - i;
+    const int single_overlay_left =
+        next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+    // the next gf is probably going to have a ARF but it will be shorter than
+    // this gf
+    const int unbalanced_gf =
+        i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+        next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+        next_gf_len + 1 >= rc->min_gf_interval;
+
+    if (single_overlay_left || unbalanced_gf) {
+      const int roll_back = REDUCE_GF_LENGTH_BY;
+      // Reduce length only if active_min_gf_interval will be respected later.
+      if (i - roll_back >= active_min_gf_interval + 1) {
+        alt_offset = -roll_back;
+        i -= roll_back;
+      }
+    }
+  }
+
+  // Should we use the alternate reference frame.
+  if (use_alt_ref) {
+    // Calculate the boost for alt ref.
+    rc->gfu_boost =
+        calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost);
+    rc->source_alt_ref_pending = 1;
+
+    // do not replace ARFs with overlay frames, and keep it as GOLDEN_REF
+    cpi->preserve_arf_as_gld = 1;
+  } else {
+    rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
+    rc->source_alt_ref_pending = 0;
+    cpi->preserve_arf_as_gld = 0;
+  }
+
+  // Set the interval until the next gf.
+  // If forward keyframes are enabled, ensure the final gf group obeys the
+  // MIN_FWD_KF_INTERVAL.
+  if (cpi->oxcf.fwd_kf_enabled &&
+      ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) {
+    if (i == rc->frames_to_key) {
+      rc->baseline_gf_interval = i;
+      // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
+    } else if ((rc->frames_to_key - i <
+                AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
+               (rc->frames_to_key != i)) {
+      // if possible, merge the last two gf groups
+      if (rc->frames_to_key <= active_max_gf_interval) {
+        rc->baseline_gf_interval = rc->frames_to_key;
+        // if merging the last two gf groups creates a group that is too long,
+        // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
+      } else {
+        rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
+      }
+    } else {
+      rc->baseline_gf_interval = i - rc->source_alt_ref_pending;
+    }
+  } else {
+    rc->baseline_gf_interval = i - rc->source_alt_ref_pending;
+  }
+
+#define LAST_ALR_BOOST_FACTOR 0.2f
+  rc->arf_boost_factor = 1.0;
+  if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
+    // Reduce the boost of altref in the last gf group
+    if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
+        rc->frames_to_key - i == 0) {
+      rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+    }
+  }
+
+  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
+
+  // Calculate the bits to be allocated to the gf/arf group as a whole
+  gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+
+#if GROUP_ADAPTIVE_MAXQ
+  // Calculate an estimate of the maxq needed for the group.
+  // We are more agressive about correcting for sections
+  // where there could be significant overshoot than for easier
+  // sections where we do not wish to risk creating an overshoot
+  // of the allocated bit budget.
+  if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
+    const int vbr_group_bits_per_frame =
+        (int)(gf_group_bits / rc->baseline_gf_interval);
+    const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
+    const double group_av_skip_pct =
+        gf_group_skip_pct / rc->baseline_gf_interval;
+    const double group_av_inactive_zone =
+        ((gf_group_inactive_zone_rows * 2) /
+         (rc->baseline_gf_interval * (double)cm->mb_rows));
+
+    int tmp_q;
+    // rc factor is a weight factor that corrects for local rate control drift.
+    double rc_factor = 1.0;
+    if (rc->rate_error_estimate > 0) {
+      rc_factor = AOMMAX(RC_FACTOR_MIN,
+                         (double)(100 - rc->rate_error_estimate) / 100.0);
+    } else {
+      rc_factor = AOMMIN(RC_FACTOR_MAX,
+                         (double)(100 - rc->rate_error_estimate) / 100.0);
+    }
+    tmp_q = get_twopass_worst_quality(
+        cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+        vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor);
+    twopass->active_worst_quality =
+        AOMMAX(tmp_q, twopass->active_worst_quality >> 1);
+  }
+#endif
+
+  // Calculate the extra bits to be used for boosted frame(s)
+  gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost,
+                                     gf_group_bits);
+
+  // Adjust KF group bits and error remaining.
+  twopass->kf_group_error_left -= (int64_t)gf_group_err;
+
+  // If this is an arf update we want to remove the score for the overlay
+  // frame at the end which will usually be very cheap to code.
+  // The overlay frame has already, in effect, been coded so we want to spread
+  // the remaining bits among the other frames.
+  // For normal GFs remove the score for the GF itself unless this is
+  // also a key frame in which case it has already been accounted for.
+  if (rc->source_alt_ref_pending) {
+    gf_group_error_left = gf_group_err - mod_frame_err;
+  } else if (!is_intra_only) {
+    gf_group_error_left = gf_group_err - gf_first_frame_err;
+  } else {
+    gf_group_error_left = gf_group_err;
+  }
+
+  // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+  av1_gop_setup_structure(cpi, frame_params);
+
+  // Allocate bits to each of the frames in the GF group.
+  allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits,
+                         frame_params);
+
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
+
+  // Calculate a section intra ratio used in setting max loop filter.
+  if (frame_params->frame_type != KEY_FRAME) {
+    twopass->section_intra_rating = calculate_section_intra_ratio(
+        start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
+  }
+}
+
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letter box
+// format clips with image padding.
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case even if the frame is not a scene cut coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvment in the intra inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+// We adapt the threshold based on number of frames in this key-frame group so
+// far.
+static double get_second_ref_usage_thresh(int frame_count_so_far) {
+  const int adapt_upto = 32;
+  const double min_second_ref_usage_thresh = 0.085;
+  const double second_ref_usage_thresh_max_delta = 0.035;
+  if (frame_count_so_far >= adapt_upto) {
+    return min_second_ref_usage_thresh + second_ref_usage_thresh_max_delta;
+  }
+  return min_second_ref_usage_thresh +
+         ((double)frame_count_so_far / (adapt_upto - 1)) *
+             second_ref_usage_thresh_max_delta;
+}
+
+static int test_candidate_kf(TWO_PASS *twopass,
+                             const FIRSTPASS_STATS *last_frame,
+                             const FIRSTPASS_STATS *this_frame,
+                             const FIRSTPASS_STATS *next_frame,
+                             int frame_count_so_far) {
+  int is_viable_kf = 0;
+  double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+  double modified_pcnt_inter =
+      this_frame->pcnt_inter - this_frame->pcnt_neutral;
+  const double second_ref_usage_thresh =
+      get_second_ref_usage_thresh(frame_count_so_far);
+
+  // Does the frame satisfy the primary criteria of a key frame?
+  // See above for an explanation of the test criteria.
+  // If so, then examine how well it predicts subsequent frames.
+  if ((this_frame->pcnt_second_ref < second_ref_usage_thresh) &&
+      (next_frame->pcnt_second_ref < second_ref_usage_thresh) &&
+      ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+       ((pcnt_intra > MIN_INTRA_LEVEL) &&
+        (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+        ((this_frame->intra_error /
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+         KF_II_ERR_THRESHOLD) &&
+        ((fabs(last_frame->coded_error - this_frame->coded_error) /
+              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+          ERR_CHANGE_THRESHOLD) ||
+         (fabs(last_frame->intra_error - this_frame->intra_error) /
+              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+          ERR_CHANGE_THRESHOLD) ||
+         ((next_frame->intra_error /
+           DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+          II_IMPROVEMENT_THRESHOLD))))) {
+    int i;
+    const FIRSTPASS_STATS *start_pos = twopass->stats_in;
+    FIRSTPASS_STATS local_next_frame = *next_frame;
+    double boost_score = 0.0;
+    double old_boost_score = 0.0;
+    double decay_accumulator = 1.0;
+
+    // Examine how well the key frame predicts subsequent frames.
+    for (i = 0; i < 16; ++i) {
+      double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
+                             DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+
+      if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
+
+      // Cumulative effect of decay in prediction quality.
+      if (local_next_frame.pcnt_inter > 0.85)
+        decay_accumulator *= local_next_frame.pcnt_inter;
+      else
+        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+
+      // Keep a running total.
+      boost_score += (decay_accumulator * next_iiratio);
+
+      // Test various breakout clauses.
+      if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+          (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
+            0.20) &&
+           (next_iiratio < 3.0)) ||
+          ((boost_score - old_boost_score) < 3.0) ||
+          (local_next_frame.intra_error < 200)) {
+        break;
+      }
+
+      old_boost_score = boost_score;
+
+      // Get the next frame details
+      if (EOF == input_stats(twopass, &local_next_frame)) break;
+    }
+
+    // If there is tolerable prediction for at least the next 3 frames then
+    // break out else discard this potential key frame and move on
+    if (boost_score > 30.0 && (i > 3)) {
+      is_viable_kf = 1;
+    } else {
+      // Reset the file position
+      reset_fpf_position(twopass, start_pos);
+
+      is_viable_kf = 0;
+    }
+  }
+
+  return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
+#define MIN_KF_BOOST 300          // Minimum boost for non-static KF interval
+#define MIN_STATIC_KF_BOOST 5400  // Minimum boost for static KF interval
+
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  int i, j;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const FIRSTPASS_STATS first_frame = *this_frame;
+  const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+  FIRSTPASS_STATS next_frame;
+  FIRSTPASS_STATS last_frame;
+  int kf_bits = 0;
+  int loop_decay_counter = 0;
+  double decay_accumulator = 1.0;
+  double av_decay_accumulator = 0.0;
+  double zero_motion_accumulator = 1.0;
+  double boost_score = 0.0;
+  double kf_mod_err = 0.0;
+  double kf_group_err = 0.0;
+  double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+
+  av1_zero(next_frame);
+
+  rc->frames_since_key = 0;
+
+  // Reset the GF group data structures.
+  av1_zero(*gf_group);
+
+  // Is this a forced key frame by interval.
+  rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+  // Clear the alt ref active flag and last group multi arf flags as they
+  // can never be set for a key frame.
+  rc->source_alt_ref_active = 0;
+
+  // KF is always a GF so clear frames till next gf counter.
+  rc->frames_till_gf_update_due = 0;
+
+  rc->frames_to_key = 1;
+
+  twopass->kf_group_bits = 0;        // Total bits available to kf group
+  twopass->kf_group_error_left = 0;  // Group modified error score.
+
+  kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+  // Initialize the decay rates for the recent frames to check
+  for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+
+  // Find the next keyframe.
+  i = 0;
+  while (twopass->stats_in < twopass->stats_in_end &&
+         rc->frames_to_key < cpi->oxcf.key_freq) {
+    // Accumulate kf group error.
+    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+    // Load the next frame's stats.
+    last_frame = *this_frame;
+    input_stats(twopass, this_frame);
+
+    // Provided that we are not at the end of the file...
+    if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
+      double loop_decay_rate;
+
+      // Check for a scene cut.
+      if (test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in,
+                            rc->frames_to_key))
+        break;
+
+      // How fast is the prediction quality decaying?
+      loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
+
+      // We want to know something about the recent past... rather than
+      // as used elsewhere where we are concerned with decay in prediction
+      // quality since the last GF or KF.
+      recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+      decay_accumulator = 1.0;
+      for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+        decay_accumulator *= recent_loop_decay[j];
+
+      // Special check for transition or high motion followed by a
+      // static scene.
+      if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
+                                     loop_decay_rate, decay_accumulator))
+        break;
+
+      // Step on to the next frame.
+      ++rc->frames_to_key;
+
+      // If we don't have a real key frame within the next two
+      // key_freq intervals then break out of the loop.
+      if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+    } else {
+      ++rc->frames_to_key;
+    }
+    ++i;
+  }
+
+  // If there is a max kf interval set by the user we must obey it.
+  // We already breakout of the loop above at 2x max.
+  // This code centers the extra kf if the actual natural interval
+  // is between 1x and 2x.
+  if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
+    FIRSTPASS_STATS tmp_frame = first_frame;
+
+    rc->frames_to_key /= 2;
+
+    // Reset to the start of the group.
+    reset_fpf_position(twopass, start_position);
+
+    kf_group_err = 0.0;
+
+    // Rescan to get the correct error data for the forced kf group.
+    for (i = 0; i < rc->frames_to_key; ++i) {
+      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
+      input_stats(twopass, &tmp_frame);
+    }
+    rc->next_key_frame_forced = 1;
+  } else if (twopass->stats_in == twopass->stats_in_end ||
+             rc->frames_to_key >= cpi->oxcf.key_freq) {
+    rc->next_key_frame_forced = 1;
+  } else {
+    rc->next_key_frame_forced = 0;
+  }
+
+  // Special case for the last key frame of the file.
+  if (twopass->stats_in >= twopass->stats_in_end) {
+    // Accumulate kf group error.
+    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+  }
+
+  // Calculate the number of bits that should be assigned to the kf group.
+  if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+    // Maximum number of bits for a single normal frame (not key frame).
+    const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+
+    // Maximum number of bits allocated to the key frame group.
+    int64_t max_grp_bits;
+
+    // Default allocation based on bits left and relative
+    // complexity of the section.
+    twopass->kf_group_bits = (int64_t)(
+        twopass->bits_left * (kf_group_err / twopass->modified_error_left));
+
+    // Clip based on maximum per frame rate defined by the user.
+    max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+    if (twopass->kf_group_bits > max_grp_bits)
+      twopass->kf_group_bits = max_grp_bits;
+  } else {
+    twopass->kf_group_bits = 0;
+  }
+  twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
+
+  // Reset the first pass file position.
+  reset_fpf_position(twopass, start_position);
+
+  // Scan through the kf group collating various stats used to determine
+  // how many bits to spend on it.
+  decay_accumulator = 1.0;
+  boost_score = 0.0;
+  const double kf_max_boost =
+      cpi->oxcf.rc_mode == AOM_Q
+          ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+                   KF_MAX_FRAME_BOOST)
+          : KF_MAX_FRAME_BOOST;
+  for (i = 0; i < (rc->frames_to_key - 1); ++i) {
+    if (EOF == input_stats(twopass, &next_frame)) break;
+
+    // Monitor for static sections.
+    // For the first frame in kf group, the second ref indicator is invalid.
+    if (i > 0) {
+      zero_motion_accumulator = AOMMIN(
+          zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+    } else {
+      zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion;
+    }
+
+    // Not all frames in the group are necessarily used in calculating boost.
+    if ((i <= rc->max_gf_interval) ||
+        ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
+      const double frame_boost =
+          calc_frame_boost(cpi, this_frame, 0, kf_max_boost);
+
+      // How fast is prediction quality decaying.
+      if (!detect_flash(twopass, 0)) {
+        const double loop_decay_rate =
+            get_prediction_decay_rate(cpi, &next_frame);
+        decay_accumulator *= loop_decay_rate;
+        decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR);
+        av_decay_accumulator += decay_accumulator;
+        ++loop_decay_counter;
+      }
+      boost_score += (decay_accumulator * frame_boost);
+    }
+  }
+  if (loop_decay_counter > 0)
+    av_decay_accumulator /= (double)loop_decay_counter;
+
+  reset_fpf_position(twopass, start_position);
+
+  // Store the zero motion percentage
+  twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+  // Calculate a section intra ratio used in setting max loop filter.
+  twopass->section_intra_rating = calculate_section_intra_ratio(
+      start_position, twopass->stats_in_end, rc->frames_to_key);
+
+  rc->kf_boost = (int)(av_decay_accumulator * boost_score);
+
+  // Special case for static / slide show content but don't apply
+  // if the kf group is very short.
+  if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) &&
+      (rc->frames_to_key > 8)) {
+    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST);
+  } else {
+    // Apply various clamps for min and max boost
+    rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
+    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
+  }
+
+  // Work out how many bits to allocate for the key frame itself.
+  kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
+                                 twopass->kf_group_bits);
+  // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
+  //        kf_bits, twopass->kf_zeromotion_pct);
+
+  // Work out the fraction of the kf group bits reserved for the inter frames
+  // within the group after discounting the bits for the kf itself.
+  if (twopass->kf_group_bits) {
+    twopass->kfgroup_inter_fraction =
+        (double)(twopass->kf_group_bits - kf_bits) /
+        (double)twopass->kf_group_bits;
+  } else {
+    twopass->kfgroup_inter_fraction = 1.0;
+  }
+
+  twopass->kf_group_bits -= kf_bits;
+
+  // Save the bits to spend on the key frame.
+  gf_group->bit_allocation[0] = kf_bits;
+  gf_group->update_type[0] = KF_UPDATE;
+
+  // Note the total error score of the kf group minus the key frame itself.
+  twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+  // Adjust the count of total modified error left.
+  // The count of bits left is adjusted elsewhere based on real coded frame
+  // sizes.
+  twopass->modified_error_left -= kf_group_err;
+}
+
+static int is_skippable_frame(const AV1_COMP *cpi) {
+  // If the current frame does not have non-zero motion vector detected in the
+  // first  pass, and so do its previous and forward frames, then this frame
+  // can be skipped for partition check, and the partition size is assigned
+  // according to the variance
+  const TWO_PASS *const twopass = &cpi->twopass;
+
+  return (!frame_is_intra_only(&cpi->common) &&
+          twopass->stats_in - 2 > twopass->stats_in_start &&
+          twopass->stats_in < twopass->stats_in_end &&
+          (twopass->stats_in - 1)->pcnt_inter -
+                  (twopass->stats_in - 1)->pcnt_motion ==
+              1 &&
+          (twopass->stats_in - 2)->pcnt_inter -
+                  (twopass->stats_in - 2)->pcnt_motion ==
+              1 &&
+          twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
+
+#define ARF_STATS_OUTPUT 0
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+#define DEFAULT_GRP_WEIGHT 1.0
+
+void av1_get_second_pass_params(AV1_COMP *cpi,
+                                EncodeFrameParams *const frame_params,
+                                unsigned int frame_flags) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  int frames_left;
+  FIRSTPASS_STATS this_frame;
+
+  int target_rate;
+
+  frames_left = (int)(twopass->total_stats.count - current_frame->frame_number);
+
+  if (!twopass->stats_in) return;
+
+  // If this is an arf frame then we dont want to read the stats file or
+  // advance the input pointer as we already have what we need.
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+    target_rate = gf_group->bit_allocation[gf_group->index];
+    target_rate = av1_rc_clamp_pframe_target_size(
+        cpi, target_rate, gf_group->update_type[gf_group->index]);
+    rc->base_frame_target = target_rate;
+
+    if (cpi->no_show_kf) {
+      assert(gf_group->update_type[gf_group->index] == ARF_UPDATE);
+      frame_params->frame_type = KEY_FRAME;
+    } else {
+      frame_params->frame_type = INTER_FRAME;
+    }
+
+    // Do the firstpass stats indicate that this frame is skippable for the
+    // partition search?
+    if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+      cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+    }
+
+    return;
+  }
+
+  aom_clear_system_state();
+
+  if (cpi->oxcf.rc_mode == AOM_Q) {
+    twopass->active_worst_quality = cpi->oxcf.cq_level;
+  } else if (current_frame->frame_number == 0) {
+    // Special case code for first frame.
+    const int section_target_bandwidth =
+        (int)(twopass->bits_left / frames_left);
+    const double section_length = twopass->total_left_stats.count;
+    const double section_error =
+        twopass->total_left_stats.coded_error / section_length;
+    const double section_intra_skip =
+        twopass->total_left_stats.intra_skip_pct / section_length;
+    const double section_inactive_zone =
+        (twopass->total_left_stats.inactive_zone_rows * 2) /
+        ((double)cm->mb_rows * section_length);
+    const int tmp_q = get_twopass_worst_quality(
+        cpi, section_error, section_intra_skip + section_inactive_zone,
+        section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
+    twopass->active_worst_quality = tmp_q;
+    twopass->baseline_active_worst_quality = tmp_q;
+    rc->ni_av_qi = tmp_q;
+    rc->last_q[INTER_FRAME] = tmp_q;
+    rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
+    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+  }
+
+  av1_zero(this_frame);
+  if (EOF == input_stats(twopass, &this_frame)) return;
+
+  // Set the frame content type flag.
+  if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
+    twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+  else
+    twopass->fr_content_type = FC_NORMAL;
+
+  // Keyframe and section processing.
+  if (rc->frames_to_key == 0 || (frame_flags & FRAMEFLAGS_KEY)) {
+    FIRSTPASS_STATS this_frame_copy;
+    this_frame_copy = this_frame;
+    frame_params->frame_type = KEY_FRAME;
+    // Define next KF group and assign bits to it.
+    find_next_key_frame(cpi, &this_frame);
+    this_frame = this_frame_copy;
+  } else {
+    frame_params->frame_type = INTER_FRAME;
+  }
+
+  // Define a new GF/ARF group. (Should always enter here for key frames).
+  if (rc->frames_till_gf_update_due == 0) {
+    define_gf_group(cpi, &this_frame, frame_params);
+
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+#if ARF_STATS_OUTPUT
+    {
+      FILE *fpfile;
+      fpfile = fopen("arf.stt", "a");
+      ++arf_count;
+      fprintf(fpfile, "%10d %10d %10d %10d %10d\n", current_frame->frame_number,
+              rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
+              rc->gfu_boost);
+
+      fclose(fpfile);
+    }
+#endif
+  }
+
+  // Do the firstpass stats indicate that this frame is skippable for the
+  // partition search?
+  if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+  }
+
+  target_rate = gf_group->bit_allocation[gf_group->index];
+
+  if (frame_params->frame_type == KEY_FRAME) {
+    target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate);
+  } else {
+    target_rate = av1_rc_clamp_pframe_target_size(
+        cpi, target_rate, gf_group->update_type[gf_group->index]);
+  }
+
+  rc->base_frame_target = target_rate;
+
+  {
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                            ? cpi->initial_mbs
+                            : cpi->common.MBs;
+    // The multiplication by 256 reverses a scaling factor of (>> 8)
+    // applied when combining MB error values for the frame.
+    twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0);
+    twopass->frame_avg_haar_energy =
+        log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0);
+  }
+
+  // Update the total stats remaining structure.
+  subtract_stats(&twopass->total_left_stats, &this_frame);
+}
+
+void av1_init_second_pass(AV1_COMP *cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  double frame_rate;
+  FIRSTPASS_STATS *stats;
+
+  av1_twopass_zero_stats(&twopass->total_stats);
+  av1_twopass_zero_stats(&twopass->total_left_stats);
+
+  if (!twopass->stats_in_end) return;
+
+  stats = &twopass->total_stats;
+
+  *stats = *twopass->stats_in_end;
+  twopass->total_left_stats = *stats;
+
+  frame_rate = 10000000.0 * stats->count / stats->duration;
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However, the sum duration is not.
+  // It is calculated based on the actual durations of all frames from the
+  // first pass.
+  av1_new_framerate(cpi, frame_rate);
+  twopass->bits_left =
+      (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
+
+  // This variable monitors how far behind the second ref update is lagging.
+  twopass->sr_update_lag = 1;
+
+  // Scan the first pass file and calculate a modified total error based upon
+  // the bias/power function used to allocate bits.
+  {
+    const double avg_error =
+        stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
+    const FIRSTPASS_STATS *s = twopass->stats_in;
+    double modified_error_total = 0.0;
+    twopass->modified_error_min =
+        (avg_error * oxcf->two_pass_vbrmin_section) / 100;
+    twopass->modified_error_max =
+        (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+    while (s < twopass->stats_in_end) {
+      modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
+      ++s;
+    }
+    twopass->modified_error_left = modified_error_total;
+  }
+
+  // Reset the vbr bits off target counters
+  cpi->rc.vbr_bits_off_target = 0;
+  cpi->rc.vbr_bits_off_target_fast = 0;
+
+  cpi->rc.rate_error_estimate = 0;
+
+  // Static sequence monitor variables.
+  twopass->kf_zeromotion_pct = 100;
+  twopass->last_kfgroup_zeromotion_pct = 100;
+}
+
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
+void av1_twopass_postencode_update(AV1_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->twopass;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int bits_used = rc->base_frame_target;
+
+  // VBR correction is done through rc->vbr_bits_off_target. Based on the
+  // sign of this value, a limited % adjustment is made to the target rate
+  // of subsequent frames, to try and push it back towards 0. This method
+  // is designed to prevent extreme behaviour at the end of a clip
+  // or group of frames.
+  rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+  twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0);
+
+  // Calculate the pct rc error.
+  if (rc->total_actual_bits) {
+    rc->rate_error_estimate =
+        (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
+    rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+  } else {
+    rc->rate_error_estimate = 0;
+  }
+
+  if (cpi->common.current_frame.frame_type != KEY_FRAME) {
+    twopass->kf_group_bits -= bits_used;
+    twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+  }
+  twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
+
+  // If the rate control is drifting consider adjustment to min or maxq.
+  if ((cpi->oxcf.rc_mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
+    const int maxq_adj_limit =
+        rc->worst_quality - twopass->active_worst_quality;
+    const int minq_adj_limit =
+        (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+
+    // Undershoot.
+    if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+      --twopass->extend_maxq;
+      if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+        ++twopass->extend_minq;
+      // Overshoot.
+    } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+      --twopass->extend_minq;
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        ++twopass->extend_maxq;
+    } else {
+      // Adjustment for extreme local overshoot.
+      if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+          rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+        ++twopass->extend_maxq;
+
+      // Unwind undershoot or overshoot adjustment.
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        --twopass->extend_minq;
+      else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+        --twopass->extend_maxq;
+    }
+
+    twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
+    twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+    // If there is a big and undexpected undershoot then feed the extra
+    // bits back in quickly. One situation where this may happen is if a
+    // frame is unexpectedly almost perfectly predicted by the ARF or GF
+    // but not very well predcited by the previous frame.
+    if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+      int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+      if (rc->projected_frame_size < fast_extra_thresh) {
+        rc->vbr_bits_off_target_fast +=
+            fast_extra_thresh - rc->projected_frame_size;
+        rc->vbr_bits_off_target_fast =
+            AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+        // Fast adaptation of minQ if necessary to use up the extra bits.
+        if (rc->avg_frame_bandwidth) {
+          twopass->extend_minq_fast =
+              (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+        }
+        twopass->extend_minq_fast = AOMMIN(
+            twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+      } else if (rc->vbr_bits_off_target_fast) {
+        twopass->extend_minq_fast = AOMMIN(
+            twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+      } else {
+        twopass->extend_minq_fast = 0;
+      }
+    }
+  }
+}
diff --git a/libaom/av1/encoder/pass2_strategy.h b/libaom/av1/encoder/pass2_strategy.h
new file mode 100644
index 0000000..bf37746
--- /dev/null
+++ b/libaom/av1/encoder/pass2_strategy.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PASS2_STRATEGY_H_
+#define AOM_AV1_ENCODER_PASS2_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+void av1_init_second_pass(struct AV1_COMP *cpi);
+
+void av1_get_second_pass_params(struct AV1_COMP *cpi,
+                                struct EncodeFrameParams *const frame_params,
+                                unsigned int frame_flags);
+
+void av1_twopass_postencode_update(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_PASS2_STRATEGY_H_
diff --git a/libaom/av1/encoder/picklpf.c b/libaom/av1/encoder/picklpf.c
index b6b84c8..aca089c 100644
--- a/libaom/av1/encoder/picklpf.c
+++ b/libaom/av1/encoder/picklpf.c
@@ -70,24 +70,24 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
   // TODO(any): please enable multi-thread and remove the flag when loop
   // filter mask is compatible with multi-thread.
   if (cpi->num_workers > 1)
-    av1_loop_filter_frame_mt(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+    av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane,
                              plane + 1, partial_frame,
 #if LOOP_FILTER_BITMASK
                              0,
 #endif
                              cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
   else
-    av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd,
+    av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd,
 #if LOOP_FILTER_BITMASK
                           0,
 #endif
                           plane, plane + 1, partial_frame);
 
-  filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane,
+  filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane,
                                cm->seq_params.use_highbitdepth);
 
   // Re-instate the unfiltered frame
-  yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane);
+  yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane);
 
   return filt_err;
 }
@@ -108,7 +108,17 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
   // range.
   int lvl;
   switch (plane) {
-    case 0: lvl = last_frame_filter_level[dir]; break;
+    case 0:
+      switch (dir) {
+        case 2:
+          lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >>
+                1;
+          break;
+        case 0:
+        case 1: lvl = last_frame_filter_level[dir]; break;
+        default: assert(dir >= 0 && dir <= 2); return 0;
+      }
+      break;
     case 1: lvl = last_frame_filter_level[2]; break;
     case 2: lvl = last_frame_filter_level[3]; break;
     default: assert(plane >= 0 && plane <= 2); return 0;
@@ -120,7 +130,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
 
   // Set each entry to -1
   memset(ss_err, 0xFF, sizeof(ss_err));
-  yv12_copy_plane(cm->frame_to_show, &cpi->last_frame_uf, plane);
+  yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane);
   best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir);
   filt_best = filt_mid;
   ss_err[filt_mid] = best_err;
@@ -203,19 +213,25 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     const int min_filter_level = 0;
     const int max_filter_level = av1_get_max_filter_level(cpi);
     const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth);
+    // based on tests result for rtc test set
+    // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point
+    const int strength_boost_q_treshold = 700;
+    const int inter_frame_multiplier =
+        q > strength_boost_q_treshold ? 12034 : 6017;
     // These values were determined by linear fitting the result of the
     // searched level for 8 bit depth:
     // Keyframes: filt_guess = q * 0.06699 - 1.60817
-    // Other frames: filt_guess = q * 0.02295 + 2.48225
+    // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225
     //
     // And high bit depth separately:
     // filt_guess = q * 0.316206 + 3.87252
     int filt_guess;
     switch (cm->seq_params.bit_depth) {
       case AOM_BITS_8:
-        filt_guess = (cm->current_frame.frame_type == KEY_FRAME)
-                         ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
-                         : ROUND_POWER_OF_TWO(q * 6017 + 650707, 18);
+        filt_guess =
+            (cm->current_frame.frame_type == KEY_FRAME)
+                ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
+                : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18);
         break;
       case AOM_BITS_10:
         filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
diff --git a/libaom/av1/encoder/pickrst.c b/libaom/av1/encoder/pickrst.c
index a7fab16..1b4f26c 100644
--- a/libaom/av1/encoder/pickrst.c
+++ b/libaom/av1/encoder/pickrst.c
@@ -140,7 +140,7 @@ static void init_rsc(const YV12_BUFFER_CONFIG *src, const AV1_COMMON *cm,
   rsc->rusi = rusi;
   rsc->sf = sf;
 
-  const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+  const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
   const int is_uv = plane != AOM_PLANE_Y;
   rsc->plane_width = src->crop_widths[is_uv];
   rsc->plane_height = src->crop_heights[is_uv];
@@ -165,7 +165,7 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
   const int bit_depth = cm->seq_params.bit_depth;
   const int highbd = cm->seq_params.use_highbitdepth;
 
-  const YV12_BUFFER_CONFIG *fts = cm->frame_to_show;
+  const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf;
   // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be
   // also used in encoder.
   const int optimized_lr = 0;
@@ -200,7 +200,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
         v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
         const int32_t e =
             ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -216,7 +216,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
         v += xq[0] * (flt0[j] - u);
         const int32_t e =
             ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -231,7 +231,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
         v += xq[1] * (flt1[j] - u);
         const int32_t e =
             ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -241,7 +241,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
     for (i = 0; i < height; ++i) {
       for (j = 0; j < width; ++j) {
         const int32_t e = (int32_t)(dat[j]) - src[j];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -276,7 +276,7 @@ int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width,
         v += xq0 * v0;
         v += xq1 * v1;
         const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       flt0 += flt0_stride;
@@ -304,7 +304,7 @@ int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width,
         int32_t v = half;
         v += exq * (flt[j] - u);
         const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       flt += flt_stride;
@@ -316,7 +316,7 @@ int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width,
         const int32_t d = dat[j];
         const int32_t s = src[j];
         const int32_t e = d - s;
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -1281,7 +1281,7 @@ static void search_norestore(const RestorationTileLimits *limits,
 
   const int highbd = rsc->cm->seq_params.use_highbitdepth;
   rusi->sse[RESTORE_NONE] = sse_restoration_unit(
-      limits, rsc->src, rsc->cm->frame_to_show, rsc->plane, highbd);
+      limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd);
 
   rsc->sse += rusi->sse[RESTORE_NONE];
 }
@@ -1413,20 +1413,22 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
     RestorationType best_rtype = RESTORE_NONE;
 
     const int highbd = rsc.cm->seq_params.use_highbitdepth;
-    extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
-                 rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
-                 highbd);
+    if (!cpi->sf.disable_loop_restoration_chroma || !plane) {
+      extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
+                   rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
+                   highbd);
 
-    for (RestorationType r = 0; r < num_rtypes; ++r) {
-      if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) &&
-          (r != force_restore_type))
-        continue;
+      for (RestorationType r = 0; r < num_rtypes; ++r) {
+        if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) &&
+            (r != force_restore_type))
+          continue;
 
-      double cost = search_rest_type(&rsc, r);
+        double cost = search_rest_type(&rsc, r);
 
-      if (r == 0 || cost < best_cost) {
-        best_cost = cost;
-        best_rtype = r;
+        if (r == 0 || cost < best_cost) {
+          best_cost = cost;
+          best_rtype = r;
+        }
       }
     }
 
diff --git a/libaom/av1/encoder/ratectrl.c b/libaom/av1/encoder/ratectrl.c
index 21632c0..861c737 100644
--- a/libaom/av1/encoder/ratectrl.c
+++ b/libaom/av1/encoder/ratectrl.c
@@ -29,6 +29,8 @@
 #include "av1/common/seg_common.h"
 
 #include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/gop_structure.h"
 #include "av1/encoder/random.h"
 #include "av1/encoder/ratectrl.h"
 
@@ -96,18 +98,13 @@ static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) {
 // fit to the original data (after plotting real maxq vs minq (not q index))
 static int get_minq_index(double maxq, double x3, double x2, double x1,
                           aom_bit_depth_t bit_depth) {
-  int i;
   const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
 
   // Special case handling to deal with the step from q2.0
   // down to lossless mode represented by q 1.0.
   if (minqtarget <= 2.0) return 0;
 
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (minqtarget <= av1_convert_qindex_to_q(i, bit_depth)) return i;
-  }
-
-  return QINDEX_RANGE - 1;
+  return av1_find_qindex(minqtarget, bit_depth, 0, QINDEX_RANGE - 1);
 }
 
 static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
@@ -174,13 +171,15 @@ int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
                 (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
 }
 
-int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target) {
+int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target,
+                                    FRAME_UPDATE_TYPE frame_update_type) {
   const RATE_CONTROL *rc = &cpi->rc;
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
   const int min_frame_target =
       AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
   // Clip the frame target to the minimum setup value.
-  if (cpi->rc.is_src_frame_alt_ref) {
+  if (frame_update_type == OVERLAY_UPDATE ||
+      frame_update_type == INTNL_OVERLAY_UPDATE) {
     // If there is an active ARF at this location use the minimum
     // bits on this frame even if it is a constructed arf.
     // The active maximum quantizer insures that an appropriate
@@ -219,9 +218,7 @@ static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
   RATE_CONTROL *const rc = &cpi->rc;
 
   // Non-viewable frames are a special case and are treated as pure overhead.
-  // TODO(zoeliu): To further explore whether we should treat BWDREF_FRAME
-  //               differently, since it is a no-show frame.
-  if (!cm->show_frame && !rc->is_bwd_ref_frame)
+  if (!cm->show_frame)
     rc->bits_off_target -= encoded_frame_size;
   else
     rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
@@ -253,9 +250,7 @@ int av1_rc_get_default_min_gf_interval(int width, int height,
 int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
   int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
   interval += (interval & 0x01);  // Round to even value
-#if CONFIG_FIX_GF_LENGTH
-  interval = AOMMAX(FIXED_GF_LENGTH, interval);
-#endif
+  interval = AOMMAX(MAX_GF_INTERVAL, interval);
   return AOMMAX(interval, min_gf_interval);
 }
 
@@ -352,6 +347,22 @@ int av1_rc_drop_frame(AV1_COMP *cpi) {
   }
 }
 
+static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = {
+  KF_STD,        // KF_UPDATE
+  INTER_NORMAL,  // LF_UPDATE
+  GF_ARF_STD,    // GF_UPDATE
+  GF_ARF_STD,    // ARF_UPDATE
+  INTER_NORMAL,  // OVERLAY_UPDATE
+  INTER_NORMAL,  // INTNL_OVERLAY_UPDATE
+  GF_ARF_LOW,    // INTNL_ARF_UPDATE
+};
+
+static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group) {
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+  assert(update_type < FRAME_UPDATE_TYPES);
+  return rate_factor_levels[update_type];
+}
+
 static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
                                          int height) {
   const RATE_CONTROL *const rc = &cpi->rc;
@@ -360,8 +371,8 @@ static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
   if (cpi->common.current_frame.frame_type == KEY_FRAME) {
     rcf = rc->rate_correction_factors[KF_STD];
   } else if (cpi->oxcf.pass == 2) {
-    RATE_FACTOR_LEVEL rf_lvl =
-        cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+    const RATE_FACTOR_LEVEL rf_lvl =
+        get_rate_factor_level(&cpi->twopass.gf_group);
     rcf = rc->rate_correction_factors[rf_lvl];
   } else {
     if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
@@ -387,8 +398,8 @@ static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width,
   if (cpi->common.current_frame.frame_type == KEY_FRAME) {
     rc->rate_correction_factors[KF_STD] = factor;
   } else if (cpi->oxcf.pass == 2) {
-    RATE_FACTOR_LEVEL rf_lvl =
-        cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+    const RATE_FACTOR_LEVEL rf_lvl =
+        get_rate_factor_level(&cpi->twopass.gf_group);
     rc->rate_correction_factors[rf_lvl] = factor;
   } else {
     if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
@@ -474,45 +485,82 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width,
   set_rate_correction_factor(cpi, rate_correction_factor, width, height);
 }
 
+// Calculate rate for the given 'q'.
+static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh,
+                           double correction_factor, int q) {
+  const AV1_COMMON *const cm = &cpi->common;
+  return use_cyclic_refresh
+             ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor)
+             : av1_rc_bits_per_mb(cm->current_frame.frame_type, q,
+                                  correction_factor, cm->seq_params.bit_depth);
+}
+
+// Similar to find_qindex_by_rate() function in ratectrl.c, but returns the q
+// index with rate just above or below the desired rate, depending on which of
+// the two rates is closer to the desired rate.
+// Also, respects the selected aq_mode when computing the rate.
+static int find_closest_qindex_by_rate(int desired_bits_per_mb,
+                                       const AV1_COMP *cpi,
+                                       double correction_factor,
+                                       int best_qindex, int worst_qindex) {
+  const int use_cyclic_refresh =
+      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled;
+
+  // Find 'qindex' based on 'desired_bits_per_mb'.
+  assert(best_qindex <= worst_qindex);
+  int low = best_qindex;
+  int high = worst_qindex;
+  while (low < high) {
+    const int mid = (low + high) >> 1;
+    const int mid_bits_per_mb =
+        get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, mid);
+    if (mid_bits_per_mb > desired_bits_per_mb) {
+      low = mid + 1;
+    } else {
+      high = mid;
+    }
+  }
+  assert(low == high);
+
+  // Calculate rate difference of this q index from the desired rate.
+  const int curr_q = low;
+  const int curr_bits_per_mb =
+      get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, curr_q);
+  const int curr_bit_diff = (curr_bits_per_mb <= desired_bits_per_mb)
+                                ? desired_bits_per_mb - curr_bits_per_mb
+                                : INT_MAX;
+  assert((curr_bit_diff != INT_MAX && curr_bit_diff >= 0) ||
+         curr_q == worst_qindex);
+
+  // Calculate rate difference for previous q index too.
+  const int prev_q = curr_q - 1;
+  int prev_bit_diff;
+  if (curr_bit_diff == INT_MAX || curr_q == best_qindex) {
+    prev_bit_diff = INT_MAX;
+  } else {
+    const int prev_bits_per_mb =
+        get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, prev_q);
+    assert(prev_bits_per_mb > desired_bits_per_mb);
+    prev_bit_diff = prev_bits_per_mb - desired_bits_per_mb;
+  }
+
+  // Pick one of the two q indices, depending on which one has rate closer to
+  // the desired rate.
+  return (curr_bit_diff <= prev_bit_diff) ? curr_q : prev_q;
+}
+
 int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
                       int active_best_quality, int active_worst_quality,
                       int width, int height) {
-  const AV1_COMMON *const cm = &cpi->common;
-  int q = active_worst_quality;
-  int last_error = INT_MAX;
-  int i, target_bits_per_mb, bits_per_mb_at_this_q;
   const int MBs = av1_get_MBs(width, height);
   const double correction_factor =
       get_rate_correction_factor(cpi, width, height);
-
-  // Calculate required scaling factor based on target frame size and size of
-  // frame produced using previous Q.
-  target_bits_per_mb =
+  const int target_bits_per_mb =
       (int)((uint64_t)(target_bits_per_frame) << BPER_MB_NORMBITS) / MBs;
 
-  i = active_best_quality;
-
-  do {
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
-      bits_per_mb_at_this_q =
-          (int)av1_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
-    } else {
-      bits_per_mb_at_this_q =
-          (int)av1_rc_bits_per_mb(cm->current_frame.frame_type, i,
-                                  correction_factor, cm->seq_params.bit_depth);
-    }
-
-    if (bits_per_mb_at_this_q <= target_bits_per_mb) {
-      if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
-        q = i;
-      else
-        q = i - 1;
-
-      break;
-    } else {
-      last_error = bits_per_mb_at_this_q - target_bits_per_mb;
-    }
-  } while (++i <= active_worst_quality);
+  int q =
+      find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor,
+                                  active_best_quality, active_worst_quality);
 
   // In CBR mode, this makes sure q is between oscillating Qs to prevent
   // resonance.
@@ -560,13 +608,11 @@ static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
                             arfgf_low_motion_minq, arfgf_high_motion_minq);
 }
 
-#if REDUCE_LAST_ALT_BOOST
 static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
   int *arfgf_high_motion_minq;
   ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
   return arfgf_high_motion_minq[q];
 }
-#endif
 
 static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
@@ -758,10 +804,28 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
   return q;
 }
 
+static int gf_group_pyramid_level(const AV1_COMP *cpi) {
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  int this_height = gf_group->pyramid_level[gf_group->index];
+  return this_height;
+}
+
 static int get_active_cq_level(const RATE_CONTROL *rc,
-                               const AV1EncoderConfig *const oxcf) {
+                               const AV1EncoderConfig *const oxcf,
+                               int intra_only, int superres_denom) {
   static const double cq_adjust_threshold = 0.1;
   int active_cq_level = oxcf->cq_level;
+  (void)intra_only;
+  if (oxcf->rc_mode == AOM_CQ || oxcf->rc_mode == AOM_Q) {
+    // printf("Superres %d %d %d = %d\n", superres_denom, intra_only,
+    //        rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1));
+    if (oxcf->superres_mode == SUPERRES_QTHRESH &&
+        superres_denom != SCALE_NUMERATOR &&
+        !(intra_only && rc->frames_to_key <= 1)) {
+      active_cq_level =
+          AOMMAX(active_cq_level - ((superres_denom - SCALE_NUMERATOR) * 4), 0);
+    }
+  }
   if (oxcf->rc_mode == AOM_CQ && rc->total_target_bits > 0) {
     const double x = (double)rc->total_actual_bits / rc->total_target_bits;
     if (x < cq_adjust_threshold) {
@@ -778,7 +842,8 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
   const RATE_CONTROL *const rc = &cpi->rc;
   const CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const int cq_level = get_active_cq_level(rc, oxcf);
+  const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm),
+                                           cm->superres_scale_denominator);
   int active_best_quality;
   int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
   int q;
@@ -920,15 +985,20 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
   return q;
 }
 
-int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
-  static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = {
-    INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME
-  };
-  const AV1_COMMON *const cm = &cpi->common;
-  int qdelta = av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
-                                          rate_factor_deltas[rf_level],
-                                          cm->seq_params.bit_depth);
-  return qdelta;
+static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+  1.00,  // INTER_NORMAL
+  1.25,  // GF_ARF_LOW
+  2.00,  // GF_ARF_STD
+  2.00,  // KF_STD
+};
+
+int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) {
+  const RATE_FACTOR_LEVEL rf_lvl =
+      get_rate_factor_level(&cpi->twopass.gf_group);
+  const FRAME_TYPE frame_type = (rf_lvl == KF_STD) ? KEY_FRAME : INTER_FRAME;
+  return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q,
+                                    rate_factor_deltas[rf_lvl],
+                                    cpi->common.seq_params.bit_depth);
 }
 
 #define STATIC_MOTION_THRESH 95
@@ -939,7 +1009,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-  const int cq_level = get_active_cq_level(rc, oxcf);
+  const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm),
+                                           cm->superres_scale_denominator);
   int active_best_quality;
   int active_worst_quality = cpi->twopass.active_worst_quality;
   int q;
@@ -947,12 +1018,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
   const int bit_depth = cm->seq_params.bit_depth;
   ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
 
-#if CUSTOMIZED_GF
   const int is_intrl_arf_boost =
       gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
-#else
-  const int is_intrl_arf_boost = cpi->refresh_alt2_ref_frame;
-#endif  // CUSTOMIZED_GF
 
   if (frame_is_intra_only(cm)) {
     if (rc->frames_to_key == 1 && oxcf->rc_mode == AOM_Q) {
@@ -961,6 +1028,18 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
       // as q.
       active_best_quality = cq_level;
       active_worst_quality = cq_level;
+    } else if (cm->current_frame.frame_type == KEY_FRAME &&
+               cm->show_frame == 0) {
+      // Handle the special case for forward reference key frames.
+      // Increase the boost because this keyframe is used as a forward and
+      // backward reference.
+      const int qindex = rc->last_boosted_qindex;
+      const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+      const int delta_qindex = av1_compute_qdelta(
+          rc, last_boosted_q, last_boosted_q * 0.25, bit_depth);
+      active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+      // Update the arf_q since the forward keyframe is replacing the ALTREF
+      *arf_q = active_best_quality;
     } else if (rc->this_key_frame_forced) {
       // Handle the special case for key frames forced when we have reached
       // the maximum key frame interval. Here force the Q to a range
@@ -978,13 +1057,10 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
         active_worst_quality =
             AOMMIN(qindex + delta_qindex, active_worst_quality);
       } else {
-        // Increase the boost if the forced keyframe is a forward reference.
-        // These numbers were derived empirically.
-        const double boost_factor = cpi->oxcf.fwd_kf_enabled ? 0.25 : 0.50;
         qindex = rc->last_boosted_qindex;
         last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
-        delta_qindex = av1_compute_qdelta(
-            rc, last_boosted_q, last_boosted_q * boost_factor, bit_depth);
+        delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+                                          last_boosted_q * 0.50, bit_depth);
         active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
       }
     } else {
@@ -1035,80 +1111,57 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
 
-#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
-      if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
-          (is_intrl_arf_boost && !cpi->new_bwdref_update_rule)) {
-#if REDUCE_LAST_ALT_BOOST
-        if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
-          const int min_boost = get_gf_high_motion_quality(q, bit_depth);
-          const int boost = min_boost - active_best_quality;
+      if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+        const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+        const int boost = min_boost - active_best_quality;
 
-          active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
-        }
-#endif  // REDUCE_LAST_ALT_BOOST
+        active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
         *arf_q = active_best_quality;
-      } else if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
+      } else if (is_intrl_arf_boost) {
         assert(rc->arf_q >= 0);  // Ensure it is set to a valid value.
         active_best_quality = rc->arf_q;
-        int this_height = gf_group->pyramid_level[gf_group->index];
+        int this_height = gf_group_pyramid_level(cpi);
         while (this_height < gf_group->pyramid_height) {
           active_best_quality = (active_best_quality + cq_level + 1) / 2;
           ++this_height;
         }
       }
-#endif  // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
     } else if (oxcf->rc_mode == AOM_Q) {
       if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
         active_best_quality = cq_level;
       } else {
         if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
           active_best_quality = get_gf_active_quality(rc, q, bit_depth);
-          *arf_q = active_best_quality;
-#if REDUCE_LAST_ALT_BOOST
           const int min_boost = get_gf_high_motion_quality(q, bit_depth);
           const int boost = min_boost - active_best_quality;
 
           active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
-#endif
+          *arf_q = active_best_quality;
         } else {
           assert(rc->arf_q >= 0);  // Ensure it is set to a valid value.
+          assert(is_intrl_arf_boost);
           active_best_quality = rc->arf_q;
-        }
-#if USE_SYMM_MULTI_LAYER
-        if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
-          int this_height = gf_group->pyramid_level[gf_group->index];
+          int this_height = gf_group_pyramid_level(cpi);
           while (this_height < gf_group->pyramid_height) {
             active_best_quality = (active_best_quality + cq_level + 1) / 2;
             ++this_height;
           }
-        } else {
-#endif
-          // Modify best quality for second level arfs. For mode AOM_Q this
-          // becomes the baseline frame q.
-          if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
-            active_best_quality = (active_best_quality + cq_level + 1) / 2;
-#if USE_SYMM_MULTI_LAYER
         }
-#endif
       }
     } else {
       active_best_quality = get_gf_active_quality(rc, q, bit_depth);
-#if REDUCE_LAST_ALT_BOOST
       const int min_boost = get_gf_high_motion_quality(q, bit_depth);
       const int boost = min_boost - active_best_quality;
 
       active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
-#endif
-#if USE_SYMM_MULTI_LAYER
-      if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
-        int this_height = gf_group->pyramid_level[gf_group->index];
+      if (is_intrl_arf_boost) {
+        int this_height = gf_group_pyramid_level(cpi);
         while (this_height < gf_group->pyramid_height) {
           active_best_quality =
               (active_best_quality + active_worst_quality + 1) / 2;
           ++this_height;
         }
       }
-#endif
     }
   } else {
     if (oxcf->rc_mode == AOM_Q) {
@@ -1126,8 +1179,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
 
   // Extension to max or min Q if undershoot or overshoot is outside
   // the permitted range.
-  if ((cpi->oxcf.rc_mode != AOM_Q) &&
-      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+  if (cpi->oxcf.rc_mode != AOM_Q) {
     if (frame_is_intra_only(cm) ||
         (!rc->is_src_frame_alt_ref &&
          (cpi->refresh_golden_frame || is_intrl_arf_boost ||
@@ -1146,8 +1198,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
   // Static forced key frames Q restrictions dealt with elsewhere.
   if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
       (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
-    int qdelta = av1_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
-                                       active_worst_quality);
+    const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality);
     active_worst_quality =
         AOMMAX(active_worst_quality + qdelta, active_best_quality);
   }
@@ -1167,7 +1218,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
 
   if (oxcf->rc_mode == AOM_Q ||
       (frame_is_intra_only(cm) && !rc->this_key_frame_forced &&
-       cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH)) {
+       cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
+       rc->frames_to_key > 1)) {
     q = active_best_quality;
     // Special case code to try and match quality with forced key frames.
   } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
@@ -1275,16 +1327,12 @@ static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
 
 static void update_golden_frame_stats(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-#if CUSTOMIZED_GF
   const TWO_PASS *const twopass = &cpi->twopass;
   const GF_GROUP *const gf_group = &twopass->gf_group;
   const int is_intrnl_arf =
       cpi->oxcf.pass == 2
           ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
           : cpi->refresh_alt2_ref_frame;
-#else
-  const int is_intnl_arf = cpi->refresh_alt2_ref_frame;
-#endif
 
   // Update the Golden frame usage counts.
   // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
@@ -1292,9 +1340,10 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
   //                   updated and cpi->refresh_golden_frame will still be zero.
   if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
     // We will not use internal overlay frames to replace the golden frame
-    if (!rc->is_src_frame_ext_arf)
+    if (!rc->is_src_frame_internal_arf) {
       // this frame refreshes means next frames don't unless specified by user
       rc->frames_since_golden = 0;
+    }
 
     // If we are not using alt ref in the up and coming group clear the arf
     // active flag. In multi arf group case, if the index is not 0 then
@@ -1310,165 +1359,16 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
   }
 }
 
-// Define the reference buffers that will be updated post encode.
-void av1_configure_buffer_updates(AV1_COMP *cpi) {
-  TWO_PASS *const twopass = &cpi->twopass;
-
-  // NOTE(weitinglin): Should we define another function to take care of
-  // cpi->rc.is_$Source_Type to make this function as it is in the comment?
-
-  cpi->rc.is_src_frame_alt_ref = 0;
-  cpi->rc.is_bwd_ref_frame = 0;
-  cpi->rc.is_last_bipred_frame = 0;
-  cpi->rc.is_bipred_frame = 0;
-  cpi->rc.is_src_frame_ext_arf = 0;
-
-  switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
-    case KF_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 1;
-      cpi->refresh_bwd_ref_frame = 1;
-      cpi->refresh_alt2_ref_frame = 1;
-      cpi->refresh_alt_ref_frame = 1;
-      break;
-
-    case LF_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-
-    case GF_UPDATE:
-      // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is
-      //               needed.
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 1;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-
-    case OVERLAY_UPDATE:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 1;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      cpi->rc.is_src_frame_alt_ref = 1;
-      break;
-
-    case ARF_UPDATE:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 1;
-      break;
-
-    case BRF_UPDATE:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 1;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      cpi->rc.is_bwd_ref_frame = 1;
-      break;
-
-    case LAST_BIPRED_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      cpi->rc.is_last_bipred_frame = 1;
-      break;
-
-    case BIPRED_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      cpi->rc.is_bipred_frame = 1;
-      break;
-
-    case INTNL_OVERLAY_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      cpi->rc.is_src_frame_alt_ref = 1;
-      cpi->rc.is_src_frame_ext_arf = 1;
-      break;
-
-    case INTNL_ARF_UPDATE:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-#if USE_SYMM_MULTI_LAYER
-      if (cpi->new_bwdref_update_rule == 1) {
-        cpi->refresh_bwd_ref_frame = 1;
-        cpi->refresh_alt2_ref_frame = 0;
-      } else {
-#endif
-        cpi->refresh_bwd_ref_frame = 0;
-        cpi->refresh_alt2_ref_frame = 1;
-#if USE_SYMM_MULTI_LAYER
-      }
-#endif
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-
-    default: assert(0); break;
-  }
-}
-
-void av1_estimate_qp_gop(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  int gop_length = cpi->rc.baseline_gf_interval;
-  int bottom_index, top_index;
-  int idx;
-  const int gf_index = cpi->twopass.gf_group.index;
-
-  for (idx = 1; idx <= gop_length + 1 && idx < MAX_LAG_BUFFERS; ++idx) {
-    TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
-    int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
-    int arf_q = 0;
-
-    cpi->twopass.gf_group.index = idx;
-    rc_set_frame_target(cpi, target_rate, cm->width, cm->height);
-    av1_configure_buffer_updates(cpi);
-    tpl_frame->base_qindex = rc_pick_q_and_bounds_two_pass(
-        cpi, cm->width, cm->height, &bottom_index, &top_index, &arf_q);
-    tpl_frame->base_qindex = AOMMAX(tpl_frame->base_qindex, 1);
-  }
-  // Reset the actual index and frame update
-  cpi->twopass.gf_group.index = gf_index;
-  av1_configure_buffer_updates(cpi);
-}
-
 void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
   const AV1_COMMON *const cm = &cpi->common;
   const CurrentFrame *const current_frame = &cm->current_frame;
   RATE_CONTROL *const rc = &cpi->rc;
-#if CUSTOMIZED_GF
   const TWO_PASS *const twopass = &cpi->twopass;
   const GF_GROUP *const gf_group = &twopass->gf_group;
   const int is_intrnl_arf =
       cpi->oxcf.pass == 2
           ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
           : cpi->refresh_alt2_ref_frame;
-#else
-  const int is_intrnl_arf = cpi->refresh_alt2_ref_frame;
-#endif
 
   const int qindex = cm->base_qindex;
 
@@ -1539,10 +1439,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
 
   // Actual bits spent
   rc->total_actual_bits += rc->projected_frame_size;
-  // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
-  //               differently here for rc->avg_frame_bandwidth.
-  rc->total_target_bits +=
-      (cm->show_frame || rc->is_bwd_ref_frame) ? rc->avg_frame_bandwidth : 0;
+  rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
 
   rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
 
@@ -1575,22 +1472,24 @@ void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
 // Use this macro to turn on/off use of alt-refs in one-pass mode.
 #define USE_ALTREF_FOR_ONE_PASS 1
 
-static int calc_pframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+static int calc_pframe_target_size_one_pass_vbr(
+    const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) {
   static const int af_ratio = 10;
   const RATE_CONTROL *const rc = &cpi->rc;
   int target;
 #if USE_ALTREF_FOR_ONE_PASS
-  target =
-      (!rc->is_src_frame_alt_ref &&
-       (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))
-          ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) /
-                (rc->baseline_gf_interval + af_ratio - 1)
-          : (rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
-                (rc->baseline_gf_interval + af_ratio - 1);
+  if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE ||
+      frame_update_type == ARF_UPDATE) {
+    target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) /
+             (rc->baseline_gf_interval + af_ratio - 1);
+  } else {
+    target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
+             (rc->baseline_gf_interval + af_ratio - 1);
+  }
 #else
   target = rc->avg_frame_bandwidth;
 #endif
-  return av1_rc_clamp_pframe_target_size(cpi, target);
+  return av1_rc_clamp_pframe_target_size(cpi, target, frame_update_type);
 }
 
 static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
@@ -1600,7 +1499,10 @@ static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
   return av1_rc_clamp_iframe_target_size(cpi, target);
 }
 
-void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
+void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi,
+                                    FRAME_UPDATE_TYPE *const frame_update_type,
+                                    EncodeFrameParams *const frame_params,
+                                    unsigned int frame_flags) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   CurrentFrame *const current_frame = &cm->current_frame;
@@ -1610,48 +1512,45 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
   int sframe_mode = cpi->oxcf.sframe_mode;
   int sframe_enabled = cpi->oxcf.sframe_enabled;
   // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
-  if (!cpi->refresh_alt_ref_frame &&
-      (current_frame->frame_number == 0 ||
-       (cpi->frame_flags & FRAMEFLAGS_KEY) || rc->frames_to_key == 0 ||
-       (cpi->oxcf.auto_key && 0))) {
-    current_frame->frame_type = KEY_FRAME;
+  if (*frame_update_type != ARF_UPDATE &&
+      (current_frame->frame_number == 0 || (frame_flags & FRAMEFLAGS_KEY) ||
+       rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+    frame_params->frame_type = KEY_FRAME;
     rc->this_key_frame_forced =
         current_frame->frame_number != 0 && rc->frames_to_key == 0;
     rc->frames_to_key = cpi->oxcf.key_freq;
     rc->kf_boost = DEFAULT_KF_BOOST;
     rc->source_alt_ref_active = 0;
   } else {
-    current_frame->frame_type = INTER_FRAME;
+    frame_params->frame_type = INTER_FRAME;
     if (sframe_enabled) {
       if (altref_enabled) {
         if (sframe_mode == 1) {
           // sframe_mode == 1: insert sframe if it matches altref frame.
 
           if (current_frame->frame_number % sframe_dist == 0 &&
-              current_frame->frame_type != KEY_FRAME &&
-              current_frame->frame_number != 0 && cpi->refresh_alt_ref_frame) {
-            current_frame->frame_type = S_FRAME;
+              current_frame->frame_number != 0 &&
+              *frame_update_type == ARF_UPDATE) {
+            frame_params->frame_type = S_FRAME;
           }
         } else {
           // sframe_mode != 1: if sframe will be inserted at the next available
           // altref frame
 
           if (current_frame->frame_number % sframe_dist == 0 &&
-              current_frame->frame_type != KEY_FRAME &&
               current_frame->frame_number != 0) {
             rc->sframe_due = 1;
           }
 
-          if (rc->sframe_due && cpi->refresh_alt_ref_frame) {
-            current_frame->frame_type = S_FRAME;
+          if (rc->sframe_due && *frame_update_type == ARF_UPDATE) {
+            frame_params->frame_type = S_FRAME;
             rc->sframe_due = 0;
           }
         }
       } else {
         if (current_frame->frame_number % sframe_dist == 0 &&
-            current_frame->frame_type != KEY_FRAME &&
             current_frame->frame_number != 0) {
-          current_frame->frame_type = S_FRAME;
+          frame_params->frame_type = S_FRAME;
         }
       }
     }
@@ -1666,7 +1565,7 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
     } else {
       rc->constrained_gf_group = 0;
     }
-    cpi->refresh_golden_frame = 1;
+    if (*frame_update_type == LF_UPDATE) *frame_update_type = GF_UPDATE;
     rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
     rc->gfu_boost = DEFAULT_GF_BOOST;
   }
@@ -1674,14 +1573,15 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
     av1_cyclic_refresh_update_parameters(cpi);
 
-  if (current_frame->frame_type == KEY_FRAME)
+  if (frame_params->frame_type == KEY_FRAME)
     target = calc_iframe_target_size_one_pass_vbr(cpi);
   else
-    target = calc_pframe_target_size_one_pass_vbr(cpi);
+    target = calc_pframe_target_size_one_pass_vbr(cpi, *frame_update_type);
   rc_set_frame_target(cpi, target, cm->width, cm->height);
 }
 
-static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+static int calc_pframe_target_size_one_pass_cbr(
+    const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
   const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
@@ -1692,12 +1592,14 @@ static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
 
   if (oxcf->gf_cbr_boost_pct) {
     const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
-    target = cpi->refresh_golden_frame
-                 ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval *
-                    af_ratio_pct) /
-                       (rc->baseline_gf_interval * 100 + af_ratio_pct - 100)
-                 : (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
-                       (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+    if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
+      target =
+          (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
+          (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+    } else {
+      target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+               (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+    }
   } else {
     target = rc->avg_frame_bandwidth;
   }
@@ -1740,23 +1642,25 @@ static int calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
   return av1_rc_clamp_iframe_target_size(cpi, target);
 }
 
-void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
+void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi,
+                                    FRAME_UPDATE_TYPE *const frame_update_type,
+                                    EncodeFrameParams *const frame_params,
+                                    unsigned int frame_flags) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   CurrentFrame *const current_frame = &cm->current_frame;
   int target;
   // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
-  if ((current_frame->frame_number == 0 ||
-       (cpi->frame_flags & FRAMEFLAGS_KEY) || rc->frames_to_key == 0 ||
-       (cpi->oxcf.auto_key && 0))) {
-    current_frame->frame_type = KEY_FRAME;
+  if ((current_frame->frame_number == 0 || (frame_flags & FRAMEFLAGS_KEY) ||
+       rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+    frame_params->frame_type = KEY_FRAME;
     rc->this_key_frame_forced =
         current_frame->frame_number != 0 && rc->frames_to_key == 0;
     rc->frames_to_key = cpi->oxcf.key_freq;
     rc->kf_boost = DEFAULT_KF_BOOST;
     rc->source_alt_ref_active = 0;
   } else {
-    current_frame->frame_type = INTER_FRAME;
+    frame_params->frame_type = INTER_FRAME;
   }
   if (rc->frames_till_gf_update_due == 0) {
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
@@ -1768,7 +1672,7 @@ void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
     // NOTE: frames_till_gf_update_due must be <= frames_to_key.
     if (rc->frames_till_gf_update_due > rc->frames_to_key)
       rc->frames_till_gf_update_due = rc->frames_to_key;
-    cpi->refresh_golden_frame = 1;
+    if (*frame_update_type == LF_UPDATE) *frame_update_type = GF_UPDATE;
     rc->gfu_boost = DEFAULT_GF_BOOST;
   }
 
@@ -1777,42 +1681,75 @@ void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
     av1_cyclic_refresh_update_parameters(cpi);
 
-  if (current_frame->frame_type == KEY_FRAME)
+  if (frame_params->frame_type == KEY_FRAME)
     target = calc_iframe_target_size_one_pass_cbr(cpi);
   else
-    target = calc_pframe_target_size_one_pass_cbr(cpi);
+    target = calc_pframe_target_size_one_pass_cbr(cpi, *frame_update_type);
 
   rc_set_frame_target(cpi, target, cm->width, cm->height);
   // TODO(afergs): Decide whether to scale up, down, or not at all
 }
 
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+                    int best_qindex, int worst_qindex) {
+  assert(best_qindex <= worst_qindex);
+  int low = best_qindex;
+  int high = worst_qindex;
+  while (low < high) {
+    const int mid = (low + high) >> 1;
+    const double mid_q = av1_convert_qindex_to_q(mid, bit_depth);
+    if (mid_q < desired_q) {
+      low = mid + 1;
+    } else {
+      high = mid;
+    }
+  }
+  assert(low == high);
+  assert(av1_convert_qindex_to_q(low, bit_depth) >= desired_q ||
+         low == worst_qindex);
+  return low;
+}
+
 int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
                        aom_bit_depth_t bit_depth) {
-  int start_index = rc->worst_quality;
-  int target_index = rc->worst_quality;
-  int i;
-
-  // Convert the average q value to an index.
-  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
-    start_index = i;
-    if (av1_convert_qindex_to_q(i, bit_depth) >= qstart) break;
-  }
+  const int start_index =
+      av1_find_qindex(qstart, bit_depth, rc->best_quality, rc->worst_quality);
+  const int target_index =
+      av1_find_qindex(qtarget, bit_depth, rc->best_quality, rc->worst_quality);
+  return target_index - start_index;
+}
 
-  // Convert the q target to an index
-  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
-    target_index = i;
-    if (av1_convert_qindex_to_q(i, bit_depth) >= qtarget) break;
+// Find q_index for the desired_bits_per_mb, within [best_qindex, worst_qindex],
+// assuming 'correction_factor' is 1.0.
+// To be precise, 'q_index' is the smallest integer, for which the corresponding
+// bits per mb <= desired_bits_per_mb.
+// If no such q index is found, returns 'worst_qindex'.
+static int find_qindex_by_rate(int desired_bits_per_mb,
+                               aom_bit_depth_t bit_depth, FRAME_TYPE frame_type,
+                               int best_qindex, int worst_qindex) {
+  assert(best_qindex <= worst_qindex);
+  int low = best_qindex;
+  int high = worst_qindex;
+  while (low < high) {
+    const int mid = (low + high) >> 1;
+    const int mid_bits_per_mb =
+        av1_rc_bits_per_mb(frame_type, mid, 1.0, bit_depth);
+    if (mid_bits_per_mb > desired_bits_per_mb) {
+      low = mid + 1;
+    } else {
+      high = mid;
+    }
   }
-
-  return target_index - start_index;
+  assert(low == high);
+  assert(av1_rc_bits_per_mb(frame_type, low, 1.0, bit_depth) <=
+             desired_bits_per_mb ||
+         low == worst_qindex);
+  return low;
 }
 
 int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
                                int qindex, double rate_target_ratio,
                                aom_bit_depth_t bit_depth) {
-  int target_index = rc->worst_quality;
-  int i;
-
   // Look up the current projected bits per block for the base index
   const int base_bits_per_mb =
       av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth);
@@ -1820,14 +1757,9 @@ int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
   // Find the target bits per mb based on the base value and given ratio.
   const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
 
-  // Convert the q target to an index
-  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
-    if (av1_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <=
-        target_bits_per_mb) {
-      target_index = i;
-      break;
-    }
-  }
+  const int target_index =
+      find_qindex_by_rate(target_bits_per_mb, bit_depth, frame_type,
+                          rc->best_quality, rc->worst_quality);
   return target_index - qindex;
 }
 
diff --git a/libaom/av1/encoder/ratectrl.h b/libaom/av1/encoder/ratectrl.h
index ea8975d..1cd5994 100644
--- a/libaom/av1/encoder/ratectrl.h
+++ b/libaom/av1/encoder/ratectrl.h
@@ -15,6 +15,8 @@
 #include "aom/aom_codec.h"
 #include "aom/aom_integer.h"
 
+#include "aom_ports/mem.h"
+
 #include "av1/common/blockd.h"
 #include "av1/common/onyxc_int.h"
 
@@ -34,54 +36,29 @@ extern "C" {
 // The maximum duration of a GF group that is static (e.g. a slide show).
 #define MAX_STATIC_GF_GROUP_LENGTH 250
 
-#define CUSTOMIZED_GF 1
-
-#if CONFIG_FIX_GF_LENGTH
-#define FIXED_GF_LENGTH 16
+// Minimum and maximum height for the new pyramid structure.
+// (Old structure supports height = 1, but does NOT support height = 4).
+#define MIN_PYRAMID_LVL 0
 #define MAX_PYRAMID_LVL 4
-// We allow a frame to have at most two left/right descendants before changing
-// them into to a subtree, i.e., we allow the following structure:
-/*                    OUT_OF_ORDER_FRAME
-                     / /              \ \
-(two left children) F F                F F (two right children) */
-// Therefore the max gf size supported by 4 layer structure is
-// 1 (KEY/OVERLAY) + 1 + 2 + 4 + 16 (two children on both side of their parent)
-#define MAX_PYRAMID_SIZE 24
-#define USE_SYMM_MULTI_LAYER 1
-#define REDUCE_LAST_ALT_BOOST 1
-#define REDUCE_LAST_GF_LENGTH 1
-#define MULTI_LVL_BOOST_VBR_CQ 1
-#else
-#define MAX_PYRAMID_SIZE 16
-#define USE_SYMM_MULTI_LAYER 0
-#define REDUCE_LAST_ALT_BOOST 0
-#define REDUCE_LAST_GF_LENGTH 0
-#define MULTI_LVL_BOOST_VBR_CQ 0
-#endif
-
-#if USE_SYMM_MULTI_LAYER
-#define USE_MANUAL_GF4_STRUCT 0
-#endif
 
 #define MIN_GF_INTERVAL 4
 #define MAX_GF_INTERVAL 16
 #define FIXED_GF_INTERVAL 8  // Used in some testing modes only
 
-static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
-  1.00,  // INTER_NORMAL
-  0.80,  // INTER_LOW
-  1.50,  // INTER_HIGH
-  1.25,  // GF_ARF_LOW
-  2.00,  // GF_ARF_STD
-  2.00,  // KF_STD
-};
-
 typedef struct {
   int resize_width;
   int resize_height;
   uint8_t superres_denom;
 } size_params_type;
 
+enum {
+  INTER_NORMAL,
+  GF_ARF_LOW,
+  GF_ARF_STD,
+  KF_STD,
+  RATE_FACTOR_LEVELS
+} UENUM1BYTE(RATE_FACTOR_LEVEL);
+
 typedef struct {
   // Rate targetting variables
   int base_frame_target;  // A baseline frame target before adjustment
@@ -94,7 +71,6 @@ typedef struct {
   int last_kf_qindex;       // Q index of the last key frame coded.
 
   int gfu_boost;
-  int last_boost;
   int kf_boost;
 
   double rate_correction_factors[RATE_FACTOR_LEVELS];
@@ -113,18 +89,9 @@ typedef struct {
   int source_alt_ref_pending;
   int source_alt_ref_active;
   int is_src_frame_alt_ref;
+  int is_src_frame_internal_arf;
   int sframe_due;
 
-  // Length of the bi-predictive frame group interval
-  int bipred_group_interval;
-
-  // NOTE: Different types of frames may have different bits allocated
-  //       accordingly, aiming to achieve the overall optimal RD performance.
-  int is_bwd_ref_frame;
-  int is_last_bipred_frame;
-  int is_bipred_frame;
-  int is_src_frame_ext_arf;
-
   int avg_frame_bandwidth;  // Average frame size target for clip
   int min_frame_bandwidth;  // Minimum allocation used for any frame
   int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
@@ -172,8 +139,6 @@ typedef struct {
   int q_1_frame;
   int q_2_frame;
 
-  // Auto frame-scaling variables.
-  int rf_level_maxq[RATE_FACTOR_LEVELS];
   float_t arf_boost_factor;
   // Q index used for ALT frame
   int arf_q;
@@ -196,7 +161,7 @@ int av1_rc_get_default_min_gf_interval(int width, int height, double framerate);
 // Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
 // be passed in to ensure that the max_gf_interval returned is at least as bis
 // as that.
-int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
 
 // Generally at the high level, the following flow is expected
 // to be enforced for rate control:
@@ -221,8 +186,13 @@ int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
 
 // Functions to set parameters for encoding before the actual
 // encode_frame_to_data_rate() function.
-void av1_rc_get_one_pass_vbr_params(struct AV1_COMP *cpi);
-void av1_rc_get_one_pass_cbr_params(struct AV1_COMP *cpi);
+struct EncodeFrameParams;
+void av1_rc_get_one_pass_vbr_params(
+    struct AV1_COMP *cpi, uint8_t *const frame_update_type,
+    struct EncodeFrameParams *const frame_params, unsigned int frame_flags);
+void av1_rc_get_one_pass_cbr_params(
+    struct AV1_COMP *cpi, uint8_t *const frame_update_type,
+    struct EncodeFrameParams *const frame_params, unsigned int frame_flags);
 
 // Post encode update of the rate control parameters based
 // on bytes used
@@ -262,7 +232,14 @@ int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
 int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
                                     int target);
 int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
-                                    int target);
+                                    int target, uint8_t frame_update_type);
+
+// Find q_index corresponding to desired_q, within [best_qindex, worst_qindex].
+// To be precise, 'q_index' is the smallest integer, for which the corresponding
+// q >= desired_q.
+// If no such q index is found, returns 'worst_qindex'.
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+                    int best_qindex, int worst_qindex);
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
 // to a target q value
@@ -275,7 +252,7 @@ int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
                                int qindex, double rate_target_ratio,
                                aom_bit_depth_t bit_depth);
 
-int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int rf_level, int q);
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q);
 
 void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height);
 
@@ -286,10 +263,6 @@ void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);
 
 int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
 
-void av1_configure_buffer_updates(struct AV1_COMP *cpi);
-
-void av1_estimate_qp_gop(struct AV1_COMP *cpi);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/av1/encoder/rd.c b/libaom/av1/encoder/rd.c
index 510bb3b..d78e269 100644
--- a/libaom/av1/encoder/rd.c
+++ b/libaom/av1/encoder/rd.c
@@ -344,13 +344,7 @@ void av1_init_me_luts(void) {
 static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
                                          8,  8,  4,  4,  2,  2,  1,  0 };
 static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
-  128, 144, 128, 128, 144,
-  // TODO(zoeliu): To adjust further following factor values.
-  128, 128, 128,
-  // TODO(weitinglin): We should investigate if the values should be the same
-  //                   as the value used by OVERLAY frame
-  144,  // INTNL_OVERLAY_UPDATE
-  128   // INTNL_ARF_UPDATE
+  128, 144, 128, 128, 144, 144, 128
 };
 
 int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) {
@@ -508,6 +502,17 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
         av1_cost_tokens_from_cdf(pcost->base_cost[ctx],
                                  fc->coeff_base_cdf[tx_size][plane][ctx], NULL);
 
+      for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
+        pcost->base_cost[ctx][4] = 0;
+        pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] +
+                                   av1_cost_literal(1) -
+                                   pcost->base_cost[ctx][0];
+        pcost->base_cost[ctx][6] =
+            pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1];
+        pcost->base_cost[ctx][7] =
+            pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2];
+      }
+
       for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
         av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx],
                                  fc->eob_extra_cdf[tx_size][plane][ctx], NULL);
@@ -538,6 +543,14 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
         //  printf("%5d ", pcost->lps_cost[ctx][i]);
         // printf("\n");
       }
+      for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+        pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] =
+            pcost->lps_cost[ctx][0];
+        for (int i = 1; i <= COEFF_BASE_RANGE; ++i) {
+          pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] =
+              pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1];
+        }
+      }
     }
   }
 }
@@ -684,6 +697,7 @@ static double interp_cubic(const double *p, double x) {
                           x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
 }
 
+/*
 static double interp_bicubic(const double *p, int p_stride, double x,
                              double y) {
   double q[4];
@@ -693,441 +707,224 @@ static double interp_bicubic(const double *p, int p_stride, double x,
   q[3] = interp_cubic(p + 3 * p_stride, x);
   return interp_cubic(q, y);
 }
+*/
 
-static const double interp_rgrid_surf[65 * 18] = {
-  0.104019,    0.245714,    0.293686,    0.358635,    0.382167,    0.412446,
-  0.419955,    0.421388,    0.426672,    0.427990,    0.428531,    0.456868,
-  0.569880,    0.638822,    1.016319,    2.143453,    3.565229,    4.720880,
-  0.124618,    0.294211,    0.352023,    0.429991,    0.458206,    0.494510,
-  0.503513,    0.505232,    0.511566,    0.513234,    0.519365,    0.570225,
-  0.697373,    0.840624,    1.462198,    3.289054,    6.256517,    6.852788,
-  0.118630,    0.269669,    0.346620,    0.430999,    0.459385,    0.495783,
-  0.504808,    0.506532,    0.512884,    0.514988,    0.543437,    0.662772,
-  0.795876,    1.313596,    2.403841,    4.163098,    7.440589,    8.616275,
-  0.093329,    0.168205,    0.321320,    0.430607,    0.459385,    0.495783,
-  0.504813,    0.506548,    0.512975,    0.520662,    0.571659,    0.701841,
-  1.010727,    2.138851,    3.460626,    6.317955,    10.098127,   14.418553,
-  0.087021,    0.142905,    0.315011,    0.430509,    0.459385,    0.495787,
-  0.505075,    0.507599,    0.513584,    0.543182,    0.669941,    0.825620,
-  1.362800,    2.572187,    4.205047,    7.498399,    12.303118,   16.641735,
-  0.086923,    0.142513,    0.314913,    0.430508,    0.459385,    0.495803,
-  0.506126,    0.511816,    0.514810,    0.549705,    0.725350,    1.127334,
-  2.168597,    3.463686,    6.318605,    10.162284,   18.556041,   19.847042,
-  0.086923,    0.142513,    0.314913,    0.430506,    0.459376,    0.495805,
-  0.506388,    0.512954,    0.520772,    0.580215,    0.810474,    1.391548,
-  2.579442,    4.205160,    7.498399,    12.381597,   21.703618,   24.015457,
-  0.086923,    0.142513,    0.314911,    0.430353,    0.458765,    0.495652,
-  0.506391,    0.513406,    0.544098,    0.702950,    1.121860,    2.168961,
-  3.463798,    6.318607,    10.162284,   18.685361,   28.188192,   37.638872,
-  0.086923,    0.142513,    0.314901,    0.429742,    0.456313,    0.495045,
-  0.506484,    0.519195,    0.580104,    0.810126,    1.391462,    2.579441,
-  4.205160,    7.498399,    12.381597,   21.848607,   33.367199,   42.623190,
-  0.086923,    0.142513,    0.314899,    0.429589,    0.455706,    0.495155,
-  0.507882,    0.542426,    0.702360,    1.119921,    2.168478,    3.463791,
-  6.318607,    10.162284,   18.685361,   28.345760,   47.802028,   49.163533,
-  0.086924,    0.142548,    0.315086,    0.429842,    0.455870,    0.496336,
-  0.512412,    0.556953,    0.773373,    1.266396,    2.548277,    4.204676,
-  7.498399,    12.381597,   21.848607,   33.548250,   54.301011,   56.262859,
-  0.087067,    0.144957,    0.327436,    0.446616,    0.466362,    0.505706,
-  0.522077,    0.610747,    0.972543,    1.666916,    3.338812,    6.316669,
-  10.162284,   18.685361,   28.345760,   48.065311,   66.145302,   78.396020,
-  0.094295,    0.164235,    0.393722,    0.534219,    0.530922,    0.579308,
-  0.603889,    0.760870,    1.229961,    2.423214,    4.173513,    7.497916,
-  12.381597,   21.848607,   33.548250,   54.589585,   74.875848,   86.468182,
-  0.124096,    0.213005,    0.497188,    0.665176,    0.685973,    0.800200,
-  0.911394,    1.077971,    1.677290,    3.332129,    6.314960,    10.162257,
-  18.685361,   28.345760,   48.065311,   66.453506,   98.275189,   96.862588,
-  0.140999,    0.270140,    0.658212,    0.867661,    0.970183,    1.149516,
-  1.480599,    1.664833,    2.421893,    3.857981,    7.418830,    12.380371,
-  21.848607,   33.548250,   54.589585,   75.188867,   106.657971,  99.762997,
-  0.178353,    0.398001,    0.988462,    1.241473,    1.340967,    1.713568,
-  2.335030,    2.701432,    3.348532,    5.077158,    9.829903,    18.676528,
-  28.345700,   48.065311,   66.453506,   98.588283,   117.057193,  101.130722,
-  0.281079,    0.548300,    1.395825,    1.780770,    2.000508,    2.702964,
-  3.638454,    4.573843,    5.051641,    7.079129,    11.293332,   21.594861,
-  33.544335,   54.589585,   75.188867,   106.971065,  119.957601,  101.466632,
-  0.476762,    0.842189,    2.019678,    2.723895,    3.188467,    4.011610,
-  5.545111,    7.508984,    8.176339,    9.774504,    14.720782,   27.334416,
-  48.049609,   66.453506,   98.588283,   117.370357,  121.329855,  101.509242,
-  0.993999,    1.520111,    3.013605,    4.203530,    4.982992,    6.074944,
-  8.583581,    11.818375,   14.192544,   14.937517,   21.258160,   33.305953,
-  54.585735,   75.188867,   106.971135,  120.279824,  121.976055,  102.690130,
-  1.776487,    2.613655,    4.356487,    6.161726,    7.622196,    9.464193,
-  13.077233,   18.051656,   23.221051,   24.080068,   30.085038,   48.345269,
-  66.457698,   98.588353,   117.379415,  121.976128,  124.356210,  107.713202,
-  3.191085,    4.495201,    5.686033,    8.365566,    11.275339,   14.706437,
-  20.300969,   28.152237,   35.688355,   39.341382,   41.030743,   55.752262,
-  75.211764,   106.980285,  120.608403,  124.680746,  130.222528,  112.260098,
-  6.136611,    7.305215,    7.272532,    10.646713,   15.630815,   22.383168,
-  31.349131,   42.419822,   52.301680,   58.983454,   58.915405,   69.161305,
-  98.992460,   117.713855,  124.344836,  130.623638,  138.442401,  127.846670,
-  11.707980,   13.490761,   11.640845,   14.176132,   22.131124,   33.776462,
-  47.365711,   61.603834,   75.281056,   83.463985,   85.510533,   86.026513,
-  108.787480,  123.031136,  130.607284,  138.954406,  160.867784,  158.958882,
-  27.062874,   32.195139,   24.147297,   22.114632,   35.580506,   52.551674,
-  71.652956,   88.606776,   102.107193,  110.703186,  114.398733,  111.118539,
-  121.503578,  132.455924,  139.490806,  161.412674,  193.563210,  172.203945,
-  35.625692,   47.953028,   42.639820,   42.276254,   58.815664,   84.977282,
-  110.656412,  126.168446,  134.658126,  140.604482,  144.006012,  141.702382,
-  140.125323,  153.122630,  164.748041,  194.156197,  206.854650,  174.013079,
-  49.516447,   65.335381,   71.738306,   81.872819,   98.400740,   136.840488,
-  163.775802,  169.440078,  172.747876,  171.222919,  171.679604,  172.173550,
-  168.200129,  187.617133,  199.683394,  207.768200,  210.062520,  175.478356,
-  60.341673,   92.487135,   119.907299,  136.068010,  144.778950,  189.443534,
-  220.120077,  219.641635,  214.616503,  205.894657,  198.453924,  200.013069,
-  195.938103,  206.118661,  210.447375,  212.061379,  216.078218,  181.162805,
-  78.422159,   112.242899,  158.416312,  181.404320,  193.188690,  229.296967,
-  270.461799,  275.168977,  256.511701,  244.706786,  231.344608,  226.065087,
-  222.248618,  218.662324,  217.966722,  218.248574,  218.818588,  182.740573,
-  88.713664,   123.594164,  172.928179,  213.781414,  245.800351,  252.063414,
-  313.283141,  331.703831,  305.866639,  285.177142,  269.759635,  251.988739,
-  245.998388,  232.688076,  230.588702,  230.882657,  230.319053,  192.120741,
-  102.540561,  152.905927,  189.137131,  241.806756,  273.868497,  284.258017,
-  339.689853,  373.561104,  362.657463,  326.291984,  311.922687,  290.460189,
-  276.774381,  273.012072,  277.751792,  279.123748,  278.820447,  233.813798,
-  132.983118,  176.307242,  197.415684,  243.307787,  280.893995,  332.922370,
-  340.329043,  404.530166,  419.475405,  375.775209,  351.300889,  340.042759,
-  315.683832,  306.123530,  306.359319,  306.733063,  307.609556,  261.647847,
-  149.579109,  185.925581,  207.937033,  245.159084,  301.890957,  350.040480,
-  352.250771,  418.742329,  458.112686,  430.125208,  386.460441,  380.346839,
-  354.679150,  337.305620,  334.504124,  335.889932,  341.060725,  286.898578,
-  153.576812,  202.105624,  219.366967,  248.524506,  314.255692,  350.607526,
-  390.567688,  408.629209,  488.000213,  480.563823,  432.461799,  410.412624,
-  398.607371,  400.188740,  402.780916,  408.853470,  430.449735,  363.777088,
-  161.353129,  214.848904,  231.549852,  258.536466,  313.163177,  368.140577,
-  412.136393,  413.409032,  499.838438,  519.571063,  485.833867,  444.562715,
-  435.738129,  442.358549,  450.166531,  453.208524,  458.424358,  385.823139,
-  175.109034,  227.608058,  250.069563,  286.101747,  312.256740,  378.421485,
-  413.344147,  435.058646,  476.960941,  542.448886,  530.189154,  495.408402,
-  475.326752,  465.017144,  464.694045,  465.144689,  466.905382,  398.669138,
-  184.750180,  240.766694,  283.240772,  305.480150,  322.409001,  374.526162,
-  427.141326,  452.840323,  472.604139,  545.366105,  567.676694,  541.666203,
-  509.591873,  492.044219,  492.778569,  493.765684,  493.235693,  413.684325,
-  194.728357,  254.928927,  289.991157,  300.193195,  324.194589,  371.563147,
-  439.226438,  468.295088,  495.654854,  533.506353,  587.476353,  578.298989,
-  548.041942,  527.393885,  538.965146,  545.070442,  544.295454,  454.012211,
-  205.195287,  283.135677,  297.921431,  319.295927,  355.621830,  392.466463,
-  446.696167,  485.053519,  516.426615,  532.264584,  588.481600,  615.906737,
-  589.319634,  555.754316,  558.389367,  569.094521,  569.779764,  475.384946,
-  218.552054,  298.511016,  319.188338,  351.781666,  372.789510,  412.827434,
-  464.569387,  506.270203,  533.049810,  553.347364,  580.644599,  632.759854,
-  622.235843,  569.960552,  580.799340,  586.553714,  579.488366,  491.826482,
-  244.803348,  299.790203,  324.187975,  363.280782,  403.710443,  441.724083,
-  492.732682,  534.722691,  552.193622,  575.112647,  586.097705,  635.224970,
-  644.642944,  606.017786,  640.321218,  642.316989,  616.397020,  548.300111,
-  256.957358,  318.638991,  355.063346,  389.889307,  433.607315,  468.209001,
-  515.178157,  573.556591,  578.113115,  587.246475,  601.762801,  638.454644,
-  656.574853,  641.184609,  676.908189,  684.198162,  678.387412,  574.805864,
-  251.211502,  323.448532,  364.227424,  411.792704,  462.226488,  503.572288,
-  549.299249,  599.124071,  601.227977,  597.118176,  613.247552,  633.278532,
-  658.074755,  664.930719,  685.731531,  693.632845,  693.076350,  578.326477,
-  267.695377,  354.273736,  389.976833,  438.518178,  493.332686,  544.343027,
-  588.895829,  620.206193,  628.327410,  606.067827,  620.998532,  657.985256,
-  683.936059,  691.345257,  693.894723,  695.175306,  693.618786,  578.517148,
-  274.290725,  363.465288,  411.808596,  463.369805,  515.310226,  581.009306,
-  613.070738,  636.638714,  647.333929,  629.867603,  644.646319,  687.796202,
-  702.859596,  713.495479,  704.068069,  704.991807,  704.188594,  587.283658,
-  302.538449,  389.174737,  438.518422,  493.398902,  547.662399,  601.981814,
-  624.773046,  641.629484,  644.699451,  645.848784,  668.033340,  703.643523,
-  707.422408,  717.329600,  726.298973,  744.127507,  745.365167,  617.954068,
-  310.328188,  410.984766,  463.369805,  515.315010,  581.309832,  613.787792,
-  634.988538,  654.145284,  662.632978,  668.413496,  706.494057,  750.545471,
-  730.724808,  730.002100,  743.625262,  750.801609,  745.308457,  606.505800,
-  329.948756,  437.600191,  493.398902,  547.661910,  601.917884,  622.557745,
-  633.244395,  644.055898,  648.224221,  665.062911,  763.555733,  812.391078,
-  769.063582,  744.865168,  727.579796,  724.950408,  722.179707,  598.564510,
-  350.848328,  462.437458,  515.315010,  581.309823,  613.779123,  634.465309,
-  652.056257,  662.179143,  671.466297,  726.881256,  819.824030,  880.232789,
-  810.371672,  754.246481,  725.053473,  724.253390,  723.503395,  603.394909,
-  373.704088,  492.408266,  547.661910,  601.917884,  622.557620,  633.236320,
-  644.023513,  648.232514,  666.381639,  785.498283,  929.441612,  999.772800,
-  890.339033,  775.852504,  731.840181,  726.905100,  725.251844,  604.899901,
-  394.473422,  514.261306,  581.309823,  613.779123,  634.465309,  652.056257,
-  662.179143,  671.466557,  727.134512,  835.764144,  981.747089,  1018.462934,
-  939.686967,  811.276731,  739.398459,  727.365647,  725.285425,  604.923525,
-  419.976505,  546.538939,  601.917884,  622.557620,  633.236320,  644.023513,
-  648.232514,  666.381639,  785.545191,  932.841398,  1036.609617, 1026.945092,
-  963.822765,  840.827315,  755.532423,  730.241865,  725.366847,  604.924155,
-  437.281359,  580.116337,  613.779123,  634.465309,  652.056257,  662.179143,
-  671.466557,  727.134512,  835.764859,  981.996194,  1031.896881, 1002.544732,
-  881.157178,  828.151494,  799.340975,  751.314325,  728.316587,  605.005504,
-  464.713920,  600.649281,  622.557620,  633.236320,  644.023513,  648.232514,
-  666.381639,  785.545191,  932.841398,  1036.735329, 1035.037004, 995.478339,
-  858.093733,  823.471976,  819.881754,  798.749289,  749.440463,  607.955244,
-  495.880237,  612.473139,  634.465309,  652.056257,  662.179143,  671.466557,
-  727.134512,  835.764859,  981.996194,  1032.339788, 1031.105117, 995.303259,
-  857.733663,  823.435877,  822.822791,  819.873050,  796.882480,  629.038445,
-  510.391280,  621.158273,  633.236320,  644.023513,  648.232514,  666.381639,
-  785.545191,  932.841398,  1036.735329, 1035.566013, 1029.599350, 994.926093,
-  857.645648,  823.435143,  822.904139,  822.822791,  817.965681,  673.856962,
-  514.588176,  632.947715,  652.056257,  662.179143,  671.466557,  727.134512,
-  835.764859,  981.996194,  1032.339788, 1031.547475, 1023.835377, 972.158629,
-  851.968626,  823.347128,  822.904770,  822.904139,  820.752301,  684.418900,
-  520.013294,  631.668183,  644.023513,  648.232514,  666.381639,  785.545191,
-  932.841398,  1036.735329, 1035.567378, 1029.776746, 1001.044108, 880.853721,
-  829.201546,  822.994150,  822.904770,  822.904770,  820.792975,  684.582020,
-  531.253628,  650.479606,  662.179143,  671.466557,  727.134512,  835.764859,
-  981.996194,  1032.339788, 1031.636855, 1029.601779, 995.366703,  858.086641,
-  823.524524,  822.906135,  822.904770,  822.904770,  820.792975,  684.582020,
-  528.531744,  642.424501,  648.232514,  666.381639,  785.545191,  932.841398,
-  1036.735329, 1035.567378, 1030.219103, 1029.576226, 995.278687,  857.733663,
-  823.436508,  822.904770,  822.904770,  822.904770,  820.792975,  684.582020,
-  545.401164,  660.550678,  671.508859,  727.304161,  835.807162,  981.996850,
-  1032.339788, 1031.636855, 1030.130788, 1029.487827, 994.925709,  857.645648,
-  823.435143,  822.904770,  822.904770,  822.904770,  820.792975,  684.582020,
-  537.684760,  646.650947,  669.110131,  796.487512,  935.569890,  1036.777631,
-  1035.567378, 1030.219103, 1030.018584, 1023.810805, 972.158629,  851.968626,
-  823.347128,  822.904770,  822.904770,  822.904770,  820.792975,  684.582020,
-  552.408370,  670.001885,  738.246482,  879.690154,  992.939171,  1032.509436,
-  1031.636855, 1030.132153, 1029.665223, 1001.043724, 880.853721,  829.201546,
-  822.994150,  822.904770,  822.904770,  822.904770,  820.792975,  684.582020,
-  539.835902,  667.496388,  799.216004,  946.512211,  1039.506123, 1035.609680,
-  1030.219103, 1030.107964, 1029.577207, 995.366703,  858.086641,  823.524524,
-  822.906135,  822.904770,  822.904770,  822.904770,  820.792975,  684.582020,
-  558.362529,  734.277451,  877.197218,  990.478243,  1029.908393, 1028.993978,
-  1027.488620, 1027.464048, 1026.933674, 992.724534,  855.532488,  821.323349,
-  820.792975,  820.792975,  820.792975,  820.792975,  818.686600,  682.825198,
-  453.127195,  649.075095,  780.278390,  867.165890,  862.469711,  857.067460,
-  856.956321,  856.955937,  856.513579,  827.981461,  713.556496,  685.024378,
-  684.582020,  684.582020,  684.582020,  684.582020,  682.825198,  569.510056,
+static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = {
+  0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
 };
 
-static const double interp_dgrid_surf[65 * 18] = {
-  10.650434, 12.204694, 12.040917, 11.843008, 11.845578, 12.051535, 12.103583,
-  12.136780, 12.266709, 12.299107, 12.299673, 12.303120, 12.316337, 12.293431,
-  12.092165, 11.602421, 11.141559, 8.864495,  12.770003, 14.634889, 14.437149,
-  14.199413, 14.202487, 14.449423, 14.511827, 14.551629, 14.707410, 14.746265,
-  14.747610, 14.753705, 14.762194, 14.699395, 14.390525, 13.690970, 12.874168,
-  10.367121, 12.832328, 14.790730, 14.503765, 14.236403, 14.239028, 14.486600,
-  14.549164, 14.589069, 14.745250, 14.784258, 14.788320, 14.801930, 14.762798,
-  14.499088, 14.021544, 13.469684, 12.661560, 10.108384, 12.950520, 15.264726,
-  14.621957, 14.238236, 14.239028, 14.486601, 14.549264, 14.589469, 14.745361,
-  14.784949, 14.791572, 14.798652, 14.660251, 14.119394, 13.651131, 12.935657,
-  12.176082, 9.228999,  12.979992, 15.382918, 14.651428, 14.238693, 14.239028,
-  14.486701, 14.555710, 14.615321, 14.751849, 14.787700, 14.797104, 14.743189,
-  14.475057, 13.944406, 13.450468, 12.687876, 11.824993, 8.906683,  12.980449,
-  15.384750, 14.651885, 14.238700, 14.239028, 14.487102, 14.581562, 14.718998,
-  14.777721, 14.788445, 14.778661, 14.582790, 14.099785, 13.649637, 12.935359,
-  12.201859, 10.891931, 8.482221,  12.980449, 15.384750, 14.651886, 14.238801,
-  14.239434, 14.487303, 14.588010, 14.744860, 14.784773, 14.786094, 14.735647,
-  14.455704, 13.939591, 13.450393, 12.687876, 11.849334, 10.476658, 8.043672,
-  12.980449, 15.384750, 14.651987, 14.245320, 14.265579, 14.493824, 14.588211,
-  14.745312, 14.787263, 14.775934, 14.582036, 14.099475, 13.649563, 12.935358,
-  12.201859, 10.911285, 9.730570,  6.696921,  12.980449, 15.384750, 14.652393,
-  14.271466, 14.370434, 14.520069, 14.589027, 14.746028, 14.785482, 14.735605,
-  14.455693, 13.939590, 13.450393, 12.687876, 11.849334, 10.494514, 9.195398,
-  6.215460,  12.980449, 15.384750, 14.652494, 14.277985, 14.396679, 14.533035,
-  14.615021, 14.754825, 14.775610, 14.582796, 14.099664, 13.649565, 12.935358,
-  12.201859, 10.911285, 9.747361,  7.779960,  5.617541,  12.980448, 15.384731,
-  14.652415, 14.278078, 14.397578, 14.559053, 14.718657, 14.776398, 14.747044,
-  14.504690, 13.951810, 13.450583, 12.687876, 11.849334, 10.494514, 9.210817,
-  7.210003,  5.164575,  12.980446, 15.383448, 14.647073, 14.277541, 14.403813,
-  14.569546, 14.744956, 14.765103, 14.629073, 14.296161, 13.698573, 12.936118,
-  12.201859, 10.911285, 9.747361,  7.790897,  6.322998,  3.931551,  12.981550,
-  15.376916, 14.615597, 14.274820, 14.437479, 14.575942, 14.707492, 14.734111,
-  14.515975, 14.000806, 13.462803, 12.688066, 11.849334, 10.494514, 9.210817,
-  7.219566,  5.781392,  3.486081,  12.991899, 15.376201, 14.579444, 14.296898,
-  14.473361, 14.522910, 14.491600, 14.543267, 14.288580, 13.700311, 12.936579,
-  12.201867, 10.911285, 9.747361,  7.790897,  6.331506,  4.480348,  2.923138,
-  13.019848, 15.383477, 14.582260, 14.385262, 14.452673, 14.436019, 14.238174,
-  14.255993, 13.977481, 13.532342, 12.705591, 11.849605, 10.494514, 9.210817,
-  7.219566,  5.789642,  4.018194,  2.766222,  13.028558, 15.315782, 14.439141,
-  14.326286, 14.452429, 14.311731, 14.033235, 13.922587, 13.665868, 13.207897,
-  12.274375, 10.912967, 9.747371,  7.790897,  6.331506,  4.488594,  3.454993,
-  2.692682,  12.992752, 15.321471, 14.409573, 14.236340, 14.322969, 14.049072,
-  13.764823, 13.479242, 13.250105, 12.759133, 12.019174, 10.532951, 9.211409,
-  7.219566,  5.789642,  4.026440,  3.298077,  2.674624,  12.945493, 15.276596,
-  14.315745, 14.026198, 14.085774, 13.844563, 13.447576, 12.964935, 12.735525,
-  12.288592, 11.511693, 9.900227,  7.793270,  6.331506,  4.488594,  3.463236,
-  3.224318,  2.672433,  12.757570, 15.056661, 14.095011, 13.722362, 13.812624,
-  13.608480, 13.021206, 12.367627, 11.937931, 11.581049, 10.599552, 9.247860,
-  7.220151,  5.789642,  4.026437,  3.305882,  3.191260,  2.615317,  12.581293,
-  14.824658, 13.909074, 13.496158, 13.491402, 13.221550, 12.514140, 11.677229,
-  10.936895, 10.619912, 9.634779,  7.763570,  6.331082,  4.488590,  3.462798,
-  3.216460,  3.076315,  2.373499,  12.283499, 14.455760, 13.890593, 13.427587,
-  13.183783, 12.763833, 11.861006, 10.740618, 9.820756,  9.354945,  8.669862,
-  7.123268,  5.787860,  4.025994,  3.290000,  3.084410,  2.810905,  2.222916,
-  12.010893, 14.300919, 13.986624, 13.484026, 13.025385, 12.224281, 11.064265,
-  9.631040,  8.594396,  8.003736,  7.561587,  6.274418,  4.466637,  3.446574,
-  3.102467,  2.816989,  2.598688,  1.951541,  11.581477, 13.831132, 13.632027,
-  13.380414, 12.807880, 11.665651, 10.218236, 8.562237,  7.222614,  6.611808,
-  6.261676,  5.402793,  3.938544,  3.174375,  2.818166,  2.602758,  2.213911,
-  1.434763,  11.050735, 12.893449, 12.363152, 12.712829, 12.012961, 10.887854,
-  9.109699,  7.421701,  5.965603,  5.272129,  4.991435,  4.423000,  3.369988,
-  2.800371,  2.593901,  2.217431,  1.670917,  1.215265,  10.641194, 11.766277,
-  10.777082, 10.972917, 10.689298, 9.701545,  7.719947,  6.145654,  4.872442,
-  4.099600,  3.880934,  3.514159,  2.786474,  2.368963,  2.162376,  1.673670,
-  1.450770,  1.185424,  10.071964, 11.107701, 9.172361,  8.551313,  8.412080,
-  7.641397,  6.174246,  4.853916,  3.904549,  3.246810,  2.959903,  2.785066,
-  2.240001,  1.793166,  1.585520,  1.449824,  1.405368,  1.168856,  9.213182,
-  9.173278,  7.219231,  6.242951,  5.626013,  5.768007,  4.908666,  3.809589,
-  3.115109,  2.617899,  2.274793,  2.172960,  1.838597,  1.505915,  1.414333,
-  1.392666,  1.338173,  1.105611,  7.365015,  7.471370,  5.622346,  4.520127,
-  3.936272,  4.208822,  3.623024,  2.977794,  2.450003,  2.097261,  1.824090,
-  1.643270,  1.473525,  1.351388,  1.327504,  1.323865,  1.307894,  1.088234,
-  6.198210,  6.580712,  4.682511,  3.416952,  2.941929,  2.766637,  2.650686,
-  2.315439,  1.925838,  1.659784,  1.464419,  1.252806,  1.162722,  1.197518,
-  1.199875,  1.197365,  1.194040,  0.995797,  5.402507,  5.055466,  3.728724,
-  2.624359,  2.165810,  1.943189,  1.918190,  1.738078,  1.516328,  1.290520,
-  1.155793,  1.015962,  0.881900,  0.807203,  0.754242,  0.743378,  0.740288,
-  0.614158,  3.937867,  3.862507,  2.884664,  2.088147,  1.648496,  1.473584,
-  1.340123,  1.291769,  1.165381,  1.000224,  0.893316,  0.821333,  0.691363,
-  0.610501,  0.586766,  0.583762,  0.577840,  0.468733,  3.104660,  3.181078,
-  2.420208,  1.747442,  1.297956,  1.109835,  0.970385,  0.943229,  0.876923,
-  0.777584,  0.678183,  0.628623,  0.553745,  0.523430,  0.519490,  0.514394,
-  0.492259,  0.403172,  2.593833,  2.533720,  2.010452,  1.480944,  1.060302,
-  0.846383,  0.738703,  0.673144,  0.658010,  0.592449,  0.518236,  0.470335,
-  0.425088,  0.393168,  0.378116,  0.355846,  0.275469,  0.213128,  2.176988,
-  2.089575,  1.671284,  1.225008,  0.895382,  0.672008,  0.566241,  0.496746,
-  0.488005,  0.449874,  0.400899,  0.354002,  0.318150,  0.281533,  0.238545,
-  0.224159,  0.202399,  0.160681,  1.874679,  1.769165,  1.430124,  1.068727,
-  0.780272,  0.557801,  0.441643,  0.377256,  0.352957,  0.338452,  0.304965,
-  0.273172,  0.240052,  0.208724,  0.193431,  0.190845,  0.185025,  0.138166,
-  1.590226,  1.502830,  1.193127,  0.917885,  0.670432,  0.474546,  0.355420,
-  0.292305,  0.259035,  0.249937,  0.232079,  0.208943,  0.181936,  0.160038,
-  0.152257,  0.151235,  0.149583,  0.120747,  1.331730,  1.255907,  1.012871,
-  0.778422,  0.578977,  0.412432,  0.293155,  0.231824,  0.197187,  0.183921,
-  0.174876,  0.157252,  0.140263,  0.127050,  0.110244,  0.105041,  0.104323,
-  0.086944,  1.153994,  1.118771,  0.822355,  0.612321,  0.478249,  0.348222,
-  0.247408,  0.186141,  0.152714,  0.135445,  0.129810,  0.119994,  0.115619,
-  0.131626,  0.095612,  0.079343,  0.077502,  0.064550,  0.946317,  0.925894,
-  0.677969,  0.499906,  0.397101,  0.297931,  0.214467,  0.152333,  0.120731,
-  0.102686,  0.095062,  0.090361,  0.122319,  0.240194,  0.112687,  0.070690,
-  0.070461,  0.054194,  0.824155,  0.787241,  0.581856,  0.419228,  0.313167,
-  0.245582,  0.183500,  0.128101,  0.096577,  0.080267,  0.071022,  0.066851,
-  0.085754,  0.154163,  0.075884,  0.052401,  0.054270,  0.026656,  0.716310,
-  0.671378,  0.489580,  0.349569,  0.256155,  0.206343,  0.157853,  0.111950,
-  0.079271,  0.062518,  0.053441,  0.049660,  0.051400,  0.063778,  0.039993,
-  0.029133,  0.023382,  0.013725,  0.614125,  0.579096,  0.417126,  0.299465,
-  0.217849,  0.165515,  0.129040,  0.093127,  0.065612,  0.049543,  0.041429,
-  0.036850,  0.034416,  0.033989,  0.024216,  0.017377,  0.014833,  0.011987,
-  0.520407,  0.487239,  0.349473,  0.251741,  0.184897,  0.135813,  0.107098,
-  0.073607,  0.053938,  0.040531,  0.032931,  0.028876,  0.025759,  0.022168,
-  0.016739,  0.014638,  0.014333,  0.011947,  0.449954,  0.415124,  0.299452,
-  0.216942,  0.158874,  0.115334,  0.088821,  0.060105,  0.042610,  0.032566,
-  0.026903,  0.023123,  0.019913,  0.016835,  0.014306,  0.013625,  0.013535,
-  0.011284,  0.377618,  0.347773,  0.251741,  0.184839,  0.132857,  0.095439,
-  0.070462,  0.052244,  0.036078,  0.026025,  0.021518,  0.018487,  0.015361,
-  0.012905,  0.011470,  0.010569,  0.010283,  0.008297,  0.319953,  0.297976,
-  0.216942,  0.158842,  0.113280,  0.080426,  0.057367,  0.041987,  0.030135,
-  0.022295,  0.017901,  0.015121,  0.012224,  0.010035,  0.009353,  0.009108,
-  0.008695,  0.006139,  0.267864,  0.250502,  0.184839,  0.132851,  0.095039,
-  0.068220,  0.049135,  0.035315,  0.025144,  0.018237,  0.013857,  0.012094,
-  0.009715,  0.007743,  0.006937,  0.006446,  0.006243,  0.004929,  0.230449,
-  0.215895,  0.158842,  0.113280,  0.080417,  0.057174,  0.041304,  0.029959,
-  0.021866,  0.015673,  0.012133,  0.010083,  0.007801,  0.006053,  0.005401,
-  0.003834,  0.003429,  0.002851,  0.193984,  0.183963,  0.132851,  0.095039,
-  0.068220,  0.049133,  0.035305,  0.025140,  0.018150,  0.013175,  0.010422,
-  0.008491,  0.006397,  0.004567,  0.003494,  0.002933,  0.002825,  0.002355,
-  0.167298,  0.158088,  0.113280,  0.080417,  0.057174,  0.041304,  0.029959,
-  0.021866,  0.015669,  0.011955,  0.009257,  0.007051,  0.005543,  0.003905,
-  0.002984,  0.002825,  0.002814,  0.002347,  0.143228,  0.132220,  0.095039,
-  0.068220,  0.049133,  0.035305,  0.025140,  0.018150,  0.013174,  0.010394,
-  0.008403,  0.006661,  0.005378,  0.003545,  0.002876,  0.002818,  0.002814,
-  0.002347,  0.122934,  0.112735,  0.080417,  0.057174,  0.041304,  0.029959,
-  0.021866,  0.015669,  0.011955,  0.009258,  0.007182,  0.006012,  0.003762,
-  0.002866,  0.002739,  0.002788,  0.002810,  0.002347,  0.101934,  0.094569,
-  0.068220,  0.049133,  0.035305,  0.025140,  0.018150,  0.013174,  0.010394,
-  0.008405,  0.006797,  0.005845,  0.003333,  0.002703,  0.002695,  0.002723,
-  0.002781,  0.002343,  0.086702,  0.080014,  0.057174,  0.041304,  0.029959,
-  0.021866,  0.015669,  0.011955,  0.009258,  0.007190,  0.006533,  0.005839,
-  0.003326,  0.002700,  0.002690,  0.002694,  0.002716,  0.002314,  0.073040,
-  0.067886,  0.049133,  0.035305,  0.025140,  0.018150,  0.013174,  0.010394,
-  0.008405,  0.006807,  0.006468,  0.005831,  0.003325,  0.002700,  0.002690,
-  0.002690,  0.002687,  0.002253,  0.061685,  0.056890,  0.041304,  0.029959,
-  0.021866,  0.015669,  0.011955,  0.009258,  0.007190,  0.006542,  0.006360,
-  0.005416,  0.003221,  0.002698,  0.002690,  0.002690,  0.002683,  0.002238,
-  0.052465,  0.048894,  0.035305,  0.025140,  0.018150,  0.013174,  0.010394,
-  0.008405,  0.006807,  0.006472,  0.005943,  0.003748,  0.002805,  0.002692,
-  0.002690,  0.002690,  0.002683,  0.002238,  0.043838,  0.041101,  0.029959,
-  0.021866,  0.015669,  0.011955,  0.009258,  0.007190,  0.006543,  0.006465,
-  0.005839,  0.003333,  0.002702,  0.002690,  0.002690,  0.002690,  0.002683,
-  0.002238,  0.037824,  0.035133,  0.025140,  0.018150,  0.013174,  0.010394,
-  0.008405,  0.006807,  0.006480,  0.006464,  0.005838,  0.003326,  0.002700,
-  0.002690,  0.002690,  0.002690,  0.002683,  0.002238,  0.031865,  0.029815,
-  0.021866,  0.015668,  0.011955,  0.009258,  0.007190,  0.006543,  0.006475,
-  0.006462,  0.005831,  0.003325,  0.002700,  0.002690,  0.002690,  0.002690,
-  0.002683,  0.002238,  0.027150,  0.025016,  0.018128,  0.013083,  0.010371,
-  0.008405,  0.006807,  0.006480,  0.006472,  0.006359,  0.005416,  0.003221,
-  0.002698,  0.002690,  0.002690,  0.002690,  0.002683,  0.002238,  0.023094,
-  0.021760,  0.015577,  0.011590,  0.009167,  0.007188,  0.006543,  0.006475,
-  0.006466,  0.005943,  0.003748,  0.002805,  0.002692,  0.002690,  0.002690,
-  0.002690,  0.002683,  0.002238,  0.019269,  0.018038,  0.013060,  0.010280,
-  0.008382,  0.006806,  0.006480,  0.006474,  0.006464,  0.005839,  0.003333,
-  0.002702,  0.002690,  0.002690,  0.002690,  0.002690,  0.002683,  0.002238,
-  0.016874,  0.015472,  0.011566,  0.009148,  0.007171,  0.006527,  0.006458,
-  0.006457,  0.006447,  0.005823,  0.003318,  0.002693,  0.002683,  0.002683,
-  0.002683,  0.002683,  0.002676,  0.002232,  0.011968,  0.011056,  0.008762,
-  0.007219,  0.005717,  0.005391,  0.005386,  0.005386,  0.005377,  0.004856,
-  0.002767,  0.002246,  0.002238,  0.002238,  0.002238,  0.002238,  0.002232,
-  0.001862,
+static int sse_norm_curvfit_model_cat_lookup(double sse_norm) {
+  return (sse_norm > 16.0);
+}
+
+// Models distortion by sse using a logistic function on
+// l = log2(sse / q^2) as:
+// dbysse = 16 / (1 + k exp(l + c))
+static double get_dbysse_logistic(double l, double c, double k) {
+  const double A = 16.0;
+  const double dbysse = A / (1 + k * exp(l + c));
+  return dbysse;
+}
+
+// Models rate using a clamped linear function on
+// l = log2(sse / q^2) as:
+// rate = max(0, a + b * l)
+static double get_rate_clamplinear(double l, double a, double b) {
+  const double rate = a + b * l;
+  return (rate < 0 ? 0 : rate);
+}
+
+static const uint8_t bsize_surffit_model_cat_lookup[BLOCK_SIZES_ALL] = {
+  0, 0, 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 0, 0, 2, 2, 4, 4
 };
 
-void av1_model_rd_surffit(double xm, double yl, double *rate_f,
-                          double *dist_f) {
-  const double x_start = -0.5;
-  const double x_end = 16.5;
-  const double x_step = 1;
-  const double y_start = -15.5;
-  const double y_end = 16.5;
-  const double y_step = 0.5;
-  const double epsilon = 1e-6;
-  const int stride = (int)rint((x_end - x_start) / x_step) + 1;
-  (void)y_end;
+static const double surffit_rate_params[9][4] = {
+  {
+      638.390212,
+      2.253108,
+      166.585650,
+      -3.939401,
+  },
+  {
+      5.256905,
+      81.997240,
+      -1.321771,
+      17.694216,
+  },
+  {
+      -74.193045,
+      72.431868,
+      -19.033152,
+      15.407276,
+  },
+  {
+      416.770113,
+      14.794188,
+      167.686830,
+      -6.997756,
+  },
+  {
+      378.511276,
+      9.558376,
+      154.658843,
+      -6.635663,
+  },
+  {
+      277.818787,
+      4.413180,
+      150.317637,
+      -9.893038,
+  },
+  {
+      142.212132,
+      11.542038,
+      94.393964,
+      -5.518517,
+  },
+  {
+      219.100256,
+      4.007421,
+      108.932852,
+      -6.981310,
+  },
+  {
+      222.261971,
+      3.251049,
+      95.972916,
+      -5.609789,
+  },
+};
+
+static const double surffit_dist_params[7] = {
+  1.475844, 4.328362, -5.680233, -0.500994, 0.554585, 4.839478, -0.695837
+};
 
-  xm = AOMMAX(xm, x_start + x_step + epsilon);
-  xm = AOMMIN(xm, x_end - x_step - epsilon);
-  yl = AOMMAX(yl, y_start + y_step + epsilon);
-  yl = AOMMIN(yl, y_end - y_step - epsilon);
+static void rate_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm,
+                                             double *rpar) {
+  const int cat = bsize_surffit_model_cat_lookup[bsize];
+  rpar[0] = surffit_rate_params[cat][0] + surffit_rate_params[cat][1] * xm;
+  rpar[1] = surffit_rate_params[cat][2] + surffit_rate_params[cat][3] * xm;
+}
 
-  const double y = (yl - y_start) / y_step;
-  const double x = (xm - x_start) / x_step;
+static void dist_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm,
+                                             double *dpar) {
+  (void)bsize;
+  const double *params = surffit_dist_params;
+  dpar[0] = params[0] + params[1] / (1 + exp((xm + params[2]) * params[3]));
+  dpar[1] = params[4] + params[5] * exp(params[6] * xm);
+}
 
-  const int yi = (int)floor(y);
-  const int xi = (int)floor(x);
-  assert(xi > 0);
-  assert(yi > 0);
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+                          double yl, double *rate_f, double *distbysse_f) {
+  (void)sse_norm;
+  double rpar[2], dpar[2];
+  rate_surffit_model_params_lookup(bsize, xm, rpar);
+  dist_surffit_model_params_lookup(bsize, xm, dpar);
 
-  const double yo = y - yi;
-  const double xo = x - xi;
-  const double *prate = &interp_rgrid_surf[(yi - 1) * stride + (xi - 1)];
-  const double *pdist = &interp_dgrid_surf[(yi - 1) * stride + (xi - 1)];
-  *rate_f = interp_bicubic(prate, stride, xo, yo);
-  *dist_f = interp_bicubic(pdist, stride, xo, yo);
+  *rate_f = get_rate_clamplinear(yl, rpar[0], rpar[1]);
+  *distbysse_f = get_dbysse_logistic(yl, dpar[0], dpar[1]);
 }
 
-static const double interp_rgrid_curv[65] = {
-  0.000000,    0.000000,    0.000000,    0.000000,    0.000000,     0.000000,
-  0.000000,    0.000000,    0.000000,    0.000000,    0.000000,     0.000000,
-  0.000000,    0.000000,    0.000000,    0.000000,    0.000000,     4.759876,
-  8.132086,    13.651828,   21.908271,   33.522054,   48.782376,    71.530983,
-  106.728649,  151.942795,  199.893011,  242.850965,  283.933923,   322.154203,
-  360.684608,  394.801656,  426.879017,  460.234313,  484.103987,   508.261495,
-  536.486763,  558.196737,  586.285894,  614.764511,  634.166333,   647.706472,
-  658.211478,  681.360407,  701.052141,  727.007310,  768.663973,   804.407660,
-  884.627751,  1065.658131, 1238.875214, 1440.185176, 1678.377931,  1962.243390,
-  2300.571467, 2702.152072, 3175.775119, 3730.230519, 4374.308184,  5116.798028,
-  5966.489961, 6932.173897, 8022.639747, 9246.677424, 10613.076839,
+static const double interp_rgrid_curv[4][65] = {
+  {
+      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    23.801499,   28.387688,   33.388795,   42.298282,
+      41.525408,   51.597692,   49.566271,   54.632979,   60.321507,
+      67.730678,   75.766165,   85.324032,   96.600012,   120.839562,
+      173.917577,  255.974908,  354.107573,  458.063476,  562.345966,
+      668.568424,  772.072881,  878.598490,  982.202274,  1082.708946,
+      1188.037853, 1287.702240, 1395.588773, 1490.825830, 1584.231230,
+      1691.386090, 1766.822555, 1869.630904, 1926.743565, 2002.949495,
+      2047.431137, 2138.486068, 2154.743767, 2209.242472, 2277.593051,
+      2290.996432, 2307.452938, 2343.567091, 2397.654644, 2469.425868,
+      2558.591037, 2664.860422, 2787.944296, 2927.552932, 3083.396602,
+      3255.185579, 3442.630134, 3645.440541, 3863.327072, 4096.000000,
+  },
+  {
+      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    8.998436,    9.439592,    9.731837,    10.865931,
+      11.561347,   12.578139,   14.205101,   16.770584,   19.094853,
+      21.330863,   23.298907,   26.901921,   34.501017,   57.891733,
+      112.234763,  194.853189,  288.302032,  380.499422,  472.625309,
+      560.226809,  647.928463,  734.155122,  817.489721,  906.265783,
+      999.260562,  1094.489206, 1197.062998, 1293.296825, 1378.926484,
+      1472.760990, 1552.663779, 1635.196884, 1692.451951, 1759.741063,
+      1822.162720, 1916.515921, 1966.686071, 2031.647506, 2033.700134,
+      2087.847688, 2161.688858, 2242.536028, 2334.023491, 2436.337802,
+      2549.665519, 2674.193198, 2810.107395, 2957.594666, 3116.841567,
+      3288.034655, 3471.360486, 3667.005616, 3875.156602, 4096.000000,
+  },
+  {
+      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    2.377584,    2.557185,    2.732445,    2.851114,
+      3.281800,    3.765589,    4.342578,    5.145582,    5.611038,
+      6.642238,    7.945977,    11.800522,   17.346624,   37.501413,
+      87.216800,   165.860942,  253.865564,  332.039345,  408.518863,
+      478.120452,  547.268590,  616.067676,  680.022540,  753.863541,
+      834.529973,  919.489191,  1008.264989, 1092.230318, 1173.971886,
+      1249.514122, 1330.510941, 1399.523249, 1466.923387, 1530.533471,
+      1586.515722, 1695.197774, 1746.648696, 1837.136959, 1909.075485,
+      1975.074651, 2060.159200, 2155.335095, 2259.762505, 2373.710437,
+      2497.447898, 2631.243895, 2775.367434, 2930.087523, 3095.673170,
+      3272.393380, 3460.517161, 3660.313520, 3872.051464, 4096.000000,
+  },
+  {
+      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    0.296997,    0.342545,    0.403097,    0.472889,
+      0.614483,    0.842937,    1.050824,    1.326663,    1.717750,
+      2.530591,    3.582302,    6.995373,    9.973335,    24.042464,
+      56.598240,   113.680735,  180.018689,  231.050567,  266.101082,
+      294.957934,  323.326511,  349.434429,  380.443211,  408.171987,
+      441.214916,  475.716772,  512.900000,  551.186939,  592.364455,
+      624.527378,  661.940693,  679.185473,  724.800679,  764.781792,
+      873.050019,  950.299001,  939.292954,  1052.406153, 1033.893184,
+      1112.182406, 1219.174326, 1337.296681, 1471.648357, 1622.492809,
+      1790.093491, 1974.713858, 2176.617364, 2396.067465, 2633.327614,
+      2888.661266, 3162.331876, 3454.602899, 3765.737789, 4096.000000,
+  },
 };
 
-static const double interp_dgrid_curv[65] = {
-  14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855,
-  14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.555776, 14.533692,
-  14.439920, 14.257791, 13.977230, 13.623229, 13.064884, 12.355411, 11.560773,
-  10.728960, 9.861975,  8.643612,  6.916021,  5.154769,  3.734940,  2.680051,
-  1.925506,  1.408410,  1.042223,  0.767641,  0.565392,  0.420116,  0.310427,
-  0.231711,  0.172999,  0.128293,  0.094992,  0.072171,  0.052972,  0.039354,
-  0.029555,  0.022857,  0.016832,  0.013297,  0.000000,  0.000000,  0.000000,
-  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,
-  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,
-  0.000000,  0.000000,
+static const double interp_dgrid_curv[2][65] = {
+  {
+      16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770,
+      15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870,
+      15.525918, 15.483820, 15.373330, 15.126844, 14.637442, 14.184387,
+      13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790,
+      7.487633,  5.688649,  4.267515,  3.196300,  2.434201,  1.834064,
+      1.369920,  1.035921,  0.775279,  0.574895,  0.427232,  0.314123,
+      0.233236,  0.171440,  0.128188,  0.092762,  0.067569,  0.049324,
+      0.036330,  0.027008,  0.019853,  0.015539,  0.011093,  0.008733,
+      0.007624,  0.008105,  0.005427,  0.004065,  0.003427,  0.002848,
+      0.002328,  0.001865,  0.001457,  0.001103,  0.000801,  0.000550,
+      0.000348,  0.000193,  0.000085,  0.000021,  0.000000,
+  },
+  {
+      16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501,
+      15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967,
+      15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212,
+      13.073692, 12.222005, 11.237799, 9.985848,  8.898823,  7.423519,
+      5.995325,  4.773152,  3.744032,  2.938217,  2.294526,  1.762412,
+      1.327145,  1.020728,  0.765535,  0.570548,  0.425833,  0.313825,
+      0.232959,  0.171324,  0.128174,  0.092750,  0.067558,  0.049319,
+      0.036330,  0.027008,  0.019853,  0.015539,  0.011093,  0.008733,
+      0.007624,  0.008105,  0.005427,  0.004065,  0.003427,  0.002848,
+      0.002328,  0.001865,  0.001457,  0.001103,  0.000801,  0.000550,
+      0.000348,  0.000193,  0.000085,  0.000021,  -0.000000,
+  },
 };
 
-void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) {
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
+                          double *rate_f, double *distbysse_f) {
   const double x_start = -15.5;
   const double x_end = 16.5;
   const double x_step = 0.5;
   const double epsilon = 1e-6;
+  const int rcat = bsize_curvfit_model_cat_lookup[bsize];
+  const int dcat = sse_norm_curvfit_model_cat_lookup(sse_norm);
   (void)x_end;
 
   xqr = AOMMAX(xqr, x_start + x_step + epsilon);
@@ -1138,9 +935,9 @@ void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) {
 
   assert(xi > 0);
 
-  const double *prate = &interp_rgrid_curv[(xi - 1)];
-  const double *pdist = &interp_dgrid_curv[(xi - 1)];
+  const double *prate = &interp_rgrid_curv[rcat][(xi - 1)];
   *rate_f = interp_cubic(prate, xo);
+  const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)];
   *distbysse_f = interp_cubic(pdist, xo);
 }
 
@@ -1257,13 +1054,12 @@ int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
 
 YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
                                              int ref_frame) {
-  const AV1_COMMON *const cm = &cpi->common;
   assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
-  const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
-  const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
-  return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX)
-             ? &cm->buffer_pool->frame_bufs[scaled_idx].buf
-             : NULL;
+  RefCntBuffer *const scaled_buf = cpi->scaled_ref_buf[ref_frame - 1];
+  const RefCntBuffer *const ref_buf =
+      get_ref_frame_buf(&cpi->common, ref_frame);
+  return (scaled_buf != ref_buf && scaled_buf != NULL) ? &scaled_buf->buf
+                                                       : NULL;
 }
 
 int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
@@ -1304,7 +1100,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   } else {
     rd->thresh_mult[THR_NEARESTMV] = 0;
     rd->thresh_mult[THR_NEARESTL2] = 0;
-    rd->thresh_mult[THR_NEARESTL3] = 0;
+    rd->thresh_mult[THR_NEARESTL3] = 100;
     rd->thresh_mult[THR_NEARESTB] = 0;
     rd->thresh_mult[THR_NEARESTA2] = 0;
     rd->thresh_mult[THR_NEARESTA] = 0;
@@ -1315,7 +1111,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_NEWL2] += 1000;
   rd->thresh_mult[THR_NEWL3] += 1000;
   rd->thresh_mult[THR_NEWB] += 1000;
-  rd->thresh_mult[THR_NEWA2] = 1000;
+  rd->thresh_mult[THR_NEWA2] = 1100;
   rd->thresh_mult[THR_NEWA] += 1000;
   rd->thresh_mult[THR_NEWG] += 1000;
 
@@ -1327,18 +1123,18 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_NEARA] += 1000;
   rd->thresh_mult[THR_NEARG] += 1000;
 
-  rd->thresh_mult[THR_GLOBALMV] += 2000;
+  rd->thresh_mult[THR_GLOBALMV] += 2200;
   rd->thresh_mult[THR_GLOBALL2] += 2000;
   rd->thresh_mult[THR_GLOBALL3] += 2000;
-  rd->thresh_mult[THR_GLOBALB] += 2000;
+  rd->thresh_mult[THR_GLOBALB] += 2400;
   rd->thresh_mult[THR_GLOBALA2] = 2000;
   rd->thresh_mult[THR_GLOBALG] += 2000;
-  rd->thresh_mult[THR_GLOBALA] += 2000;
+  rd->thresh_mult[THR_GLOBALA] += 2400;
 
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1100;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 800;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 900;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
@@ -1356,17 +1152,17 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
   rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1530;
+  rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1870;
+  rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2750;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500;
   rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1870;
   rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 1800;
   rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] += 2500;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200;
@@ -1375,23 +1171,23 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 3000;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1320;
   rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500;
   rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700;
+  rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 2040;
   rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2250;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
   rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1360;
   rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2500;
+  rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2250;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500;
@@ -1404,7 +1200,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500;
   rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1870;
   rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000;
   rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] += 2500;
@@ -1418,7 +1214,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] += 2500;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARLA2] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1500;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1800;
   rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] += 1500;
   rd->thresh_mult[THR_COMP_NEAR_NEWLA2] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARLA2] += 1700;
@@ -1433,7 +1229,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEW_NEWL2A2] += 2000;
   rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] += 2500;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1440;
   rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] += 1500;
   rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] += 1500;
   rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] += 1700;
@@ -1447,29 +1243,29 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWGA2] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARGA2] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2750;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600;
   rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000;
   rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2200;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2640;
   rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 2200;
   rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2400;
   rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] += 3200;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1600;
   rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 2000;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 1800;
   rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 2200;
   rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 2200;
   rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2400;
   rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] += 3200;
 
-  rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1600;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2000;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1760;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2400;
   rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 2000;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 2200;
-  rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2200;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 1760;
+  rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2640;
   rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2400;
   rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] += 3200;
 
@@ -1477,34 +1273,25 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 2000;
   rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 2000;
   rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 2200;
-  rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200;
-  rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400;
+  rd->thresh_mult[THR_COMP_NEW_NEARBA] += 1980;
+  rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2640;
   rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200;
 
   rd->thresh_mult[THR_DC] += 1000;
   rd->thresh_mult[THR_PAETH] += 1000;
-  rd->thresh_mult[THR_SMOOTH] += 2000;
+  rd->thresh_mult[THR_SMOOTH] += 2200;
   rd->thresh_mult[THR_SMOOTH_V] += 2000;
   rd->thresh_mult[THR_SMOOTH_H] += 2000;
   rd->thresh_mult[THR_H_PRED] += 2000;
-  rd->thresh_mult[THR_V_PRED] += 2000;
+  rd->thresh_mult[THR_V_PRED] += 1800;
   rd->thresh_mult[THR_D135_PRED] += 2500;
-  rd->thresh_mult[THR_D203_PRED] += 2500;
+  rd->thresh_mult[THR_D203_PRED] += 2000;
   rd->thresh_mult[THR_D157_PRED] += 2500;
-  rd->thresh_mult[THR_D67_PRED] += 2500;
+  rd->thresh_mult[THR_D67_PRED] += 2000;
   rd->thresh_mult[THR_D113_PRED] += 2500;
   rd->thresh_mult[THR_D45_PRED] += 2500;
 }
 
-void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
-  static const int thresh_mult[MAX_REFS] = { 2500, 2500, 2500, 2500, 2500,
-                                             2500, 2500, 4500, 4500, 4500,
-                                             4500, 4500, 4500, 4500, 4500,
-                                             4500, 4500, 4500, 4500, 2500 };
-  RD_OPT *const rd = &cpi->rd;
-  memcpy(rd->thresh_mult_sub8x8, thresh_mult, sizeof(thresh_mult));
-}
-
 void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
                                int (*factor_buf)[MAX_MODES], int rd_thresh,
                                int bsize, int best_mode_index) {
diff --git a/libaom/av1/encoder/rd.h b/libaom/av1/encoder/rd.h
index 2e2a30d..ff46083 100644
--- a/libaom/av1/encoder/rd.h
+++ b/libaom/av1/encoder/rd.h
@@ -48,7 +48,7 @@ extern "C" {
 
 // This enumerator type needs to be kept aligned with the mode order in
 // const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code.
-typedef enum {
+enum {
   THR_NEARESTMV,
   THR_NEARESTL2,
   THR_NEARESTL3,
@@ -246,9 +246,9 @@ typedef enum {
   MAX_SINGLE_REF_MODES = LAST_SINGLE_REF_MODES + 1,
   LAST_COMP_REF_MODES = THR_COMP_GLOBAL_GLOBALBA,
   MAX_COMP_REF_MODES = LAST_COMP_REF_MODES + 1
-} THR_MODES;
+} UENUM1BYTE(THR_MODES);
 
-typedef enum {
+enum {
   THR_LAST,
   THR_LAST2,
   THR_LAST3,
@@ -275,7 +275,7 @@ typedef enum {
   THR_INTRA,
 
   MAX_REFS
-} THR_MODES_SUB8X8;
+} UENUM1BYTE(THR_MODES_SUB8X8);
 
 typedef struct RD_OPT {
   // Thresh_mult is used to set a threshold for the rd score. A higher value
@@ -283,7 +283,6 @@ typedef struct RD_OPT {
   // is used in combination with the current block size, and thresh_freq_fact
   // to pick a threshold.
   int thresh_mult[MAX_MODES];
-  int thresh_mult_sub8x8[MAX_REFS];
 
   int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES];
 
@@ -319,25 +318,6 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
     }
   }
 #endif
-#if CONFIG_ONE_PASS_SVM
-  rd_stats->eob = 0;
-  rd_stats->eob_0 = 0;
-  rd_stats->eob_1 = 0;
-  rd_stats->eob_2 = 0;
-  rd_stats->eob_3 = 0;
-
-  rd_stats->rd = 0;
-  rd_stats->rd_0 = 0;
-  rd_stats->rd_1 = 0;
-  rd_stats->rd_2 = 0;
-  rd_stats->rd_3 = 0;
-
-  rd_stats->y_sse = 0;
-  rd_stats->sse_0 = 0;
-  rd_stats->sse_1 = 0;
-  rd_stats->sse_2 = 0;
-  rd_stats->sse_3 = 0;
-#endif
 }
 
 static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
@@ -365,30 +345,6 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
     }
   }
 #endif
-#if CONFIG_ONE_PASS_SVM
-  // TODO(chiyotsai@google.com): Change invalid values to INT_MAX and
-  // INT64_MAX. Currently there are some code paths where rd_stats's properties
-  // are set directly without calling av1_init_rd_stats, so changing it now will
-  // break this speed feature. Need to hunt down all places where rd_stats is
-  // used without initialized.
-  rd_stats->eob = 0;
-  rd_stats->eob_0 = 0;
-  rd_stats->eob_1 = 0;
-  rd_stats->eob_2 = 0;
-  rd_stats->eob_3 = 0;
-
-  rd_stats->rd = 0;
-  rd_stats->rd_0 = 0;
-  rd_stats->rd_1 = 0;
-  rd_stats->rd_2 = 0;
-  rd_stats->rd_3 = 0;
-
-  rd_stats->y_sse = 0;
-  rd_stats->sse_0 = 0;
-  rd_stats->sse_1 = 0;
-  rd_stats->sse_2 = 0;
-  rd_stats->sse_3 = 0;
-#endif
 }
 
 static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
@@ -422,222 +378,8 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
     }
   }
 #endif
-#if CONFIG_ONE_PASS_SVM
-  rd_stats_dst->eob += rd_stats_src->eob;
-  rd_stats_dst->eob_0 += rd_stats_src->eob_0;
-  rd_stats_dst->eob_1 += rd_stats_src->eob_1;
-  rd_stats_dst->eob_2 += rd_stats_src->eob_2;
-  rd_stats_dst->eob_3 += rd_stats_src->eob_3;
-
-  rd_stats_dst->rd += rd_stats_src->rd;
-  rd_stats_dst->rd_0 += rd_stats_src->rd_0;
-  rd_stats_dst->rd_1 += rd_stats_src->rd_1;
-  rd_stats_dst->rd_2 += rd_stats_src->rd_2;
-  rd_stats_dst->rd_3 += rd_stats_src->rd_3;
-
-  rd_stats_dst->y_sse += rd_stats_src->y_sse;
-  rd_stats_dst->sse_0 += rd_stats_src->sse_0;
-  rd_stats_dst->sse_1 += rd_stats_src->sse_1;
-  rd_stats_dst->sse_2 += rd_stats_src->sse_2;
-  rd_stats_dst->sse_3 += rd_stats_src->sse_3;
-#endif
-}
-
-#if CONFIG_ONE_PASS_SVM
-static INLINE void av1_add_reg_stat(RD_STATS *rd_stats, int eob, int64_t rd,
-                                    int64_t sse, int blk_row, int blk_col,
-                                    BLOCK_SIZE bsize, BLOCK_SIZE crop_bsize) {
-  // NOTE: Currently the calculation of regional features works by assuming
-  // bsize is square so that each transform block of size crop_bsize either
-  // 1. locates completely within a quadrant or
-  // 2. is exactly half of bsize or
-  // 3. is the entire prediction block
-  // Size of TX block and SB
-  const int block_width_mi = mi_size_wide[bsize];
-  const int block_height_mi = mi_size_high[bsize];
-  const int crop_width_mi = mi_size_wide[crop_bsize];
-  const int crop_height_mi = mi_size_high[crop_bsize];
-
-  // Increment the eob proportionally to how much the tx_block overlaps with
-  // each quadrant. We will scale it by MAX_MIB_SIZE * MAX_MIB_SIZE to avoid
-  // being truncated.
-  const int max_scaling_factor = MAX_MIB_SIZE * MAX_MIB_SIZE;
-
-  // Update the stats
-  rd_stats->eob = eob;
-  rd_stats->rd = rd;
-  rd_stats->y_sse = sse;
-
-  if (crop_width_mi <= block_width_mi / 2 &&
-      crop_height_mi <= block_width_mi / 2) {
-    // The transform block lies completely in a quadrant.
-    const int scaling_factor = max_scaling_factor;
-    const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor,
-              r_sse = sse * scaling_factor;
-
-    if (blk_row < block_height_mi / 2 && blk_col < block_width_mi / 2) {
-      rd_stats->eob_0 = r_eob;
-      rd_stats->rd_0 = r_rd;
-      rd_stats->sse_0 = r_sse;
-    } else if (blk_row < block_height_mi / 2 && blk_col >= block_width_mi / 2) {
-      rd_stats->eob_1 = r_eob;
-      rd_stats->rd_1 = r_rd;
-      rd_stats->sse_1 = r_sse;
-    } else if (blk_row >= block_height_mi / 2 && blk_col < block_width_mi / 2) {
-      rd_stats->eob_2 = r_eob;
-      rd_stats->rd_2 = r_rd;
-      rd_stats->sse_2 = r_sse;
-    } else {
-      rd_stats->eob_3 = r_eob;
-      rd_stats->rd_3 = r_rd;
-      rd_stats->sse_3 = r_sse;
-    }
-  } else if (crop_height_mi == block_height_mi &&
-             crop_width_mi == block_width_mi) {
-    // The transform block is the whole prediction block
-    const int scaling_factor = max_scaling_factor;
-    const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor,
-              r_sse = sse * scaling_factor;
-
-    rd_stats->eob_0 = r_eob;
-    rd_stats->rd_0 = r_rd;
-    rd_stats->sse_0 = r_sse;
-
-    rd_stats->eob_1 = r_eob;
-    rd_stats->rd_1 = r_rd;
-    rd_stats->sse_1 = r_sse;
-
-    rd_stats->eob_2 = r_eob;
-    rd_stats->rd_2 = r_rd;
-    rd_stats->sse_2 = r_sse;
-
-    rd_stats->eob_3 = r_eob;
-    rd_stats->rd_3 = r_rd;
-    rd_stats->sse_3 = r_sse;
-  } else if (crop_height_mi == block_height_mi) {
-    // The tranform block is a vertical block
-    const int scaling_factor = max_scaling_factor / 2;
-    const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor,
-              r_sse = sse * scaling_factor;
-
-    if (blk_col < block_width_mi / 2) {
-      rd_stats->eob_0 = r_eob;
-      rd_stats->rd_0 = r_rd;
-      rd_stats->sse_0 = r_sse;
-
-      rd_stats->eob_2 = r_eob;
-      rd_stats->rd_2 = r_rd;
-      rd_stats->sse_2 = r_sse;
-    } else {
-      rd_stats->eob_1 = r_eob;
-      rd_stats->rd_1 = r_rd;
-      rd_stats->sse_1 = r_sse;
-
-      rd_stats->eob_3 = r_eob;
-      rd_stats->rd_3 = r_rd;
-      rd_stats->sse_3 = r_sse;
-    }
-  } else if (crop_width_mi == block_width_mi) {
-    // The tranform block is a horizontal block half the size of predition block
-    const int scaling_factor = max_scaling_factor / 2;
-    const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor,
-              r_sse = sse * scaling_factor;
-
-    if (blk_row < block_height_mi / 2) {
-      rd_stats->eob_0 = r_eob;
-      rd_stats->rd_0 = r_rd;
-      rd_stats->sse_0 = r_sse;
-
-      rd_stats->eob_1 = r_eob;
-      rd_stats->rd_1 = r_rd;
-      rd_stats->sse_1 = r_sse;
-    } else {
-      rd_stats->eob_2 = r_eob;
-      rd_stats->rd_2 = r_rd;
-      rd_stats->sse_2 = r_sse;
-
-      rd_stats->eob_3 = r_eob;
-      rd_stats->rd_3 = r_rd;
-      rd_stats->sse_3 = r_sse;
-    }
-  } else {
-    assert(0 && "Unexpected transform size");
-  }
 }
 
-static INLINE void av1_reg_stat_skipmode_update(RD_STATS *rd_stats,
-                                                int rdmult) {
-  // Update the stats
-  rd_stats->eob = 0;
-  rd_stats->eob_0 = 0;
-  rd_stats->eob_1 = 0;
-  rd_stats->eob_2 = 0;
-  rd_stats->eob_3 = 0;
-
-  rd_stats->rd = RDCOST(rdmult, 0, rd_stats->sse);
-  rd_stats->rd_0 = RDCOST(rdmult, 0, rd_stats->sse_0);
-  rd_stats->rd_1 = RDCOST(rdmult, 0, rd_stats->sse_1);
-  rd_stats->rd_2 = RDCOST(rdmult, 0, rd_stats->sse_2);
-  rd_stats->rd_3 = RDCOST(rdmult, 0, rd_stats->sse_3);
-}
-
-static INLINE void av1_copy_reg_stat(RD_STATS *rd_stats_dst,
-                                     RD_STATS *rd_stats_src) {
-  rd_stats_dst->eob = rd_stats_src->eob;
-  rd_stats_dst->eob_0 = rd_stats_src->eob_0;
-  rd_stats_dst->eob_1 = rd_stats_src->eob_1;
-  rd_stats_dst->eob_2 = rd_stats_src->eob_2;
-  rd_stats_dst->eob_3 = rd_stats_src->eob_3;
-
-  rd_stats_dst->rd = rd_stats_src->rd;
-  rd_stats_dst->rd_0 = rd_stats_src->rd_0;
-  rd_stats_dst->rd_1 = rd_stats_src->rd_1;
-  rd_stats_dst->rd_2 = rd_stats_src->rd_2;
-  rd_stats_dst->rd_3 = rd_stats_src->rd_3;
-
-  rd_stats_dst->y_sse = rd_stats_src->y_sse;
-  rd_stats_dst->sse_0 = rd_stats_src->sse_0;
-  rd_stats_dst->sse_1 = rd_stats_src->sse_1;
-  rd_stats_dst->sse_2 = rd_stats_src->sse_2;
-  rd_stats_dst->sse_3 = rd_stats_src->sse_3;
-}
-
-static INLINE void av1_unpack_reg_stat(RD_STATS *rd_stats, int *eob, int *eob_0,
-                                       int *eob_1, int *eob_2, int *eob_3,
-                                       int64_t *rd, int64_t *rd_0,
-                                       int64_t *rd_1, int64_t *rd_2,
-                                       int64_t *rd_3) {
-  *rd = rd_stats->rd;
-  *rd_0 = rd_stats->rd_0;
-  *rd_1 = rd_stats->rd_1;
-  *rd_2 = rd_stats->rd_2;
-  *rd_3 = rd_stats->rd_3;
-
-  *eob = rd_stats->eob;
-  *eob_0 = rd_stats->eob_0;
-  *eob_1 = rd_stats->eob_1;
-  *eob_2 = rd_stats->eob_2;
-  *eob_3 = rd_stats->eob_3;
-}
-
-static INLINE void av1_set_reg_stat(RD_STATS *rd_stats, int eob, int eob_0,
-                                    int eob_1, int eob_2, int eob_3, int64_t rd,
-                                    int64_t rd_0, int64_t rd_1, int64_t rd_2,
-                                    int64_t rd_3) {
-  rd_stats->rd = rd;
-  rd_stats->rd_0 = rd_0;
-  rd_stats->rd_1 = rd_1;
-  rd_stats->rd_2 = rd_2;
-  rd_stats->rd_3 = rd_3;
-
-  rd_stats->eob = eob;
-  rd_stats->eob_0 = eob_0;
-  rd_stats->eob_1 = eob_1;
-  rd_stats->eob_2 = eob_2;
-  rd_stats->eob_3 = eob_3;
-}
-#endif
-
 struct TileInfo;
 struct TileDataEnc;
 struct AV1_COMP;
@@ -657,9 +399,10 @@ void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
 void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
                                   unsigned int qstep, int *rate, int64_t *dist);
 
-void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f);
-void av1_model_rd_surffit(double xm, double yl, double *rate_f,
-                          double *distbysse_f);
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
+                          double *rate_f, double *distbysse_f);
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+                          double yl, double *rate_f, double *distbysse_f);
 
 int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
                             const MACROBLOCKD *xd);
@@ -684,8 +427,6 @@ void av1_get_entropy_contexts(BLOCK_SIZE bsize,
 
 void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
 
-void av1_set_rd_speed_thresholds_sub8x8(struct AV1_COMP *cpi);
-
 void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
                                int (*fact)[MAX_MODES], int rd_thresh, int bsize,
                                int best_mode_index);
diff --git a/libaom/av1/encoder/rdopt.c b/libaom/av1/encoder/rdopt.c
index b393e6f..5e6054e 100644
--- a/libaom/av1/encoder/rdopt.c
+++ b/libaom/av1/encoder/rdopt.c
@@ -125,14 +125,14 @@ static void model_rd_with_surffit(const AV1_COMP *const cpi,
                                   int64_t sse, int num_samples, int *rate,
                                   int64_t *dist);
 
-typedef enum {
+enum {
   MODELRD_LEGACY,
   MODELRD_CURVFIT,
   MODELRD_SUFFIT,
   MODELRD_DNN,
   MODELRD_FULLRDY,
   MODELRD_TYPES
-} ModelRdType;
+} UENUM1BYTE(ModelRdType);
 
 static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
   model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit,
@@ -150,11 +150,12 @@ static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
 // 3: DNN regression model
 // 4: Full rd model
 #define MODELRD_TYPE_INTERP_FILTER 1
-#define MODELRD_TYPE_TX_SEARCH_PRUNE 2
+#define MODELRD_TYPE_TX_SEARCH_PRUNE 1
 #define MODELRD_TYPE_MASKED_COMPOUND 1
 #define MODELRD_TYPE_INTERINTRA 1
 #define MODELRD_TYPE_INTRA 1
-#define MODELRD_TYPE_JNT_COMPOUND 1
+#define MODELRD_TYPE_DIST_WTD_COMPOUND 1
+#define MODELRD_TYPE_MOTION_MODE_RD 1
 
 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
 static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
@@ -163,10 +164,6 @@ static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
   0x00000002, 0x00010002, 0x00020002,  // y = 2
 };
 
-#define SECOND_REF_FRAME_MASK                                         \
-  ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \
-   (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01)
-
 static const double ADST_FLIP_SVM[8] = {
   /* vertical */
   -6.6623, -2.8062, -3.2531, 3.1671,
@@ -179,26 +176,12 @@ typedef struct {
   MV_REFERENCE_FRAME ref_frame[2];
 } MODE_DEFINITION;
 
-typedef struct {
-  MV_REFERENCE_FRAME ref_frame[2];
-} REF_DEFINITION;
-
-typedef enum {
+enum {
   FTXS_NONE = 0,
   FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0,
   FTXS_DISABLE_TRELLIS_OPT = 1 << 1,
   FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
-} FAST_TX_SEARCH_MODE;
-
-static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
-                               RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
-                               int mi_col, int64_t ref_best_rd);
-
-static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
-                            RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                            int64_t non_skip_ref_best_rd,
-                            int64_t skip_ref_best_rd,
-                            FAST_TX_SEARCH_MODE ftxs_mode);
+} UENUM1BYTE(FAST_TX_SEARCH_MODE);
 
 struct rdcost_block_args {
   const AV1_COMP *cpi;
@@ -212,6 +195,7 @@ struct rdcost_block_args {
   int incomplete_exit;
   int use_fast_coef_costing;
   FAST_TX_SEARCH_MODE ftxs_mode;
+  int skip_trellis;
 };
 
 #define LAST_NEW_MV_INDEX 6
@@ -749,12 +733,12 @@ typedef struct InterModeSearchState {
   MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
 } InterModeSearchState;
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
 static int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
-  if (bsize == BLOCK_8X8) return 1;
-  if (bsize == BLOCK_16X16) return 2;
-  if (bsize == BLOCK_32X32) return 3;
-  return -1;
+  if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+      bsize == BLOCK_4X16 || bsize == BLOCK_16X4) {
+    return -1;
+  }
+  return 1;
 }
 
 void av1_inter_mode_data_init(TileDataEnc *tile_data) {
@@ -770,37 +754,41 @@ void av1_inter_mode_data_init(TileDataEnc *tile_data) {
   }
 }
 
-static int get_est_rate_dist(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
                              int64_t sse, int *est_residue_cost,
                              int64_t *est_dist) {
   aom_clear_system_state();
   const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
   if (md->ready) {
-    const double est_ld = md->a * sse + md->b;
     if (sse < md->dist_mean) {
       *est_residue_cost = 0;
       *est_dist = sse;
     } else {
-      *est_residue_cost = (int)round((sse - md->dist_mean) / est_ld);
       *est_dist = (int64_t)round(md->dist_mean);
+      const double est_ld = md->a * sse + md->b;
+      // Clamp estimated rate cost by INT_MAX / 2.
+      // TODO(angiebird@google.com): find better solution than clamping.
+      if (fabs(est_ld) < 1e-2) {
+        *est_residue_cost = INT_MAX / 2;
+      } else {
+        double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld);
+        if (est_residue_cost_dbl < 0) {
+          *est_residue_cost = 0;
+        } else {
+          *est_residue_cost =
+              (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2);
+        }
+      }
+      if (*est_residue_cost <= 0) {
+        *est_residue_cost = 0;
+        *est_dist = sse;
+      }
     }
     return 1;
   }
   return 0;
 }
 
-static int64_t get_est_rd(TileDataEnc *tile_data, BLOCK_SIZE bsize, int rdmult,
-                          int64_t sse, int curr_cost) {
-  int est_residue_cost;
-  int64_t est_dist;
-  if (get_est_rate_dist(tile_data, bsize, sse, &est_residue_cost, &est_dist)) {
-    int rate = est_residue_cost + curr_cost;
-    int64_t est_rd = RDCOST(rdmult, rate, est_dist);
-    return est_rd;
-  }
-  return 0;
-}
-
 void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
   aom_clear_system_state();
   for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
@@ -865,20 +853,31 @@ static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize,
     rd_model->dist_sum += dist;
     rd_model->ld_sum += ld;
     rd_model->sse_sum += sse;
-    rd_model->sse_sse_sum += sse * sse;
+    rd_model->sse_sse_sum += (double)sse * (double)sse;
     rd_model->sse_ld_sum += sse * ld;
   }
 }
 
 static void inter_modes_info_push(InterModesInfo *inter_modes_info,
-                                  int mode_rate, int64_t sse, int64_t est_rd,
+                                  int mode_rate, int64_t sse, int64_t rd,
+                                  bool true_rd, uint8_t *blk_skip,
+                                  RD_STATS *rd_cost, RD_STATS *rd_cost_y,
+                                  RD_STATS *rd_cost_uv,
                                   const MB_MODE_INFO *mbmi) {
   const int num = inter_modes_info->num;
   assert(num < MAX_INTER_MODES);
   inter_modes_info->mbmi_arr[num] = *mbmi;
   inter_modes_info->mode_rate_arr[num] = mode_rate;
   inter_modes_info->sse_arr[num] = sse;
-  inter_modes_info->est_rd_arr[num] = est_rd;
+  inter_modes_info->est_rd_arr[num] = rd;
+  inter_modes_info->true_rd_arr[num] = true_rd;
+  if (blk_skip != NULL) {
+    memcpy(inter_modes_info->blk_skip_arr[num], blk_skip,
+           sizeof(blk_skip[0]) * MAX_MIB_SIZE * MAX_MIB_SIZE);
+  }
+  inter_modes_info->rd_cost_arr[num] = *rd_cost;
+  inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y;
+  inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv;
   ++inter_modes_info->num;
 }
 
@@ -904,7 +903,6 @@ static void inter_modes_info_sort(const InterModesInfo *inter_modes_info,
   qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]),
         compare_rd_idx_pair);
 }
-#endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
 
 static INLINE int write_uniform_cost(int n, int v) {
   const int l = get_unsigned_bits(n);
@@ -961,7 +959,7 @@ static unsigned pixel_dist_visible_only(
   }
   const MACROBLOCKD *xd = &x->e_mbd;
 
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
                                              visible_cols, visible_rows);
     return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
@@ -1217,7 +1215,7 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
 
   if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
       x->tune_metric == AOM_TUNE_DAALA_DIST) {
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
       for (j = 0; j < bsh; j++)
         for (i = 0; i < bsw; i++)
           orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
@@ -1281,8 +1279,7 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
                                  bsw, coeff_shift);
       }
     }
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      d = ((uint64_t)d) >> 2 * coeff_shift;
+    if (is_cur_buf_hbd(xd)) d = ((uint64_t)d) >> 2 * coeff_shift;
   } else {
     // Otherwise, MSE by default
     d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
@@ -1310,7 +1307,7 @@ static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
 
   if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
       x->tune_metric == AOM_TUNE_DAALA_DIST) {
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
       for (j = 0; j < bsh; j++)
         for (i = 0; i < bsw; i++)
           orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
@@ -1727,16 +1724,19 @@ void av1_get_horver_correlation_full_c(const int16_t *diff, int stride,
 static void score_2D_transform_pow8(float *scores_2D, float shift) {
   float sum = 0.0f;
   int i;
-
   for (i = 0; i < 16; i++) {
-    float v, v2, v4;
-    v = AOMMAX(scores_2D[i] + shift, 0.0f);
-    v2 = v * v;
-    v4 = v2 * v2;
+    const float v = AOMMIN(AOMMAX(scores_2D[i] + shift, 0.0f), 100.0f);
+    const float v2 = v * v;
+    const float v4 = v2 * v2;
     scores_2D[i] = v4 * v4;
     sum += scores_2D[i];
   }
-  for (i = 0; i < 16; i++) scores_2D[i] /= sum;
+  for (i = 0; i < 16; i++) {
+    if (scores_2D[i] < sum * 1e-4)
+      scores_2D[i] = 0.0f;
+    else
+      scores_2D[i] /= sum;
+  }
 }
 
 // These thresholds were calibrated to provide a certain number of TX types
@@ -1909,7 +1909,13 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
   x->tx_search_prune[tx_set_type] = 0;
   x->tx_split_prune_flag = 0;
   const MB_MODE_INFO *mbmi = xd->mi[0];
-  if (!is_inter_block(mbmi) || cpi->sf.tx_type_search.prune_mode == NO_PRUNE ||
+  const int is_inter = is_inter_block(mbmi);
+  if ((is_inter && cpi->oxcf.use_inter_dct_only) ||
+      (!is_inter && cpi->oxcf.use_intra_dct_only)) {
+    x->tx_search_prune[tx_set_type] = ~(1 << DCT_DCT);
+    return;
+  }
+  if (!is_inter || cpi->sf.tx_type_search.prune_mode == NO_PRUNE ||
       x->use_default_inter_tx_type || xd->lossless[mbmi->segment_id] ||
       x->cb_partition_scan)
     return;
@@ -1948,8 +1954,7 @@ static void model_rd_from_sse(const AV1_COMP *const cpi,
   (void)num_samples;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int dequant_shift =
-      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
 
   // Fast approximate the modelling function.
   if (cpi->sf.simple_model_rd_from_var) {
@@ -1971,7 +1976,6 @@ static void model_rd_from_sse(const AV1_COMP *const cpi,
   *dist <<= 4;
 }
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
 static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
@@ -1994,7 +1998,6 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
   total_sse <<= 4;
   return total_sse;
 }
-#endif
 
 static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                             MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
@@ -2028,7 +2031,7 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
 
     if (x->skip_chroma_rd && plane) continue;
 
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
       sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
                            pd->dst.stride, bw, bh);
     } else {
@@ -2057,43 +2060,6 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
   *out_dist_sum = dist_sum;
 }
 
-static void check_block_skip(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                             MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
-                             int plane_to, int *skip_txfm_sb) {
-  *skip_txfm_sb = 1;
-  for (int plane = plane_from; plane <= plane_to; ++plane) {
-    struct macroblock_plane *const p = &x->plane[plane];
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE bs =
-        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    unsigned int sse;
-
-    if (x->skip_chroma_rd && plane) continue;
-
-    // Since fast HBD variance functions scale down sse by 4 bit, we first use
-    // fast vf implementation to rule out blocks with non-zero scaled sse. Then,
-    // only if the source is HBD and the scaled sse is 0, accurate sse
-    // computation is applied to determine if the sse is really 0. This step is
-    // necessary for HBD lossless coding.
-    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
-                       &sse);
-    if (sse) {
-      *skip_txfm_sb = 0;
-      return;
-    } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      uint64_t sse64 = aom_highbd_sse_odd_size(
-          p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
-          block_size_wide[bs], block_size_high[bs]);
-
-      if (sse64) {
-        *skip_txfm_sb = 0;
-        return;
-      }
-    }
-  }
-  return;
-}
-
 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
@@ -2195,7 +2161,8 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
 static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
                                       int blk_row, int blk_col,
                                       const BLOCK_SIZE plane_bsize,
-                                      const BLOCK_SIZE tx_bsize) {
+                                      const BLOCK_SIZE tx_bsize,
+                                      unsigned int *block_mse_q8) {
   int visible_rows, visible_cols;
   const MACROBLOCKD *xd = &x->e_mbd;
   get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
@@ -2218,7 +2185,11 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
   }
 #endif
   diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]);
-  return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
+  uint64_t sse =
+      aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
+  if (block_mse_q8 != NULL)
+    *block_mse_q8 = (unsigned int)((256 * sse) / (visible_cols * visible_rows));
+  return sse;
 }
 
 int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
@@ -2318,7 +2289,7 @@ static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+  if (is_cur_buf_hbd(xd))
     *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
                                        xd->bd);
   else
@@ -2354,7 +2325,7 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
   uint8_t *recon;
   DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
 
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     recon = CONVERT_TO_BYTEPTR(recon16);
     av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride,
                                    CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw,
@@ -2376,11 +2347,29 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
                          blk_row, blk_col, plane_bsize, tx_bsize);
 }
 
-static double get_mean(const int16_t *diff, int stride, int w, int h) {
+static double get_diff_mean(const uint8_t *src, int src_stride,
+                            const uint8_t *dst, int dst_stride, int w, int h) {
   double sum = 0.0;
   for (int j = 0; j < h; ++j) {
     for (int i = 0; i < w; ++i) {
-      sum += diff[j * stride + i];
+      const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
+      sum += diff;
+    }
+  }
+  assert(w > 0 && h > 0);
+  return sum / (w * h);
+}
+
+static double get_highbd_diff_mean(const uint8_t *src8, int src_stride,
+                                   const uint8_t *dst8, int dst_stride, int w,
+                                   int h) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  double sum = 0.0;
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; ++i) {
+      const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
+      sum += diff;
     }
   }
   assert(w > 0 && h > 0);
@@ -2469,6 +2458,17 @@ static void get_2x2_normalized_sses_and_sads(
 #if CONFIG_COLLECT_RD_STATS
 
 #if CONFIG_COLLECT_RD_STATS == 1
+static double get_mean(const int16_t *diff, int stride, int w, int h) {
+  double sum = 0.0;
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; ++i) {
+      sum += diff[j * stride + i];
+    }
+  }
+  assert(w > 0 && h > 0);
+  return sum / (w * h);
+}
+
 static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     const RD_STATS *const rd_stats, int blk_row,
                                     int blk_col, BLOCK_SIZE plane_bsize,
@@ -2491,10 +2491,9 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int txw = tx_size_wide[tx_size];
   const int txh = tx_size_high[tx_size];
-  const int dequant_shift =
-      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
   const int q_step = pd->dequant_Q3[1] >> dequant_shift;
-  const double num_samples = txw * txh;
+  const int num_samples = txw * txh;
 
   const double rate_norm = (double)rd_stats->rate / num_samples;
   const double dist_norm = (double)rd_stats->dist / num_samples;
@@ -2566,15 +2565,25 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
 #endif  // CONFIG_COLLECT_RD_STATS == 1
 
 #if CONFIG_COLLECT_RD_STATS >= 2
-static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
+static void PrintPredictionUnitStats(const AV1_COMP *const cpi,
+                                     const TileDataEnc *tile_data,
+                                     MACROBLOCK *x,
                                      const RD_STATS *const rd_stats,
                                      BLOCK_SIZE plane_bsize) {
   if (rd_stats->invalid_rate) return;
   if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
 
+  if (cpi->sf.inter_mode_rd_model_estimation == 1 &&
+      (tile_data == NULL ||
+       !tile_data->inter_mode_rd_models[plane_bsize].ready))
+    return;
+  (void)tile_data;
   // Generate small sample to restrict output size.
   static unsigned int seed = 95014;
-  if (lcg_rand16(&seed) % 256 > 0) return;
+
+  if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) !=
+      1)
+    return;
 
   const char output_file[] = "pu_stats.txt";
   FILE *fout = fopen(output_file, "a");
@@ -2589,8 +2598,7 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
   get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
                      &bh);
   const int num_samples = bw * bh;
-  const int dequant_shift =
-      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
   const int q_step = pd->dequant_Q3[1] >> dequant_shift;
 
   const double rate_norm = (double)rd_stats->rate / num_samples;
@@ -2607,7 +2615,14 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
   const int16_t *const src_diff = p->src_diff;
   const int shift = (xd->bd - 8);
 
-  int64_t sse = aom_sum_squares_2d_i16(src_diff, diff_stride, bw, bh);
+  int64_t sse;
+  if (is_cur_buf_hbd(xd)) {
+    sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                         bw, bh);
+  } else {
+    sse =
+        aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh);
+  }
   sse = ROUND_POWER_OF_TWO(sse, shift * 2);
   const double sse_norm = (double)sse / num_samples;
 
@@ -2646,7 +2661,14 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
   fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
           model_rdcost_norm);
 
-  double mean = get_mean(src_diff, diff_stride, bw, bh);
+  double mean;
+  if (is_cur_buf_hbd(xd)) {
+    mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf,
+                                pd->dst.stride, bw, bh);
+  } else {
+    mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                         bw, bh);
+  }
   mean /= (1 << shift);
   float hor_corr, vert_corr;
   av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr,
@@ -2659,6 +2681,21 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
   fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
           hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
 
+  if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+    assert(tile_data->inter_mode_rd_models[plane_bsize].ready);
+    const int64_t overall_sse = get_sse(cpi, x);
+    int est_residue_cost = 0;
+    int64_t est_dist = 0;
+    get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost,
+                      &est_dist);
+    const double est_residue_cost_norm = (double)est_residue_cost / num_samples;
+    const double est_dist_norm = (double)est_dist / num_samples;
+    const double est_rdcost_norm =
+        (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples;
+    fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm,
+            est_rdcost_norm);
+  }
+
   fprintf(fout, "\n");
   fclose(fout);
 }
@@ -2673,8 +2710,7 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi,
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int log_numpels = num_pels_log2_lookup[plane_bsize];
 
-  const int dequant_shift =
-      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
   const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
 
   const struct macroblock_plane *const p = &x->plane[plane];
@@ -2711,7 +2747,12 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi,
   get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
                                    dst_stride, src_diff, diff_stride,
                                    sse_norm_arr, NULL);
-  double mean = get_mean(src_diff, bw, bw, bh);
+  double mean;
+  if (is_cur_buf_hbd(xd)) {
+    mean = get_highbd_diff_mean(src, src_stride, dst, dst_stride, bw, bh);
+  } else {
+    mean = get_diff_mean(src, src_stride, dst, dst_stride, bw, bh);
+  }
   if (shift) {
     for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
     mean /= (1 << shift);
@@ -2790,7 +2831,7 @@ static void model_rd_for_sb_with_dnn(
     int bw, bh;
     get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
                        &bw, &bh);
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
       sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
                            pd->dst.stride, bw, bh);
     } else {
@@ -2829,8 +2870,7 @@ static void model_rd_with_surffit(const AV1_COMP *const cpi,
   (void)plane_bsize;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int dequant_shift =
-      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
   const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
   if (sse == 0) {
     if (rate) *rate = 0;
@@ -2844,7 +2884,8 @@ static void model_rd_with_surffit(const AV1_COMP *const cpi,
   const double yl = log(sse_norm / qstepsqr) / log(2.0);
   double rate_f, dist_by_sse_norm_f;
 
-  av1_model_rd_surffit(xm, yl, &rate_f, &dist_by_sse_norm_f);
+  av1_model_rd_surffit(plane_bsize, sse_norm, xm, yl, &rate_f,
+                       &dist_by_sse_norm_f);
 
   const double dist_f = dist_by_sse_norm_f * sse_norm;
   int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
@@ -2894,7 +2935,7 @@ static void model_rd_for_sb_with_surffit(
     const int shift = (xd->bd - 8);
     get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
                        &bw, &bh);
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
       sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
                            pd->dst.stride, bw, bh);
     } else {
@@ -2934,8 +2975,7 @@ static void model_rd_with_curvfit(const AV1_COMP *const cpi,
   (void)plane_bsize;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int dequant_shift =
-      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
   const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
 
   if (sse == 0) {
@@ -2946,10 +2986,11 @@ static void model_rd_with_curvfit(const AV1_COMP *const cpi,
   aom_clear_system_state();
   const double sse_norm = (double)sse / num_samples;
   const double qstepsqr = (double)qstep * qstep;
-  const double xqr = log(sse_norm / qstepsqr) / log(2.0);
+  const double xqr = log2(sse_norm / qstepsqr);
 
   double rate_f, dist_by_sse_norm_f;
-  av1_model_rd_curvfit(xqr, &rate_f, &dist_by_sse_norm_f);
+  av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f,
+                       &dist_by_sse_norm_f);
 
   const double dist_f = dist_by_sse_norm_f * sse_norm;
   int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
@@ -3000,7 +3041,7 @@ static void model_rd_for_sb_with_curvfit(
     get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
                        &bw, &bh);
 
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
       sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
                            pd->dst.stride, bw, bh);
     } else {
@@ -3029,78 +3070,13 @@ static void model_rd_for_sb_with_curvfit(
   *out_dist_sum = dist_sum;
 }
 
-static void model_rd_for_sb_with_fullrdy(
-    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
-    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
-    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
-    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
-  const int ref = xd->mi[0]->ref_frame[0];
-
-  int64_t rate_sum = 0;
-  int64_t dist_sum = 0;
-  int64_t total_sse = 0;
-
-  for (int plane = plane_from; plane <= plane_to; ++plane) {
-    struct macroblock_plane *const p = &x->plane[plane];
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    const int bw = block_size_wide[plane_bsize];
-    const int bh = block_size_high[plane_bsize];
-    int64_t sse;
-    int rate;
-    int64_t dist;
-
-    if (x->skip_chroma_rd && plane) continue;
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
-                           pd->dst.stride, bw, bh);
-    } else {
-      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
-                    bh);
-    }
-    sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
-
-    RD_STATS rd_stats;
-    if (plane == 0) {
-      select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
-      if (rd_stats.invalid_rate) {
-        rate = 0;
-        dist = sse << 4;
-      } else {
-        rate = rd_stats.rate;
-        dist = rd_stats.dist;
-      }
-    } else {
-      model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
-                            &dist);
-    }
-
-    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
-
-    total_sse += sse;
-    rate_sum += rate;
-    dist_sum += dist;
-
-    if (plane_rate) plane_rate[plane] = rate;
-    if (plane_sse) plane_sse[plane] = sse;
-    if (plane_dist) plane_dist[plane] = dist;
-  }
-
-  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
-  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
-  *out_rate_sum = (int)rate_sum;
-  *out_dist_sum = dist_sum;
-}
-
 static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                                int block, int blk_row, int blk_col,
                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                                const TXB_CTX *const txb_ctx,
                                FAST_TX_SEARCH_MODE ftxs_mode,
-                               int use_fast_coef_costing, int64_t ref_best_rd,
-                               RD_STATS *best_rd_stats) {
+                               int use_fast_coef_costing, int skip_trellis,
+                               int64_t ref_best_rd, RD_STATS *best_rd_stats) {
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -3118,6 +3094,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   tran_low_t *best_dqcoeff = this_dqcoeff;
   const int txk_type_idx =
       av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+  int perform_block_coeff_opt;
   av1_invalid_rd_stats(best_rd_stats);
 
   TXB_RD_INFO *intra_txb_rd_info = NULL;
@@ -3129,6 +3106,9 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
       (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
       mi_col >= xd->tile.mi_col_start &&
       (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
+  skip_trellis |=
+      cpi->optimize_seg_arr[mbmi->segment_id] == NO_TRELLIS_OPT ||
+      cpi->optimize_seg_arr[mbmi->segment_id] == FINAL_PASS_TRELLIS_OPT;
   if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
       !is_inter && plane == 0 &&
       tx_size_wide[tx_size] == tx_size_high[tx_size]) {
@@ -3168,7 +3148,8 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   TX_TYPE txk_end = TX_TYPES - 1;
   if ((!is_inter && x->use_default_intra_tx_type) ||
       (is_inter && x->use_default_inter_tx_type)) {
-    txk_start = txk_end = get_default_tx_type(0, xd, tx_size);
+    txk_start = txk_end =
+        get_default_tx_type(0, xd, tx_size, cpi->is_screen_content_type);
   } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) {
     if (plane == 0) txk_end = DCT_DCT;
   }
@@ -3186,7 +3167,9 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   }
   const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type];
   if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
-      ext_tx_used_flag == 0x0001) {
+      ext_tx_used_flag == 0x0001 ||
+      (is_inter && cpi->oxcf.use_inter_dct_only) ||
+      (!is_inter && cpi->oxcf.use_intra_dct_only)) {
     txk_start = txk_end = DCT_DCT;
   }
   uint16_t allowed_tx_mask = 0;  // 1: allow; 0: skip.
@@ -3212,14 +3195,35 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
       }
     }
   }
+
+  if (cpi->oxcf.enable_flip_idtx == 0) {
+    for (TX_TYPE tx_type = FLIPADST_DCT; tx_type <= H_FLIPADST; ++tx_type) {
+      allowed_tx_mask &= ~(1 << tx_type);
+    }
+  }
+
   // Need to have at least one transform type allowed.
   if (allowed_tx_mask == 0) {
     txk_start = txk_end = (plane ? uv_tx_type : DCT_DCT);
     allowed_tx_mask = (1 << txk_start);
   }
 
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  int64_t block_sse = 0;
+  unsigned int block_mse_q8 = UINT_MAX;
+  block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize,
+                              &block_mse_q8);
+  assert(block_mse_q8 != UINT_MAX);
+  if (is_cur_buf_hbd(xd)) {
+    block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+    block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
+  }
+  block_sse *= 16;
+  // Tranform domain distortion is accurate for higher residuals.
+  // TODO(any): Experiment with variance and mean based thresholds
   int use_transform_domain_distortion =
       (cpi->sf.use_transform_domain_distortion > 0) &&
+      (block_mse_q8 >= cpi->tx_domain_dist_threshold) &&
       // Any 64-pt transforms only preserves half the coefficients.
       // Therefore transform domain distortion is not valid for these
       // transform sizes.
@@ -3237,20 +3241,18 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
 
   const uint16_t *eobs_ptr = x->plane[plane].eobs;
 
-  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
-  int64_t block_sse =
-      pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
-  block_sse *= 16;
+  // Used mse based threshold logic to take decision of R-D of optimization of
+  // coeffs. For smaller residuals, coeff optimization would be helpful. For
+  // larger residuals, R-D optimization may not be effective.
+  // TODO(any): Experiment with variance and mean based thresholds
+  perform_block_coeff_opt = (block_mse_q8 <= cpi->coeff_opt_dist_threshold);
 
   for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
     if (!(allowed_tx_mask & (1 << tx_type))) continue;
     if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
     RD_STATS this_rd_stats;
     av1_invalid_rd_stats(&this_rd_stats);
-
-    if (!cpi->optimize_seg_arr[mbmi->segment_id]) {
+    if (skip_trellis || (!perform_block_coeff_opt)) {
       av1_xform_quant(
           cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
           USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
@@ -3270,8 +3272,8 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
             RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
         if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
       }
-      av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1,
-                     &rate_cost);
+      av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+                     cpi->sf.trellis_eob_fast, &rate_cost);
     }
     if (eobs_ptr[block] == 0) {
       // When eob is 0, pixel domain distortion is more efficient and accurate.
@@ -3280,8 +3282,38 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
       dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
                            &this_rd_stats.sse);
     } else {
-      this_rd_stats.dist = dist_block_px_domain(
-          cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+      int64_t sse_diff = INT64_MAX;
+      // high_energy threshold assumes that every pixel within a txfm block
+      // has a residue energy of at least 25% of the maximum, i.e. 128 * 128
+      // for 8 bit, then the threshold is scaled based on input bit depth.
+      const int64_t high_energy_thresh =
+          ((int64_t)128 * 128 * tx_size_2d[tx_size]) << ((xd->bd - 8) * 2);
+      const int is_high_energy = (block_sse >= high_energy_thresh);
+      if (tx_size == TX_64X64 || is_high_energy) {
+        // Because 3 out 4 quadrants of transform coefficients are forced to
+        // zero, the inverse transform has a tendency to overflow. sse_diff
+        // is effectively the energy of those 3 quadrants, here we use it
+        // to decide if we should do pixel domain distortion. If the energy
+        // is mostly in first quadrant, then it is unlikely that we have
+        // overflow issue in inverse transform.
+        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                             &this_rd_stats.sse);
+        sse_diff = block_sse - this_rd_stats.sse;
+      }
+      if (tx_size != TX_64X64 || !is_high_energy ||
+          (sse_diff * 2) < this_rd_stats.sse) {
+        const int64_t tx_domain_dist = this_rd_stats.dist;
+        this_rd_stats.dist = dist_block_px_domain(
+            cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+        // For high energy blocks, occasionally, the pixel domain distortion
+        // can be artificially low due to clamping at reconstruction stage
+        // even when inverse transform output is hugely different from the
+        // actual residue.
+        if (is_high_energy && this_rd_stats.dist < tx_domain_dist)
+          this_rd_stats.dist = tx_domain_dist;
+      } else {
+        this_rd_stats.dist += sse_diff;
+      }
       this_rd_stats.sse = block_sse;
     }
 
@@ -3396,7 +3428,7 @@ RECON_INTRA:
     // if the last search tx_type is the best tx_type, we don't need to
     // do this again
     if (best_tx_type != last_tx_type) {
-      if (!cpi->optimize_seg_arr[mbmi->segment_id]) {
+      if (skip_trellis) {
         av1_xform_quant(
             cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
             best_tx_type,
@@ -3404,8 +3436,8 @@ RECON_INTRA:
       } else {
         av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                         tx_size, best_tx_type, AV1_XFORM_QUANT_FP);
-        av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, 1,
-                       &rate_cost);
+        av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx,
+                       cpi->sf.trellis_eob_fast, &rate_cost);
       }
     }
 
@@ -3432,12 +3464,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(xd->mi[0]);
   const AV1_COMP *cpi = args->cpi;
   ENTROPY_CONTEXT *a = args->t_above + blk_col;
   ENTROPY_CONTEXT *l = args->t_left + blk_row;
   const AV1_COMMON *cm = &cpi->common;
-  int64_t rd1, rd2, rd;
   RD_STATS this_rd_stats;
 
   av1_init_rd_stats(&this_rd_stats);
@@ -3447,7 +3478,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
     return;
   }
 
-  if (!is_inter_block(mbmi)) {
+  if (!is_inter) {
     av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
     av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
   }
@@ -3455,10 +3486,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
   search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                   &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing,
-                  args->best_rd - args->this_rd, &this_rd_stats);
+                  args->skip_trellis, args->best_rd - args->this_rd,
+                  &this_rd_stats);
 
   if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
-    assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8);
+    assert(!is_inter || plane_bsize < BLOCK_8X8);
     cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
   }
 
@@ -3477,37 +3509,26 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   else
     set_blk_skip(x, plane, blk_idx, 0);
 
-  rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
-  rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+  const int64_t rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+  const int64_t rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse);
 
   // TODO(jingning): temporarily enabled only for luma component
-  rd = AOMMIN(rd1, rd2);
+  const int64_t rd = AOMMIN(rd1, rd2);
 
   this_rd_stats.skip &= !x->plane[plane].eobs[block];
 
-#if CONFIG_ONE_PASS_SVM
-  if (plane == AOM_PLANE_Y && plane_bsize >= BLOCK_8X8) {
-    int eob = x->plane[plane].eobs[block];
-    av1_add_reg_stat(&this_rd_stats, eob, rd, this_rd_stats.sse, blk_row,
-                     blk_col, plane_bsize, txsize_to_bsize[tx_size]);
-  }
-#endif
-
   av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
 
   args->this_rd += rd;
 
-  if (args->this_rd > args->best_rd) {
-    args->exit_early = 1;
-    return;
-  }
+  if (args->this_rd > args->best_rd) args->exit_early = 1;
 }
 
 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
                              RD_STATS *rd_stats, int64_t ref_best_rd,
                              int64_t this_rd, int plane, BLOCK_SIZE bsize,
                              TX_SIZE tx_size, int use_fast_coef_casting,
-                             FAST_TX_SEARCH_MODE ftxs_mode) {
+                             FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   struct rdcost_block_args args;
@@ -3518,8 +3539,14 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
   args.use_fast_coef_costing = use_fast_coef_casting;
   args.ftxs_mode = ftxs_mode;
   args.this_rd = this_rd;
+  args.skip_trellis = skip_trellis;
   av1_init_rd_stats(&args.rd_stats);
 
+  if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) {
+    av1_invalid_rd_stats(rd_stats);
+    return;
+  }
+
   if (plane == 0) xd->mi[0]->tx_size = tx_size;
 
   av1_get_entropy_contexts(bsize, pd, args.t_above, args.t_left);
@@ -3544,23 +3571,20 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
 
 static int tx_size_cost(const AV1_COMMON *const cm, const MACROBLOCK *const x,
                         BLOCK_SIZE bsize, TX_SIZE tx_size) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(bsize == x->e_mbd.mi[0]->sb_type);
+  if (cm->tx_mode != TX_MODE_SELECT || !block_signals_txsize(bsize)) return 0;
 
-  if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type)) {
-    const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
-    const int depth = tx_size_to_depth(tx_size, bsize);
-    const int tx_size_ctx = get_tx_size_context(xd);
-    int r_tx_size = x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
-    return r_tx_size;
-  } else {
-    return 0;
-  }
+  const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+  const int depth = tx_size_to_depth(tx_size, bsize);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int tx_size_ctx = get_tx_size_context(xd);
+  return x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
 }
 
 static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                         RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
-                        TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode) {
+                        TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+                        int skip_trellis) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -3594,49 +3618,60 @@ static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   mbmi->tx_size = tx_size;
   txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOMMIN(this_rd, skip_rd),
                    AOM_PLANE_Y, bs, tx_size, cpi->sf.use_fast_coef_costing,
-                   ftxs_mode);
+                   ftxs_mode, skip_trellis);
   if (rd_stats->rate == INT_MAX) return INT64_MAX;
 
+  // rdstats->rate should include all the rate except skip/non-skip cost as the
+  // same is accounted in the caller functions after rd evaluation of all
+  // planes. However the decisions should be done after considering the
+  // skip/non-skip header cost
   if (rd_stats->skip) {
     if (is_inter) {
       rd = RDCOST(x->rdmult, s1, rd_stats->sse);
-#if CONFIG_ONE_PASS_SVM
-      // TODO(chiyotsai@google.com): Investigate if these updates are really
-      // needed.
-      av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
     } else {
       rd = RDCOST(x->rdmult, s1 + r_tx_size * tx_select, rd_stats->sse);
-#if CONFIG_ONE_PASS_SVM
-      av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
+      rd_stats->rate += r_tx_size * tx_select;
     }
   } else {
     rd = RDCOST(x->rdmult, rd_stats->rate + s0 + r_tx_size * tx_select,
                 rd_stats->dist);
+    rd_stats->rate += r_tx_size * tx_select;
+  }
+  if (is_inter && !xd->lossless[xd->mi[0]->segment_id]) {
+    int64_t temp_skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+    if (temp_skip_rd <= rd) {
+      rd = temp_skip_rd;
+      rd_stats->rate = 0;
+      rd_stats->dist = rd_stats->sse;
+      rd_stats->skip = 1;
+    }
   }
-
-  if (tx_select) rd_stats->rate += r_tx_size;
-
-  if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip))
-    rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
 
   return rd;
 }
 
 static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
-                                   MACROBLOCK *x, int *r, int64_t *d, int *s,
-                                   int64_t *sse, int64_t ref_best_rd) {
-  RD_STATS rd_stats;
+                                   MACROBLOCK *x, int64_t ref_best_rd,
+                                   RD_STATS *rd_stats) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   av1_subtract_plane(x, bs, 0);
   x->rd_model = LOW_TXFM_RD;
-  int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs,
-                        max_txsize_rect_lookup[bs], FTXS_NONE);
+  int skip_trellis = cpi->optimize_seg_arr[xd->mi[0]->segment_id] ==
+                     NO_ESTIMATE_YRD_TRELLIS_OPT;
+  const int64_t rd =
+      txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, max_txsize_rect_lookup[bs],
+               FTXS_NONE, skip_trellis);
   x->rd_model = FULL_TXFM_RD;
-  *r = rd_stats.rate;
-  *d = rd_stats.dist;
-  *s = rd_stats.skip;
-  *sse = rd_stats.sse;
+  if (rd != INT64_MAX) {
+    const int skip_ctx = av1_get_skip_context(xd);
+    if (rd_stats->skip) {
+      const int s1 = x->skip_cost[skip_ctx][1];
+      rd_stats->rate = s1;
+    } else {
+      const int s0 = x->skip_cost[skip_ctx][0];
+      rd_stats->rate += s0;
+    }
+  }
   return rd;
 }
 
@@ -3662,7 +3697,7 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOMMIN(this_rd, skip_rd),
                    AOM_PLANE_Y, bs, mbmi->tx_size,
-                   cpi->sf.use_fast_coef_costing, FTXS_NONE);
+                   cpi->sf.use_fast_coef_costing, FTXS_NONE, 0);
   // Reset the pruning flags.
   av1_zero(x->tx_search_prune);
   x->tx_split_prune_flag = 0;
@@ -3677,7 +3712,7 @@ static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
   mbmi->tx_size = TX_4X4;
   // TODO(any) : Pass this_rd based on skip/non-skip cost
   txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size,
-                   cpi->sf.use_fast_coef_costing, FTXS_NONE);
+                   cpi->sf.use_fast_coef_costing, FTXS_NONE, 0);
 }
 
 static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
@@ -3707,55 +3742,64 @@ static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
 static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
                                         MACROBLOCK *x, RD_STATS *rd_stats,
                                         int64_t ref_best_rd, BLOCK_SIZE bs) {
+  av1_invalid_rd_stats(rd_stats);
+
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  int64_t rd = INT64_MAX;
-  int n;
-  int start_tx;
-  int depth;
-  int64_t best_rd = INT64_MAX;
   const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
-  TX_SIZE best_tx_size = max_rect_tx_size;
-  TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
-  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  const int n4 = bsize_to_num_blk(bs);
   const int tx_select = cm->tx_mode == TX_MODE_SELECT;
-
-  av1_invalid_rd_stats(rd_stats);
+  int start_tx;
+  int depth, init_depth;
 
   if (tx_select) {
     start_tx = max_rect_tx_size;
-    depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
-                                  is_inter_block(mbmi), &cpi->sf);
+    init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
+                                       is_inter_block(mbmi), &cpi->sf);
   } else {
     const TX_SIZE chosen_tx_size = tx_size_from_tx_mode(bs, cm->tx_mode);
     start_tx = chosen_tx_size;
-    depth = MAX_TX_DEPTH;
+    init_depth = MAX_TX_DEPTH;
   }
 
   prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16);
 
-  for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) {
+  TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  TX_SIZE best_tx_size = max_rect_tx_size;
+  int64_t best_rd = INT64_MAX;
+  const int n4 = bsize_to_num_blk(bs);
+  x->rd_model = FULL_TXFM_RD;
+  depth = init_depth;
+  int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX };
+  for (int n = start_tx; depth <= MAX_TX_DEPTH;
+       depth++, n = sub_tx_size_map[n]) {
 #if CONFIG_DIST_8X8
     if (x->using_dist_8x8) {
       if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue;
     }
 #endif
+    if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[n] == TX_64X64) continue;
+
     RD_STATS this_rd_stats;
-    if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD;
-    rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE);
-    x->rd_model = FULL_TXFM_RD;
+    rd[depth] =
+        txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE, 0);
 
-    if (rd < best_rd) {
+    if (rd[depth] < best_rd) {
       memcpy(best_txk_type, mbmi->txk_type,
              sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
       memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
       best_tx_size = n;
-      best_rd = rd;
+      best_rd = rd[depth];
       *rd_stats = this_rd_stats;
     }
     if (n == TX_4X4) break;
+    // If we are searching three depths, prune the smallest size depending
+    // on rd results for the first two depths for low contrast blocks.
+    if (depth > init_depth && depth != MAX_TX_DEPTH &&
+        x->source_variance < 256) {
+      if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break;
+    }
   }
 
   if (rd_stats->rate != INT_MAX) {
@@ -3770,14 +3814,245 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
   x->tx_split_prune_flag = 0;
 }
 
+// origin_threshold * 128 / 100
+static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
+  {
+      64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
+      68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
+  },
+  {
+      88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
+      68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
+  },
+  {
+      90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
+      74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
+  },
+};
+
+// lookup table for predict_skip_flag
+// int max_tx_size = max_txsize_rect_lookup[bsize];
+// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
+//   max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
+static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
+  TX_4X4,   TX_4X8,   TX_8X4,   TX_8X8,   TX_8X16,  TX_16X8,
+  TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
+  TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16,  TX_16X4,
+  TX_8X8,   TX_8X8,   TX_16X16, TX_16X16,
+};
+
+// Uses simple features on top of DCT coefficients to quickly predict
+// whether optimal RD decision is to skip encoding the residual.
+// The sse value is stored in dist.
+static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
+                             int reduced_tx_set) {
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
+
+  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
+
+  const int64_t mse = *dist / bw / bh;
+  // Normalized quantizer takes the transform upscaling factor (8 for tx size
+  // smaller than 32) into account.
+  const int16_t normalized_dc_q = dc_q >> 3;
+  const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
+  // Predict not to skip when mse is larger than threshold.
+  if (mse > mse_thresh) return 0;
+
+  const int max_tx_size = max_predict_sf_tx_size[bsize];
+  const int tx_h = tx_size_high[max_tx_size];
+  const int tx_w = tx_size_wide[max_tx_size];
+  DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
+  TxfmParam param;
+  param.tx_type = DCT_DCT;
+  param.tx_size = max_tx_size;
+  param.bd = xd->bd;
+  param.is_hbd = is_cur_buf_hbd(xd);
+  param.lossless = 0;
+  param.tx_set_type = av1_get_ext_tx_set_type(
+      param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+  const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
+  const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
+  const int16_t *src_diff = x->plane[0].src_diff;
+  const int n_coeff = tx_w * tx_h;
+  const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+  const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
+  const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
+  for (int row = 0; row < bh; row += tx_h) {
+    for (int col = 0; col < bw; col += tx_w) {
+      av1_fwd_txfm(src_diff + col, coefs, bw, &param);
+      // Operating on TX domain, not pixels; we want the QTX quantizers
+      const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
+      if (dc_coef >= dc_thresh) return 0;
+      for (int i = 1; i < n_coeff; ++i) {
+        const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
+        if (ac_coef >= ac_thresh) return 0;
+      }
+    }
+    src_diff += tx_h * bw;
+  }
+  return 1;
+}
+
+// Used to set proper context for early termination with skip = 1.
+static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
+                          int64_t dist) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int n4 = bsize_to_num_blk(bsize);
+  const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+  memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
+  memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
+  mbmi->tx_size = tx_size;
+  for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
+  rd_stats->skip = 1;
+  if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+  rd_stats->dist = rd_stats->sse = (dist << 4);
+  // Though decision is to make the block as skip based on luma stats,
+  // it is possible that block becomes non skip after chroma rd. In addition
+  // intermediate non skip costs calculated by caller function will be
+  // incorrect, if rate is set as  zero (i.e., if zero_blk_rate is not
+  // accounted). Hence intermediate rate is populated to code the luma tx blks
+  // as skip, the caller function based on final rd decision (i.e., skip vs
+  // non-skip) sets the final rate accordingly. Here the rate populated
+  // corresponds to coding all the tx blocks with zero_blk_rate (based on max tx
+  // size possible) in the current block. Eg: For 128*128 block, rate would be
+  // 4 * zero_blk_rate where zero_blk_rate corresponds to coding of one 64x64 tx
+  // block as 'all zeros'
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
+  ENTROPY_CONTEXT *ta = ctxa;
+  ENTROPY_CONTEXT *tl = ctxl;
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  TXB_CTX txb_ctx;
+  get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx);
+  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+                                .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+  rd_stats->rate = zero_blk_rate *
+                   (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) *
+                   (block_size_high[bsize] >> tx_size_high_log2[tx_size]);
+}
+
+static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  const int rows = block_size_high[bsize];
+  const int cols = block_size_wide[bsize];
+  const int16_t *diff = x->plane[0].src_diff;
+  const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
+                                             (uint8_t *)diff, 2 * rows * cols);
+  return (hash << 5) + bsize;
+}
+
+static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x,
+                            const RD_STATS *const rd_stats,
+                            MB_RD_RECORD *tx_rd_record) {
+  int index;
+  if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
+    index =
+        (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
+    ++tx_rd_record->num;
+  } else {
+    index = tx_rd_record->index_start;
+    tx_rd_record->index_start =
+        (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+  }
+  MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index];
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  tx_rd_info->hash_value = hash;
+  tx_rd_info->tx_size = mbmi->tx_size;
+  memcpy(tx_rd_info->blk_skip, x->blk_skip,
+         sizeof(tx_rd_info->blk_skip[0]) * n4);
+  av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
+  av1_copy(tx_rd_info->txk_type, mbmi->txk_type);
+  tx_rd_info->rd_stats = *rd_stats;
+}
+
+static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info,
+                             RD_STATS *const rd_stats, MACROBLOCK *const x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  mbmi->tx_size = tx_rd_info->tx_size;
+  memcpy(x->blk_skip, tx_rd_info->blk_skip,
+         sizeof(tx_rd_info->blk_skip[0]) * n4);
+  av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size);
+  av1_copy(mbmi->txk_type, tx_rd_info->txk_type);
+  *rd_stats = tx_rd_info->rd_stats;
+}
+
+static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record,
+                                      const int64_t ref_best_rd,
+                                      const uint32_t hash) {
+  int32_t match_index = -1;
+  if (ref_best_rd != INT64_MAX) {
+    for (int i = 0; i < mb_rd_record->num; ++i) {
+      const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
+      // If there is a match in the tx_rd_record, fetch the RD decision and
+      // terminate early.
+      if (mb_rd_record->tx_rd_info[index].hash_value == hash) {
+        match_index = index;
+        break;
+      }
+    }
+  }
+  return match_index;
+}
+
 static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                             RD_STATS *rd_stats, BLOCK_SIZE bs,
                             int64_t ref_best_rd) {
   MACROBLOCKD *xd = &x->e_mbd;
   av1_init_rd_stats(rd_stats);
-
+  int is_inter = is_inter_block(xd->mi[0]);
   assert(bs == xd->mi[0]->sb_type);
 
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+
+  uint32_t hash = 0;
+  int32_t match_index = -1;
+  MB_RD_RECORD *mb_rd_record = NULL;
+  const int within_border = mi_row >= xd->tile.mi_row_start &&
+                            (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) &&
+                            mi_col >= xd->tile.mi_col_start &&
+                            (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end);
+  const int is_mb_rd_hash_enabled =
+      (within_border && cpi->sf.use_mb_rd_hash && is_inter);
+  const int n4 = bsize_to_num_blk(bs);
+  if (is_mb_rd_hash_enabled) {
+    hash = get_block_residue_hash(x, bs);
+    mb_rd_record = &x->mb_rd_record;
+    match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+    if (match_index != -1) {
+      MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
+      fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
+      // Reset the pruning flags.
+      av1_zero(x->tx_search_prune);
+      x->tx_split_prune_flag = 0;
+      return;
+    }
+  }
+
+  // If we predict that skip is the optimal RD decision - set the respective
+  // context and terminate early.
+  int64_t dist;
+
+  if (cpi->sf.tx_type_search.use_skip_flag_prediction && is_inter &&
+      (!xd->lossless[xd->mi[0]->segment_id]) &&
+      predict_skip_flag(x, bs, &dist, cpi->common.reduced_tx_set_used)) {
+    // Populate rdstats as per skip decision
+    set_skip_flag(x, rd_stats, bs, dist);
+    // Save the RD search results into tx_rd_record.
+    if (is_mb_rd_hash_enabled)
+      save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+    // Reset the pruning flags.
+    av1_zero(x->tx_search_prune);
+    x->tx_split_prune_flag = 0;
+    return;
+  }
+
   if (xd->lossless[xd->mi[0]->segment_id]) {
     choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
   } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
@@ -3785,6 +4060,12 @@ static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   } else {
     choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
   }
+
+  // Save the RD search results into tx_rd_record.
+  if (is_mb_rd_hash_enabled) {
+    assert(mb_rd_record != NULL);
+    save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+  }
 }
 
 // Return the rate cost for luma prediction mode info. of intra blocks.
@@ -4527,6 +4808,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   const int *bmode_costs;
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int try_palette =
+      cpi->oxcf.enable_palette &&
       av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
   uint8_t *best_palette_color_map =
       try_palette ? x->palette_buffer->best_palette_color_map : NULL;
@@ -4542,8 +4824,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   if (cpi->sf.intra_angle_estimation) {
     const int src_stride = x->plane[0].src.stride;
     const uint8_t *src = x->plane[0].src.buf;
-    angle_estimation(src, src_stride, rows, cols, bsize,
-                     xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH,
+    angle_estimation(src, src_stride, rows, cols, bsize, is_cur_buf_hbd(xd),
                      directional_mode_skip_mask);
   }
   mbmi->filter_intra_mode_info.use_filter_intra = 0;
@@ -4561,6 +4842,11 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     int this_rate, this_rate_tokenonly, s;
     int64_t this_distortion, this_rd, this_model_rd;
     mbmi->mode = intra_rd_search_mode_order[mode_idx];
+    if ((!cpi->oxcf.enable_smooth_intra || cpi->sf.disable_smooth_intra) &&
+        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+         mbmi->mode == SMOOTH_V_PRED))
+      continue;
+    if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
     mbmi->angle_delta[PLANE_TYPE_Y] = 0;
     this_model_rd =
         intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col);
@@ -4570,7 +4856,8 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     if (this_model_rd < best_model_rd) best_model_rd = this_model_rd;
     is_directional_mode = av1_is_directional_mode(mbmi->mode);
     if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
-    if (is_directional_mode && av1_use_angle_delta(bsize)) {
+    if (is_directional_mode && av1_use_angle_delta(bsize) &&
+        cpi->oxcf.enable_angle_delta) {
       this_rd_stats.rate = INT_MAX;
       rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate,
                               &this_rd_stats, bsize, bmode_costs[mbmi->mode],
@@ -4649,6 +4936,8 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
   int plane;
   int is_cost_valid = 1;
+  const int is_inter = is_inter_block(mbmi);
+  int64_t this_rd = 0, skip_rd = 0;
   av1_init_rd_stats(rd_stats);
 
   if (ref_best_rd < 0) is_cost_valid = 0;
@@ -4657,7 +4946,7 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
 
-  if (is_inter_block(mbmi) && is_cost_valid) {
+  if (is_inter && is_cost_valid) {
     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
       av1_subtract_plane(x, bsize, plane);
   }
@@ -4665,15 +4954,26 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   if (is_cost_valid) {
     for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
       RD_STATS pn_rd_stats;
-      txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, 0, plane, bsize,
-                       uv_tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+      int64_t chroma_ref_best_rd = ref_best_rd;
+      // For inter blocks, refined ref_best_rd is used for early exit
+      // For intra blocks, even though current rd crosses ref_best_rd, early
+      // exit is not recommended as current rd is used for gating subsequent
+      // modes as well (say, for angular modes)
+      // TODO(any): Extend the early exit mechanism for intra modes as well
+      if (cpi->sf.perform_best_rd_based_gating_for_chroma && is_inter &&
+          chroma_ref_best_rd != INT64_MAX)
+        chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_rd);
+      txfm_rd_in_plane(x, cpi, &pn_rd_stats, chroma_ref_best_rd, 0, plane,
+                       bsize, uv_tx_size, cpi->sf.use_fast_coef_costing,
+                       FTXS_NONE, 0);
       if (pn_rd_stats.rate == INT_MAX) {
         is_cost_valid = 0;
         break;
       }
       av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-      if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) > ref_best_rd &&
-          RDCOST(x->rdmult, 0, rd_stats->sse) > ref_best_rd) {
+      this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+      skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+      if (AOMMIN(this_rd, skip_rd) > ref_best_rd) {
         is_cost_valid = 0;
         break;
       }
@@ -4688,11 +4988,12 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   return is_cost_valid;
 }
 
-static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
-                          int blk_row, int blk_col, int plane, int block,
-                          int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats,
-                          FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost,
-                          TXB_RD_INFO *rd_info_array) {
+// Pick transform type for a transform block of tx_size.
+static void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+                       int blk_row, int blk_col, int plane, int block,
+                       int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats,
+                       FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost,
+                       TXB_RD_INFO *rd_info_array) {
   const struct macroblock_plane *const p = &x->plane[plane];
   const uint16_t cur_joint_ctx =
       (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
@@ -4720,7 +5021,7 @@ static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
 
   RD_STATS this_rd_stats;
   search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
+                  txb_ctx, ftxs_mode, 0, 0, ref_rdcost, &this_rd_stats);
 
   av1_merge_rd_stats(rd_stats, &this_rd_stats);
 
@@ -4885,9 +5186,9 @@ static void try_tx_block_no_split(
   rd_stats->zero_rate = zero_blk_rate;
   const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
   mbmi->inter_tx_size[index] = tx_size;
-  tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
-                &txb_ctx, rd_stats, ftxs_mode, ref_best_rd,
-                rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
+  tx_type_rd(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, &txb_ctx,
+             rd_stats, ftxs_mode, ref_best_rd,
+             rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
   assert(rd_stats->rate < INT_MAX);
 
   if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
@@ -4895,7 +5196,7 @@ static void try_tx_block_no_split(
        rd_stats->skip == 1) &&
       !xd->lossless[mbmi->segment_id]) {
 #if CONFIG_RD_DEBUG
-    av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
+    av1_update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col,
                               zero_blk_rate - rd_stats->rate);
 #endif  // CONFIG_RD_DEBUG
     rd_stats->rate = zero_blk_rate;
@@ -4918,13 +5219,6 @@ static void try_tx_block_no_split(
   const int txk_type_idx =
       av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
   no_split->tx_type = mbmi->txk_type[txk_type_idx];
-
-#if CONFIG_ONE_PASS_SVM
-  if (plane_bsize >= BLOCK_8X8) {
-    av1_add_reg_stat(rd_stats, p->eobs[block], no_split->rd, rd_stats->sse,
-                     blk_row, blk_col, plane_bsize, txsize_to_bsize[tx_size]);
-  }
-#endif
 }
 
 static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
@@ -4932,8 +5226,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                             BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
                             ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
                             TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
-                            int64_t ref_best_rd, int *is_cost_valid,
-                            FAST_TX_SEARCH_MODE ftxs_mode,
+                            int64_t prev_level_rd, int64_t ref_best_rd,
+                            int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
                             TXB_RD_INFO_NODE *rd_info_node);
 
 static void try_tx_block_split(
@@ -4943,6 +5237,7 @@ static void try_tx_block_split(
     int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
     FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
     RD_STATS *split_rd_stats, int64_t *split_rd) {
+  assert(tx_size < TX_SIZES_ALL);
   MACROBLOCKD *const xd = &x->e_mbd;
   const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
@@ -4950,44 +5245,37 @@ static void try_tx_block_split(
   const int bsw = tx_size_wide_unit[sub_txs];
   const int bsh = tx_size_high_unit[sub_txs];
   const int sub_step = bsw * bsh;
-  RD_STATS this_rd_stats;
-  int this_cost_valid = 1;
+  const int nblks =
+      (tx_size_high_unit[tx_size] / bsh) * (tx_size_wide_unit[tx_size] / bsw);
+  assert(nblks > 0);
+  int blk_idx = 0;
   int64_t tmp_rd = 0;
-
+  *split_rd = INT64_MAX;
   split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
 
-  assert(tx_size < TX_SIZES_ALL);
-
-  int blk_idx = 0;
   for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
     for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) {
+      assert(blk_idx < 4);
       const int offsetr = blk_row + r;
       const int offsetc = blk_col + c;
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-      assert(blk_idx < 4);
+
+      RD_STATS this_rd_stats;
+      int this_cost_valid = 1;
       select_tx_block(
           cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta,
-          tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd,
-          &this_cost_valid, ftxs_mode,
+          tl, tx_above, tx_left, &this_rd_stats, no_split_rd / nblks,
+          ref_best_rd - tmp_rd, &this_cost_valid, ftxs_mode,
           (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
-
-      if (!this_cost_valid) goto LOOP_EXIT;
-
+      if (!this_cost_valid) return;
       av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
-
       tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
-
-      if (no_split_rd < tmp_rd) {
-        this_cost_valid = 0;
-        goto LOOP_EXIT;
-      }
+      if (no_split_rd < tmp_rd) return;
       block += sub_step;
     }
   }
 
-LOOP_EXIT : {}
-
-  if (this_cost_valid) *split_rd = tmp_rd;
+  *split_rd = tmp_rd;
 }
 
 // Search for the best tx partition/type for a given luma block.
@@ -4996,8 +5284,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                             BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
                             ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
                             TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
-                            int64_t ref_best_rd, int *is_cost_valid,
-                            FAST_TX_SEARCH_MODE ftxs_mode,
+                            int64_t prev_level_rd, int64_t ref_best_rd,
+                            int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
                             TXB_RD_INFO_NODE *rd_info_node) {
   assert(tx_size < TX_SIZES_ALL);
   av1_init_rd_stats(rd_stats);
@@ -5017,7 +5305,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                                          mbmi->sb_type, tx_size);
   struct macroblock_plane *const p = &x->plane[0];
 
-  const int try_no_split = 1;
+  const int try_no_split =
+      cpi->oxcf.enable_tx64 || txsize_sqr_up_map[tx_size] != TX_64X64;
   int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
 #if CONFIG_DIST_8X8
   if (x->using_dist_8x8)
@@ -5042,6 +5331,13 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
     if (cpi->sf.txb_split_cap) {
       if (p->eobs[block] == 0) try_split = 0;
     }
+
+    if (cpi->sf.adaptive_txb_search_level &&
+        (no_split.rd -
+         (no_split.rd >> (2 + cpi->sf.adaptive_txb_search_level))) >
+            prev_level_rd) {
+      try_split = 0;
+    }
   }
 
   if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) {
@@ -5089,98 +5385,12 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
   }
 }
 
-static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
-                                   RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                                   int64_t ref_best_rd,
-                                   FAST_TX_SEARCH_MODE ftxs_mode,
-                                   TXB_RD_INFO_NODE *rd_info_tree) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int is_cost_valid = 1;
-  int64_t this_rd = 0, skip_rd = 0;
-
-  if (ref_best_rd < 0) is_cost_valid = 0;
-
-  av1_init_rd_stats(rd_stats);
-
-  if (is_cost_valid) {
-    const struct macroblockd_plane *const pd = &xd->plane[0];
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    const int mi_width = mi_size_wide[plane_bsize];
-    const int mi_height = mi_size_high[plane_bsize];
-    const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
-    const int bh = tx_size_high_unit[max_tx_size];
-    const int bw = tx_size_wide_unit[max_tx_size];
-    int idx, idy;
-    int block = 0;
-    int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
-    ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
-    ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
-    TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
-    TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
-
-    RD_STATS pn_rd_stats;
-    const int init_depth =
-        get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
-    av1_init_rd_stats(&pn_rd_stats);
-
-    av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
-    memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
-    memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
-    const int skip_ctx = av1_get_skip_context(xd);
-    const int s0 = x->skip_cost[skip_ctx][0];
-    const int s1 = x->skip_cost[skip_ctx][1];
-
-    skip_rd = RDCOST(x->rdmult, s1, 0);
-    this_rd = RDCOST(x->rdmult, s0, 0);
-    for (idy = 0; idy < mi_height; idy += bh) {
-      for (idx = 0; idx < mi_width; idx += bw) {
-        int64_t best_rd_sofar = (ref_best_rd - (AOMMIN(skip_rd, this_rd)));
-        select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth,
-                        plane_bsize, ctxa, ctxl, tx_above, tx_left,
-                        &pn_rd_stats, best_rd_sofar, &is_cost_valid, ftxs_mode,
-                        rd_info_tree);
-        if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
-          av1_invalid_rd_stats(rd_stats);
-          return;
-        }
-        av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-        skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
-        this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
-        block += step;
-        if (rd_info_tree != NULL) rd_info_tree += 1;
-      }
-    }
-    if (skip_rd <= this_rd) {
-      rd_stats->rate = 0;
-      rd_stats->dist = rd_stats->sse;
-      rd_stats->skip = 1;
-#if CONFIG_ONE_PASS_SVM
-      av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
-    } else {
-      rd_stats->skip = 0;
-    }
-  }
-
-  if (!is_cost_valid) {
-    // reset cost value
-    av1_invalid_rd_stats(rd_stats);
-  }
-}
-
-static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
+static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                        int64_t ref_best_rd,
                                        TXB_RD_INFO_NODE *rd_info_tree) {
-  const int fast_tx_search = cpi->sf.tx_size_search_method > USE_FULL_RD;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int is_inter = is_inter_block(mbmi);
-  const int skip_ctx = av1_get_skip_context(xd);
-  int s0 = x->skip_cost[skip_ctx][0];
-  int s1 = x->skip_cost[skip_ctx][1];
-  int64_t rd;
+  assert(is_inter_block(xd->mi[0]));
 
   // TODO(debargha): enable this as a speed feature where the
   // select_inter_block_yrd() function above will use a simplified search
@@ -5188,16 +5398,71 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
   // will use more complex search given that the transform partitions have
   // already been decided.
 
+  const int fast_tx_search = cpi->sf.tx_size_search_method > USE_FULL_RD;
   int64_t rd_thresh = ref_best_rd;
   if (fast_tx_search && rd_thresh < INT64_MAX) {
     if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
   }
   assert(rd_thresh > 0);
 
-  FAST_TX_SEARCH_MODE ftxs_mode =
+  const FAST_TX_SEARCH_MODE ftxs_mode =
       fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
-  select_inter_block_yrd(cpi, x, rd_stats, bsize, rd_thresh, ftxs_mode,
-                         rd_info_tree);
+  const struct macroblockd_plane *const pd = &xd->plane[0];
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+  memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+  memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int s0 = x->skip_cost[skip_ctx][0];
+  const int s1 = x->skip_cost[skip_ctx][1];
+  const int init_depth =
+      get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
+  const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+  const int step = bw * bh;
+  int64_t skip_rd = RDCOST(x->rdmult, s1, 0);
+  int64_t this_rd = RDCOST(x->rdmult, s0, 0);
+  int block = 0;
+
+  av1_init_rd_stats(rd_stats);
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      const int64_t best_rd_sofar =
+          (rd_thresh == INT64_MAX) ? INT64_MAX
+                                   : (rd_thresh - (AOMMIN(skip_rd, this_rd)));
+      int is_cost_valid = 1;
+      RD_STATS pn_rd_stats;
+      select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth,
+                      plane_bsize, ctxa, ctxl, tx_above, tx_left, &pn_rd_stats,
+                      INT64_MAX, best_rd_sofar, &is_cost_valid, ftxs_mode,
+                      rd_info_tree);
+      if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
+        av1_invalid_rd_stats(rd_stats);
+        return INT64_MAX;
+      }
+      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+      skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+      this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+      block += step;
+      if (rd_info_tree != NULL) rd_info_tree += 1;
+    }
+  }
+
+  if (skip_rd <= this_rd) {
+    rd_stats->skip = 1;
+  } else {
+    rd_stats->skip = 0;
+  }
+
   if (rd_stats->rate == INT_MAX) return INT64_MAX;
 
   // If fast_tx_search is true, only DCT and 1D DCT were tested in
@@ -5208,20 +5473,15 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
       return INT64_MAX;
   }
 
+  int64_t rd;
   if (rd_stats->skip) {
     rd = RDCOST(x->rdmult, s1, rd_stats->sse);
-#if CONFIG_ONE_PASS_SVM
-    // TODO(chiyotsai@google.com): Investigate if these updates are really
-    // needed.
-    av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
   } else {
     rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+    if (!xd->lossless[xd->mi[0]->segment_id])
+      rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
   }
 
-  if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip))
-    rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
-
   return rd;
 }
 
@@ -5260,8 +5520,8 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                                   .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
     rd_stats->zero_rate = zero_blk_rate;
     rd_stats->ref_rdcost = ref_best_rd;
-    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
-                  &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL);
+    tx_type_rd(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+               &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL);
     const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
     if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
             RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
@@ -5274,20 +5534,9 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
       x->plane[0].txb_entropy_ctx[block] = 0;
       update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
                        DCT_DCT);
-#if CONFIG_ONE_PASS_SVM
-      av1_add_reg_stat(rd_stats, 0, RDCOST(x->rdmult, 0, rd_stats->sse),
-                       rd_stats->sse, blk_row, blk_col, plane_bsize,
-                       txsize_to_bsize[tx_size]);
-#endif
     } else {
       rd_stats->skip = 0;
       set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0);
-#if CONFIG_ONE_PASS_SVM
-      av1_add_reg_stat(rd_stats, x->plane[0].eobs[block],
-                       RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
-                       rd_stats->sse, blk_row, blk_col, plane_bsize,
-                       txsize_to_bsize[tx_size]);
-#endif
     }
     if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
       rd_stats->rate += x->txfm_partition_cost[ctx][0];
@@ -5395,11 +5644,6 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
     rd_stats->rate = 0;
     rd_stats->dist = rd_stats->sse;
     rd_stats->skip = 1;
-#if CONFIG_ONE_PASS_SVM
-    // TODO(chiyotasi@google.com): Investigate if these updates are really
-    // needed.
-    av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
   }
   if (this_rd > ref_best_rd) is_cost_valid = 0;
 
@@ -5410,52 +5654,6 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
   return is_cost_valid;
 }
 
-static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  const int rows = block_size_high[bsize];
-  const int cols = block_size_wide[bsize];
-  const int16_t *diff = x->plane[0].src_diff;
-  const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
-                                             (uint8_t *)diff, 2 * rows * cols);
-  return (hash << 5) + bsize;
-}
-
-static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x,
-                            const RD_STATS *const rd_stats,
-                            MB_RD_RECORD *tx_rd_record) {
-  int index;
-  if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
-    index =
-        (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
-    ++tx_rd_record->num;
-  } else {
-    index = tx_rd_record->index_start;
-    tx_rd_record->index_start =
-        (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
-  }
-  MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index];
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
-  tx_rd_info->hash_value = hash;
-  tx_rd_info->tx_size = mbmi->tx_size;
-  memcpy(tx_rd_info->blk_skip, x->blk_skip,
-         sizeof(tx_rd_info->blk_skip[0]) * n4);
-  av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
-  av1_copy(tx_rd_info->txk_type, mbmi->txk_type);
-  tx_rd_info->rd_stats = *rd_stats;
-}
-
-static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info,
-                             RD_STATS *const rd_stats, MACROBLOCK *const x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  mbmi->tx_size = tx_rd_info->tx_size;
-  memcpy(x->blk_skip, tx_rd_info->blk_skip,
-         sizeof(tx_rd_info->blk_skip[0]) * n4);
-  av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size);
-  av1_copy(mbmi->txk_type, tx_rd_info->txk_type);
-  *rd_stats = tx_rd_info->rd_stats;
-}
-
 static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
                                 const uint32_t hash) {
   // Linear search through the circular buffer to find matching hash.
@@ -5706,158 +5904,13 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
   return 1;
 }
 
-// origin_threshold * 128 / 100
-static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
-  {
-      64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
-      68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
-  },
-  {
-      88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
-      68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
-  },
-  {
-      90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
-      74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
-  },
-};
-
-// lookup table for predict_skip_flag
-// int max_tx_size = max_txsize_rect_lookup[bsize];
-// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
-//   max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
-static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
-  TX_4X4,   TX_4X8,   TX_8X4,   TX_8X8,   TX_8X16,  TX_16X8,
-  TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
-  TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16,  TX_16X4,
-  TX_8X8,   TX_8X8,   TX_16X16, TX_16X16,
-};
-
-// Uses simple features on top of DCT coefficients to quickly predict
-// whether optimal RD decision is to skip encoding the residual.
-// The sse value is stored in dist.
-static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
-                             int reduced_tx_set) {
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  const MACROBLOCKD *xd = &x->e_mbd;
-  const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
-
-  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
-  const int64_t mse = *dist / bw / bh;
-  // Normalized quantizer takes the transform upscaling factor (8 for tx size
-  // smaller than 32) into account.
-  const int16_t normalized_dc_q = dc_q >> 3;
-  const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
-  // Predict not to skip when mse is larger than threshold.
-  if (mse > mse_thresh) return 0;
-
-  const int max_tx_size = max_predict_sf_tx_size[bsize];
-  const int tx_h = tx_size_high[max_tx_size];
-  const int tx_w = tx_size_wide[max_tx_size];
-  DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
-  TxfmParam param;
-  param.tx_type = DCT_DCT;
-  param.tx_size = max_tx_size;
-  param.bd = xd->bd;
-  param.is_hbd = get_bitdepth_data_path_index(xd);
-  param.lossless = 0;
-  param.tx_set_type = av1_get_ext_tx_set_type(
-      param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
-  const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
-  const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
-  const int16_t *src_diff = x->plane[0].src_diff;
-  const int n_coeff = tx_w * tx_h;
-  const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
-  const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
-  const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
-  for (int row = 0; row < bh; row += tx_h) {
-    for (int col = 0; col < bw; col += tx_w) {
-      av1_fwd_txfm(src_diff + col, coefs, bw, &param);
-      // Operating on TX domain, not pixels; we want the QTX quantizers
-      const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
-      if (dc_coef >= dc_thresh) return 0;
-      for (int i = 1; i < n_coeff; ++i) {
-        const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
-        if (ac_coef >= ac_thresh) return 0;
-      }
-    }
-    src_diff += tx_h * bw;
-  }
-  return 1;
-}
-
-#if CONFIG_ONE_PASS_SVM
-static void calc_regional_sse(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t dist,
-                              RD_STATS *rd_stats) {
-  // TODO(chiyotsai@google.com): Don't need regional sse's unless we are doing
-  // none.
-  const int bw = block_size_wide[bsize];
-  const int bw_mi = bw >> tx_size_wide_log2[0];
-  const int bh_mi = bw >> tx_size_high_log2[0];
-  const BLOCK_SIZE split_size = get_partition_subsize(bsize, PARTITION_SPLIT);
-  int64_t dist_0, dist_1, dist_2, dist_3;
-  MACROBLOCKD *xd = &x->e_mbd;
-  dist_0 = pixel_diff_dist(x, AOM_PLANE_Y, 0, 0, bsize, split_size);
-  dist_1 = pixel_diff_dist(x, AOM_PLANE_Y, 0, bw_mi / 2, bsize, split_size);
-  dist_2 = pixel_diff_dist(x, AOM_PLANE_Y, bh_mi / 2, 0, bsize, split_size);
-  dist_3 =
-      pixel_diff_dist(x, AOM_PLANE_Y, bh_mi / 2, bw_mi / 2, bsize, split_size);
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
-    dist_0 = ROUND_POWER_OF_TWO(dist_0, (xd->bd - 8) * 2);
-    dist_1 = ROUND_POWER_OF_TWO(dist_1, (xd->bd - 8) * 2);
-    dist_2 = ROUND_POWER_OF_TWO(dist_2, (xd->bd - 8) * 2);
-    dist_3 = ROUND_POWER_OF_TWO(dist_3, (xd->bd - 8) * 2);
-  }
-  const int scaling_factor = MAX_MIB_SIZE * MAX_MIB_SIZE;
-  rd_stats->y_sse = (dist << 4);
-  rd_stats->sse_0 = (dist_0 << 4) * scaling_factor;
-  rd_stats->sse_1 = (dist_1 << 4) * scaling_factor;
-  rd_stats->sse_2 = (dist_2 << 4) * scaling_factor;
-  rd_stats->sse_3 = (dist_3 << 4) * scaling_factor;
-  av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-}
-#endif
-
-// Used to set proper context for early termination with skip = 1.
-static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
-                          int64_t dist) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int n4 = bsize_to_num_blk(bsize);
-  const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
-  memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
-  memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
-  mbmi->tx_size = tx_size;
-  for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
-  rd_stats->skip = 1;
-  rd_stats->rate = 0;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
-  rd_stats->dist = rd_stats->sse = (dist << 4);
-}
-
-static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
-                               RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
-                               int mi_col, int64_t ref_best_rd) {
+// Search for best transform size and type for luma inter blocks.
+static void pick_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                                  RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                                  int mi_row, int mi_col, int64_t ref_best_rd) {
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  int64_t rd = INT64_MAX;
-  int64_t best_rd = INT64_MAX;
-  const int is_inter = is_inter_block(mbmi);
-  const int n4 = bsize_to_num_blk(bsize);
-  // Get the tx_size 1 level down
-  const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]];
-  const TxSetType tx_set_type =
-      av1_get_ext_tx_set_type(min_tx_size, is_inter, cm->reduced_tx_set_used);
-  const int within_border =
-      mi_row >= xd->tile.mi_row_start &&
-      (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
-      mi_col >= xd->tile.mi_col_start &&
-      (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
+  assert(is_inter_block(xd->mi[0]));
 
   av1_invalid_rd_stats(rd_stats);
 
@@ -5874,8 +5927,7 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
     // tighter.
     assert(cpi->sf.model_based_prune_tx_search_level >= 0 &&
            cpi->sf.model_based_prune_tx_search_level <= 2);
-    static const int prune_factor_by8[] = { 2 + MODELRD_TYPE_TX_SEARCH_PRUNE,
-                                            4 + MODELRD_TYPE_TX_SEARCH_PRUNE };
+    static const int prune_factor_by8[] = { 3, 5 };
     if (!model_skip &&
         ((model_rd *
           prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >>
@@ -5883,38 +5935,41 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
       return;
   }
 
-  const uint32_t hash = get_block_residue_hash(x, bsize);
-  MB_RD_RECORD *mb_rd_record = &x->mb_rd_record;
-
-  if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_mb_rd_hash) {
-    for (int i = 0; i < mb_rd_record->num; ++i) {
-      const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
-      // If there is a match in the tx_rd_record, fetch the RD decision and
-      // terminate early.
-      if (mb_rd_record->tx_rd_info[index].hash_value == hash) {
-        MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[index];
-        fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
-        return;
-      }
+  uint32_t hash = 0;
+  int32_t match_index = -1;
+  MB_RD_RECORD *mb_rd_record = NULL;
+  const int within_border =
+      mi_row >= xd->tile.mi_row_start &&
+      (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
+      mi_col >= xd->tile.mi_col_start &&
+      (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
+  const int is_mb_rd_hash_enabled = (within_border && cpi->sf.use_mb_rd_hash);
+  const int n4 = bsize_to_num_blk(bsize);
+  if (is_mb_rd_hash_enabled) {
+    hash = get_block_residue_hash(x, bsize);
+    mb_rd_record = &x->mb_rd_record;
+    match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+    if (match_index != -1) {
+      MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
+      fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
+      return;
     }
   }
 
   // If we predict that skip is the optimal RD decision - set the respective
   // context and terminate early.
   int64_t dist;
-  if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction &&
+  if (cpi->sf.tx_type_search.use_skip_flag_prediction &&
       predict_skip_flag(x, bsize, &dist, cm->reduced_tx_set_used)) {
     set_skip_flag(x, rd_stats, bsize, dist);
-#if CONFIG_ONE_PASS_SVM
-    if (bsize >= BLOCK_8X8 && mi_size_wide[bsize] == mi_size_high[bsize] &&
-        mbmi->partition == PARTITION_NONE) {
-      calc_regional_sse(x, bsize, dist, rd_stats);
-    }
-#endif
     // Save the RD search results into tx_rd_record.
-    if (within_border) save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+    if (is_mb_rd_hash_enabled)
+      save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
     return;
   }
+#if CONFIG_SPEED_STATS
+  ++x->tx_search_count;
+#endif  // CONFIG_SPEED_STATS
 
   // Precompute residual hashes and find existing or add new RD records to
   // store and reuse rate and distortion values to speed up TX size search.
@@ -5925,20 +5980,20 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
         find_tx_size_rd_records(x, bsize, mi_row, mi_col, matched_rd_info);
   }
 
+  // Get the tx_size 1 level down
+  const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]];
+  const TxSetType tx_set_type =
+      av1_get_ext_tx_set_type(min_tx_size, 1, cm->reduced_tx_set_used);
   prune_tx(cpi, bsize, x, xd, tx_set_type);
 
   int found = 0;
-
   RD_STATS this_rd_stats;
   av1_init_rd_stats(&this_rd_stats);
+  const int64_t rd =
+      select_tx_size_and_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
+                              found_rd_info ? matched_rd_info : NULL);
 
-  rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
-                               found_rd_info ? matched_rd_info : NULL);
-  assert(IMPLIES(this_rd_stats.skip && !this_rd_stats.invalid_rate,
-                 this_rd_stats.rate == 0));
-
-  ref_best_rd = AOMMIN(rd, ref_best_rd);
-  if (rd < best_rd) {
+  if (rd < INT64_MAX) {
     *rd_stats = this_rd_stats;
     found = 1;
   }
@@ -5954,136 +6009,76 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
   if (!found) return;
 
   // Save the RD search results into tx_rd_record.
-  if (within_border && cpi->sf.use_mb_rd_hash)
+  if (is_mb_rd_hash_enabled) {
+    assert(mb_rd_record != NULL);
     save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
-}
-
-#define FAVOR_CHROMA_SKIP 1
-static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
-                          int blk_col, int plane, int block, TX_SIZE tx_size,
-                          BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
-                          ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats,
-                          FAST_TX_SEARCH_MODE ftxs_mode) {
-  assert(plane > 0);
-  assert(tx_size < TX_SIZES_ALL);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-
-  ENTROPY_CONTEXT *ta = above_ctx + blk_col;
-  ENTROPY_CONTEXT *tl = left_ctx + blk_row;
-  TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx);
-  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_UV]
-                                .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
-  tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize,
-                &txb_ctx, rd_stats, ftxs_mode, INT64_MAX, NULL);
-
-  const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-  const int blk_idx = blk_row * mi_width + blk_col;
-  const int64_t rdmult = x->rdmult * plane_rd_mult[1][PLANE_TYPE_UV] /
-                         plane_rd_mult[1][PLANE_TYPE_Y];
-  av1_set_txb_context(x, plane, block, tx_size, ta, tl);
-  if ((RDCOST(rdmult, rd_stats->rate, rd_stats->dist) >=
-           RDCOST(rdmult, zero_blk_rate, rd_stats->sse) ||
-       rd_stats->skip == 1) &&
-      !xd->lossless[mbmi->segment_id]) {
-    rd_stats->rate = zero_blk_rate;
-    rd_stats->dist = rd_stats->sse;
-    rd_stats->skip = 1;
-#if FAVOR_CHROMA_SKIP
-    x->plane[plane].eobs[block] = 0;
-    x->plane[plane].txb_entropy_ctx[block] = 0;
-    set_blk_skip(x, plane, blk_idx, 1);
-#else
-    set_blk_skip(x, plane, blk_idx, 0);
-#endif
-  } else {
-    set_blk_skip(x, plane, blk_idx, 0);
   }
 }
 
-// Return value 0: early termination triggered, no valid rd cost available;
-//              1: rd cost values are valid.
-static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
-                            RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                            int64_t non_skip_ref_best_rd,
-                            int64_t skip_ref_best_rd,
-                            FAST_TX_SEARCH_MODE ftxs_mode) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  int plane;
-  int is_cost_valid = 1;
-  int64_t this_rd = 0;
-  int64_t skip_rd = 0;
-
-  if ((non_skip_ref_best_rd < 0) && (skip_ref_best_rd < 0)) is_cost_valid = 0;
-
-  av1_init_rd_stats(rd_stats);
+static void model_rd_for_sb_with_fullrdy(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+  const int ref = xd->mi[0]->ref_frame[0];
 
-  if (x->skip_chroma_rd) {
-    if (!is_cost_valid) av1_invalid_rd_stats(rd_stats);
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  int64_t total_sse = 0;
 
-    return is_cost_valid;
-  }
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    struct macroblock_plane *const p = &x->plane[plane];
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int bw = block_size_wide[plane_bsize];
+    const int bh = block_size_high[plane_bsize];
+    int64_t sse;
+    int rate;
+    int64_t dist;
 
-  const BLOCK_SIZE bsizec = scale_chroma_bsize(
-      bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+    if (x->skip_chroma_rd && plane) continue;
 
-  if (is_inter_block(mbmi) && is_cost_valid) {
-    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
-      av1_subtract_plane(x, bsizec, plane);
-  }
+    if (is_cur_buf_hbd(xd)) {
+      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                           pd->dst.stride, bw, bh);
+    } else {
+      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+                    bh);
+    }
+    sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
 
-  if (is_cost_valid) {
-    for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const BLOCK_SIZE plane_bsize =
-          get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
-      const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-      const int mi_height =
-          block_size_high[plane_bsize] >> tx_size_high_log2[0];
-      const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
-      const int bh = tx_size_high_unit[max_tx_size];
-      const int bw = tx_size_wide_unit[max_tx_size];
-      int idx, idy;
-      int block = 0;
-      const int step = bh * bw;
-      ENTROPY_CONTEXT ta[MAX_MIB_SIZE];
-      ENTROPY_CONTEXT tl[MAX_MIB_SIZE];
-      av1_get_entropy_contexts(bsizec, pd, ta, tl);
-
-      for (idy = 0; idy < mi_height; idy += bh) {
-        for (idx = 0; idx < mi_width; idx += bw) {
-          RD_STATS pn_rd_stats;
-          av1_init_rd_stats(&pn_rd_stats);
-          tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size,
-                        plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode);
-          if (pn_rd_stats.rate == INT_MAX) {
-            av1_invalid_rd_stats(rd_stats);
-            return 0;
-          }
-          av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-          this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-          skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
-          if ((this_rd > non_skip_ref_best_rd) &&
-              (skip_rd > skip_ref_best_rd)) {
-            av1_invalid_rd_stats(rd_stats);
-            return 0;
-          }
-          block += step;
-        }
+    RD_STATS rd_stats;
+    if (plane == 0) {
+      pick_tx_size_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col,
+                            INT64_MAX);
+      if (rd_stats.invalid_rate) {
+        rate = 0;
+        dist = sse << 4;
+      } else {
+        rate = rd_stats.rate;
+        dist = rd_stats.dist;
       }
+    } else {
+      model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+                            &dist);
     }
-  } else {
-    // reset cost value
-    av1_invalid_rd_stats(rd_stats);
+
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+    total_sse += sse;
+    rate_sum += rate;
+    dist_sum += dist;
+
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
   }
 
-  return is_cost_valid;
+  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+  *out_rate_sum = (int)rate_sum;
+  *out_dist_sum = dist_sum;
 }
 
 static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -6331,7 +6326,7 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
 
   const BLOCK_SIZE bsize = mbmi->sb_type;
 #if CONFIG_DEBUG
-  assert(is_cfl_allowed(xd));
+  assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
   const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
   const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
   const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, ssx, ssy);
@@ -6368,7 +6363,7 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
         mbmi->cfl_alpha_idx = 0;
         mbmi->cfl_alpha_signs = joint_sign;
         txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, bsize,
-                         tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+                         tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE, 0);
         if (rd_stats.rate == INT_MAX) break;
       }
       const int alpha_rate = x->cfl_cost[joint_sign][plane][0];
@@ -6396,7 +6391,8 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
             mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c;
             mbmi->cfl_alpha_signs = joint_sign;
             txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, bsize,
-                             tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+                             tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE,
+                             0);
             if (rd_stats.rate == INT_MAX) break;
           }
           const int alpha_rate = x->cfl_cost[joint_sign][plane][c];
@@ -6469,18 +6465,24 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
           (1 << mode)))
       continue;
+    if (!cpi->oxcf.enable_smooth_intra && mode >= UV_SMOOTH_PRED &&
+        mode <= UV_SMOOTH_H_PRED)
+      continue;
+
+    if (!cpi->oxcf.enable_paeth_intra && mode == UV_PAETH_PRED) continue;
 
     mbmi->uv_mode = mode;
     int cfl_alpha_rate = 0;
     if (mode == UV_CFL_PRED) {
-      if (!is_cfl_allowed(xd)) continue;
+      if (!is_cfl_allowed(xd) || !cpi->oxcf.enable_cfl_intra) continue;
       assert(!is_directional_mode);
       const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
       cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd);
       if (cfl_alpha_rate == INT_MAX) continue;
     }
     mbmi->angle_delta[PLANE_TYPE_UV] = 0;
-    if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) {
+    if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type) &&
+        cpi->oxcf.enable_angle_delta) {
       const int rate_overhead =
           x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
       if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
@@ -6497,7 +6499,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     this_rate = tokenonly_rd_stats.rate +
                 intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
     if (mode == UV_CFL_PRED) {
-      assert(is_cfl_allowed(xd));
+      assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
 #if CONFIG_DEBUG
       if (!xd->lossless[mbmi->segment_id])
         assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost);
@@ -6516,6 +6518,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   }
 
   const int try_palette =
+      cpi->oxcf.enable_palette &&
       av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
   if (try_palette) {
     uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
@@ -6619,35 +6622,6 @@ static int get_interinter_compound_mask_rate(const MACROBLOCK *const x,
   }
 }
 
-typedef struct {
-  int eobs;
-  int brate;
-  int byrate;
-  int64_t bdist;
-  int64_t bsse;
-  int64_t brdcost;
-  int_mv mvs[2];
-  int_mv pred_mv[2];
-  int_mv ref_mv[2];
-
-  ENTROPY_CONTEXT ta[2];
-  ENTROPY_CONTEXT tl[2];
-} SEG_RDSTAT;
-
-typedef struct {
-  int_mv *ref_mv[2];
-  int_mv mvp;
-
-  int64_t segment_rd;
-  int r;
-  int64_t d;
-  int64_t sse;
-  int segment_yrate;
-  PREDICTION_MODE modes[4];
-  SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
-  int mvthresh;
-} BEST_SEG_INFO;
-
 static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
   return (mv->row >> 3) < mv_limits->row_min ||
          (mv->row >> 3) > mv_limits->row_max ||
@@ -6693,7 +6667,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
 
   ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-  conv_params.use_jnt_comp_avg = 0;
+  conv_params.use_dist_wtd_comp_avg = 0;
   WarpTypesAllowed warp_types[2];
   for (ref = 0; ref < 2; ++ref) {
     const WarpedMotionParams *const wm =
@@ -6734,7 +6708,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
       } else {
         int_mv cur_int_mv, init_int_mv;
         cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
-        cur_int_mv.as_mv.row = cur_mv[id].as_mv.col >> 3;
+        cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
         init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
         init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
         if (cur_int_mv.as_int == init_int_mv.as_int) {
@@ -6780,9 +6754,9 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
                               mi_row * MI_SIZE, xd, cm->allow_warped_motion);
 
     const int order_idx = id != 0;
-    av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
-                               &xd->jcp_param.bck_offset,
-                               &xd->jcp_param.use_jnt_comp_avg, 1);
+    av1_dist_wtd_comp_weight_assign(
+        cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
+        &xd->jcp_param.bck_offset, &xd->jcp_param.use_dist_wtd_comp_avg, 1);
 
     // Do full-pixel compound motion search on the current reference frame.
     if (id) xd->plane[plane].pre[0] = ref_yv12[id];
@@ -7036,19 +7010,25 @@ static void setup_buffer_ref_mvs_inter(
     struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
-  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref_frame);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  const struct scale_factors *const sf =
-      &cm->current_frame.frame_refs[ref_frame - 1].sf;
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-
+  const struct scale_factors *const sf =
+      get_ref_scale_factors_const(cm, ref_frame);
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
   assert(yv12 != NULL);
 
-  // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
-  // use the UV scaling factors.
-  av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
-                       num_planes);
+  if (scaled_ref_frame) {
+    // Setup pred block based on scaled reference, because av1_mv_pred() doesn't
+    // support scaling.
+    av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, mi_row,
+                         mi_col, NULL, NULL, num_planes);
+  } else {
+    av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
+                         num_planes);
+  }
 
   // Gets an initial list of candidate vectors from neighbours and orders them
   av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
@@ -7056,11 +7036,18 @@ static void setup_buffer_ref_mvs_inter(
                    mi_col, mbmi_ext->mode_context);
 
   // Further refinement that is encode side only to test the top few candidates
-  // in full and choose the best as the centre point for subsequent searches.
+  // in full and choose the best as the center point for subsequent searches.
   // The current implementation doesn't support scaling.
-  (void)block_size;
-  av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
-              block_size);
+  av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12_mb[ref_frame][0].stride,
+              ref_frame, block_size);
+
+  // Go back to unscaled reference.
+  if (scaled_ref_frame) {
+    // We had temporarily setup pred block based on scaled reference above. Go
+    // back to unscaled reference now, for subsequent use.
+    av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
+                         num_planes);
+  }
 }
 
 static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -7165,13 +7152,13 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
       bestsme = av1_full_pixel_search(
           cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
           sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
-          (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0);
+          (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0, &cpi->ss_cfg[SS_CFG_SRC]);
       break;
     case OBMC_CAUSAL:
-      bestsme = av1_obmc_full_pixel_search(cpi, x, &mvp_full, step_param, sadpb,
-                                           MAX_MVSEARCH_STEPS - 1 - step_param,
-                                           1, &cpi->fn_ptr[bsize], &ref_mv,
-                                           &(x->best_mv.as_mv), 0);
+      bestsme = av1_obmc_full_pixel_search(
+          cpi, x, &mvp_full, step_param, sadpb,
+          MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
+          &(x->best_mv.as_mv), 0, &cpi->ss_cfg[SS_CFG_SRC]);
       break;
     default: assert(0 && "Invalid motion mode!\n");
   }
@@ -7264,10 +7251,9 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
     x->pred_mv[ref] = x->best_mv.as_mv;
 }
 
-static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst,
+static INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst,
                                    const int num_planes) {
-  int i;
-  for (i = 0; i < num_planes; i++) {
+  for (int i = 0; i < num_planes; i++) {
     xd->plane[i].dst.buf = dst.plane[i];
     xd->plane[i].dst.stride = dst.stride[i];
   }
@@ -7314,9 +7300,9 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
                             plane, !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE,
                             mi_row * MI_SIZE, xd, cm->allow_warped_motion);
 
-  av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
-                             &xd->jcp_param.bck_offset,
-                             &xd->jcp_param.use_jnt_comp_avg, 1);
+  av1_dist_wtd_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
+                                  &xd->jcp_param.bck_offset,
+                                  &xd->jcp_param.use_dist_wtd_comp_avg, 1);
 }
 
 // Search for the best mv for one component of a compound,
@@ -7442,7 +7428,7 @@ static void compound_single_motion_search_interinter(
   // Prediction buffer from second frame.
   DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
   uint8_t *second_pred;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+  if (is_cur_buf_hbd(xd))
     second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
   else
     second_pred = (uint8_t *)second_pred_alloc_16;
@@ -7572,7 +7558,7 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
   const BLOCK_SIZE f_index = split_qtr[bsize];
   assert(f_index != BLOCK_INVALID);
 
-  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(&x->e_mbd)) {
     pred0 = CONVERT_TO_BYTEPTR(pred0);
     pred1 = CONVERT_TO_BYTEPTR(pred1);
   }
@@ -7622,7 +7608,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
   int wedge_types = (1 << get_wedge_bits_lookup(bsize));
   const uint8_t *mask;
   uint64_t sse;
-  const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int hbd = is_cur_buf_hbd(xd);
   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
 
   DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]);  // src - pred0
@@ -7693,7 +7679,7 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
   int wedge_types = (1 << get_wedge_bits_lookup(bsize));
   const uint8_t *mask;
   uint64_t sse;
-  const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int hbd = is_cur_buf_hbd(xd);
   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
   for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
     mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
@@ -7759,7 +7745,7 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
   DIFFWTD_MASK_TYPE cur_mask_type;
   int64_t best_rd = INT64_MAX;
   DIFFWTD_MASK_TYPE best_mask_type = 0;
-  const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int hbd = is_cur_buf_hbd(xd);
   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
   DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
   uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
@@ -7810,7 +7796,7 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
   const int bh = block_size_high[bsize];
   DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]);  // src - pred1
   DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]);     // pred1 - pred0
-  if (get_bitdepth_data_path_index(xd)) {
+  if (is_cur_buf_hbd(xd)) {
     aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
                               CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
     aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
@@ -7889,7 +7875,7 @@ static void get_inter_predictors_masked_compound(
   av1_build_inter_predictors_for_planes_single_buf(
       xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
   const struct buf_2d *const src = &x->plane[0].src;
-  if (get_bitdepth_data_path_index(xd)) {
+  if (is_cur_buf_hbd(xd)) {
     aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
                               CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd);
     aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
@@ -7904,21 +7890,24 @@ static void get_inter_predictors_masked_compound(
 static int64_t build_and_cost_compound_type(
     const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
     const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
-    int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
+    int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
     uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
     int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd,
-    int *calc_pred_masked_compound) {
+    int *calc_pred_masked_compound, int32_t *comp_rate, int64_t *comp_dist,
+    int64_t *const comp_model_rd, const int64_t comp_best_model_rd,
+    int64_t *const comp_model_rd_cur) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  int rate_sum;
-  int64_t dist_sum;
   int64_t best_rd_cur = INT64_MAX;
   int64_t rd = INT64_MAX;
-  int tmp_skip_txfm_sb;
-  int64_t tmp_skip_sse_sb;
   const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+  int rate_sum, tmp_skip_txfm_sb;
+  int64_t dist_sum, tmp_skip_sse_sb;
 
+  // TODO(any): Save pred and mask calculation as well into records. However
+  // this may increase memory requirements as compound segment mask needs to be
+  // stored in each record.
   if (*calc_pred_masked_compound) {
     get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0,
                                          preds1, residual1, diff10, strides);
@@ -7926,7 +7915,7 @@ static int64_t build_and_cost_compound_type(
   }
   if (cpi->sf.prune_wedge_pred_diff_based && compound_type == COMPOUND_WEDGE) {
     unsigned int sse;
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    if (is_cur_buf_hbd(xd))
       (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
                                   CONVERT_TO_BYTEPTR(*preds1), *strides, &sse);
     else
@@ -7934,8 +7923,10 @@ static int64_t build_and_cost_compound_type(
     const unsigned int mse =
         ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
     // If two predictors are very similar, skip wedge compound mode search
-    if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64))
+    if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) {
+      *comp_model_rd_cur = INT64_MAX;
       return INT64_MAX;
+    }
   }
 
   best_rd_cur =
@@ -7947,34 +7938,76 @@ static int64_t build_and_cost_compound_type(
   // is unlikely to be the best mode considering the transform rd cost and other
   // mode overhead cost
   int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
-  if (mode_rd > ref_best_rd) return INT64_MAX;
-
-  if (have_newmv_in_inter_mode(this_mode) && compound_type == COMPOUND_WEDGE) {
-    *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize,
-                                                     this_mode, mi_row, mi_col);
-    av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
-    model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
-        cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
-        &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
-    rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
-    if (rd >= best_rd_cur) {
-      mbmi->mv[0].as_int = cur_mv[0].as_int;
-      mbmi->mv[1].as_int = cur_mv[1].as_int;
+  if (mode_rd > ref_best_rd) {
+    *comp_model_rd_cur = INT64_MAX;
+    return INT64_MAX;
+  }
+
+  // Reuse data if matching record is found
+  if (comp_rate[compound_type] == INT_MAX) {
+    if (have_newmv_in_inter_mode(this_mode) &&
+        compound_type == COMPOUND_WEDGE &&
+        !cpi->sf.disable_interinter_wedge_newmv_search) {
+      *out_rate_mv = interinter_compound_motion_search(
+          cpi, x, cur_mv, bsize, this_mode, mi_row, mi_col);
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+
+      model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+          &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+      rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+      *comp_model_rd_cur = rd;
+      if (rd >= best_rd_cur) {
+        mbmi->mv[0].as_int = cur_mv[0].as_int;
+        mbmi->mv[1].as_int = cur_mv[1].as_int;
+        *out_rate_mv = rate_mv;
+        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+                                                 strides, preds1, strides);
+        *comp_model_rd_cur = best_rd_cur;
+      }
+    } else {
       *out_rate_mv = rate_mv;
       av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
                                                preds1, strides);
+      model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+          &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+      *comp_model_rd_cur =
+          RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
     }
 
+    RD_STATS rd_stats;
+
+    if (cpi->sf.prune_comp_type_by_model_rd &&
+        (*comp_model_rd_cur > comp_best_model_rd) &&
+        comp_best_model_rd != INT64_MAX) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+    rd = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &rd_stats);
+    if (rd != INT64_MAX) {
+      rd =
+          RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist);
+      // Backup rate and distortion for future reuse
+      comp_rate[compound_type] = rd_stats.rate;
+      comp_dist[compound_type] = rd_stats.dist;
+      comp_model_rd[compound_type] = *comp_model_rd_cur;
+    }
   } else {
+    assert(comp_dist[compound_type] != INT64_MAX);
+    // When disable_interinter_wedge_newmv_search is set, motion refinement is
+    // disabled. Hence rate and distortion can be reused in this case as well
+    assert(IMPLIES(have_newmv_in_inter_mode(this_mode),
+                   cpi->sf.disable_interinter_wedge_newmv_search));
+    assert(mbmi->mv[0].as_int == cur_mv[0].as_int);
+    assert(mbmi->mv[1].as_int == cur_mv[1].as_int);
     *out_rate_mv = rate_mv;
-    av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
-                                             preds1, strides);
+    // Calculate RD cost based on stored stats
+    rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type],
+                comp_dist[compound_type]);
+    *comp_model_rd_cur = comp_model_rd[compound_type];
   }
-  rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                           &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
-  if (rd != INT64_MAX)
-    rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
-
   return rd;
 }
 
@@ -8172,8 +8205,9 @@ static INLINE int get_switchable_rate(MACROBLOCK *const x,
 
 // calculate the rdcost of given interpolation_filter
 static INLINE int64_t interpolation_filter_rd(
-    MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-    int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
+    const BUFFER_SET *const orig_dst, int64_t *const rd,
     int *const switchable_rate, int *const skip_txfm_sb,
     int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx,
     const int switchable_ctx[2], const int skip_pred, int *rate,
@@ -8196,6 +8230,8 @@ static INLINE int64_t interpolation_filter_rd(
     return 0;
   }
 
+  (void)tile_data;
+
   assert(skip_pred != 2);
   assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags));
   assert(rate[0] >= 0);
@@ -8209,11 +8245,13 @@ static INLINE int64_t interpolation_filter_rd(
 
   if (skip_pred != cpi->default_interp_skip_flags) {
     if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) {
-      av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
 #if CONFIG_COLLECT_RD_STATS == 3
       RD_STATS rd_stats_y;
-      select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
-      PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+      pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col,
+                            INT64_MAX);
+      PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
 #endif  // CONFIG_COLLECT_RD_STATS == 3
       model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
           cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
@@ -8234,8 +8272,8 @@ static INLINE int64_t interpolation_filter_rd(
           mbmi->interp_filters = last_best;
           return 0;
         }
-        av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                       plane);
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      plane, plane);
         model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
             cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv,
             &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL);
@@ -8287,21 +8325,103 @@ static INLINE int64_t interpolation_filter_rd(
   return 0;
 }
 
+static INLINE void pred_dual_interp_filter_rd(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
+    const BUFFER_SET *const orig_dst, int64_t *const rd,
+    int *const switchable_rate, int *const skip_txfm_sb,
+    int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
+    InterpFilters filter_idx, const int switchable_ctx[2], const int skip_pred,
+    int *rate, int64_t *dist, InterpFilters af_horiz, InterpFilters af_vert,
+    InterpFilters lf_horiz, InterpFilters lf_vert) {
+  if ((af_horiz == lf_horiz) && (af_horiz != SWITCHABLE)) {
+    if (((af_vert == lf_vert) && (af_vert != SWITCHABLE))) {
+      filter_idx = af_horiz + (af_vert * SWITCHABLE_FILTERS);
+      if (filter_idx) {
+        interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+                                orig_dst, rd, switchable_rate, skip_txfm_sb,
+                                skip_sse_sb, dst_bufs, filter_idx,
+                                switchable_ctx, skip_pred, rate, dist);
+      }
+    } else {
+      for (filter_idx = af_horiz; filter_idx < (DUAL_FILTER_SET_SIZE);
+           filter_idx += SWITCHABLE_FILTERS) {
+        if (filter_idx) {
+          interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+                                  orig_dst, rd, switchable_rate, skip_txfm_sb,
+                                  skip_sse_sb, dst_bufs, filter_idx,
+                                  switchable_ctx, skip_pred, rate, dist);
+        }
+      }
+    }
+  } else if ((af_vert == lf_vert) && (af_vert != SWITCHABLE)) {
+    for (filter_idx = (af_vert * SWITCHABLE_FILTERS);
+         filter_idx <= ((af_vert * SWITCHABLE_FILTERS) + 2); filter_idx += 1) {
+      if (filter_idx) {
+        interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+                                orig_dst, rd, switchable_rate, skip_txfm_sb,
+                                skip_sse_sb, dst_bufs, filter_idx,
+                                switchable_ctx, skip_pred, rate, dist);
+      }
+    }
+  }
+}
+
 // Find the best interp filter if dual_interp_filter = 0
 static INLINE void find_best_non_dual_interp_filter(
-    MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-    int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
+    const BUFFER_SET *const orig_dst, int64_t *const rd,
     int *const switchable_rate, int *const skip_txfm_sb,
     int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
     const int switchable_ctx[2], const int skip_ver, const int skip_hor,
     int *rate, int64_t *dist, int filter_set_size) {
   int16_t i;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
 
   // Regular filter evaluation should have been done and hence the same should
   // be the winner
   assert(x->e_mbd.mi[0]->interp_filters == filter_sets[0]);
   assert(filter_set_size == DUAL_FILTER_SET_SIZE);
-
+  if ((skip_hor & skip_ver) != cpi->default_interp_skip_flags) {
+    const AV1_COMMON *cm = &cpi->common;
+    int bsl, pred_filter_search;
+    InterpFilters af = SWITCHABLE, lf = SWITCHABLE, filter_idx = 0;
+    const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+    const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+    bsl = mi_size_wide_log2[bsize];
+    pred_filter_search =
+        cpi->sf.cb_pred_filter_search
+            ? (((mi_row + mi_col) >> bsl) +
+               get_chessboard_index(cm->current_frame.frame_number)) &
+                  0x1
+            : 0;
+    if (above_mbmi && is_inter_block(above_mbmi)) {
+      af = above_mbmi->interp_filters;
+    }
+    if (left_mbmi && is_inter_block(left_mbmi)) {
+      lf = left_mbmi->interp_filters;
+    }
+    pred_filter_search &= ((af == lf) && (af != SWITCHABLE));
+    if (pred_filter_search) {
+      filter_idx = SWITCHABLE * (af & 0xf);
+      // This assert tells that (filter_x == filter_y) for non-dual filter case
+      assert((filter_sets[filter_idx] & 0xffff) ==
+             (filter_sets[filter_idx] >> 16));
+      if (cpi->sf.adaptive_interp_filter_search &&
+          (cpi->sf.interp_filter_search_mask & (1 << (filter_idx >> 2)))) {
+        return;
+      }
+      if (filter_idx) {
+        interpolation_filter_rd(
+            x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
+            switchable_rate, skip_txfm_sb, skip_sse_sb, dst_bufs, filter_idx,
+            switchable_ctx, (skip_hor & skip_ver), rate, dist);
+      }
+      return;
+    }
+  }
   // Reuse regular filter's modeled rd data for sharp filter for following
   // cases
   // 1) When bsize is 4x4
@@ -8321,10 +8441,14 @@ static INLINE void find_best_non_dual_interp_filter(
     for (i = filter_set_size - 1; i > 0; i -= (SWITCHABLE_FILTERS + 1)) {
       // This assert tells that (filter_x == filter_y) for non-dual filter case
       assert((filter_sets[i] & 0xffff) == (filter_sets[i] >> 16));
-      interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
-                              switchable_rate, skip_txfm_sb, skip_sse_sb,
-                              dst_bufs, i, switchable_ctx, skip_pred, rate,
-                              dist);
+      if (cpi->sf.adaptive_interp_filter_search &&
+          (cpi->sf.interp_filter_search_mask & (1 << (i >> 2)))) {
+        continue;
+      }
+      interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+                              orig_dst, rd, switchable_rate, skip_txfm_sb,
+                              skip_sse_sb, dst_bufs, i, switchable_ctx,
+                              skip_pred, rate, dist);
       skip_pred = (skip_hor & skip_ver);
     }
   } else {
@@ -8333,10 +8457,14 @@ static INLINE void find_best_non_dual_interp_filter(
          i += (SWITCHABLE_FILTERS + 1)) {
       // This assert tells that (filter_x == filter_y) for non-dual filter case
       assert((filter_sets[i] & 0xffff) == (filter_sets[i] >> 16));
-      interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
-                              switchable_rate, skip_txfm_sb, skip_sse_sb,
-                              dst_bufs, i, switchable_ctx, skip_pred, rate,
-                              dist);
+      if (cpi->sf.adaptive_interp_filter_search &&
+          (cpi->sf.interp_filter_search_mask & (1 << (i >> 2)))) {
+        continue;
+      }
+      interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+                              orig_dst, rd, switchable_rate, skip_txfm_sb,
+                              skip_sse_sb, dst_bufs, i, switchable_ctx,
+                              skip_pred, rate, dist);
       // In first iteration, smooth filter is evaluated. If smooth filter
       // (which is less sharper) is the winner among regular and smooth filters,
       // sharp filter evaluation is skipped
@@ -8344,8 +8472,6 @@ static INLINE void find_best_non_dual_interp_filter(
       // accounting switchable filter rate)
       if (cpi->sf.skip_sharp_interp_filter_search &&
           skip_pred != cpi->default_interp_skip_flags) {
-        MACROBLOCKD *const xd = &x->e_mbd;
-        MB_MODE_INFO *const mbmi = xd->mi[0];
         if (mbmi->interp_filters == filter_sets[(SWITCHABLE_FILTERS + 1)])
           break;
       }
@@ -8366,6 +8492,52 @@ static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
   return 1;
 }
 
+// Checks if characteristics of search match
+static INLINE int is_comp_rd_match(const AV1_COMP *const cpi,
+                                   const MACROBLOCK *const x,
+                                   const COMP_RD_STATS *st,
+                                   const MB_MODE_INFO *const mi,
+                                   int32_t *comp_rate, int64_t *comp_dist,
+                                   int64_t *comp_model_rd) {
+  // TODO(ranjit): Ensure that compound type search use regular filter always
+  // and check if following check can be removed
+  // Check if interp filter matches with previous case
+  if (st->filter != mi->interp_filters) return 0;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  // Match MV and reference indices
+  for (int i = 0; i < 2; ++i) {
+    if ((st->ref_frames[i] != mi->ref_frame[i]) ||
+        (st->mv[i].as_int != mi->mv[i].as_int)) {
+      return 0;
+    }
+    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]];
+    if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0;
+  }
+
+  // Store the stats for compound average
+  comp_rate[COMPOUND_AVERAGE] = st->rate[COMPOUND_AVERAGE];
+  comp_dist[COMPOUND_AVERAGE] = st->dist[COMPOUND_AVERAGE];
+  comp_model_rd[COMPOUND_AVERAGE] = st->comp_model_rd[COMPOUND_AVERAGE];
+  comp_rate[COMPOUND_DISTWTD] = st->rate[COMPOUND_DISTWTD];
+  comp_dist[COMPOUND_DISTWTD] = st->dist[COMPOUND_DISTWTD];
+  comp_model_rd[COMPOUND_DISTWTD] = st->comp_model_rd[COMPOUND_DISTWTD];
+
+  // For compound wedge/segment, reuse data only if NEWMV is not present in
+  // either of the directions
+  if ((!have_newmv_in_inter_mode(mi->mode) &&
+       !have_newmv_in_inter_mode(st->mode)) ||
+      (cpi->sf.disable_interinter_wedge_newmv_search)) {
+    memcpy(&comp_rate[COMPOUND_WEDGE], &st->rate[COMPOUND_WEDGE],
+           sizeof(comp_rate[COMPOUND_WEDGE]) * 2);
+    memcpy(&comp_dist[COMPOUND_WEDGE], &st->dist[COMPOUND_WEDGE],
+           sizeof(comp_dist[COMPOUND_WEDGE]) * 2);
+    memcpy(&comp_model_rd[COMPOUND_WEDGE], &st->comp_model_rd[COMPOUND_WEDGE],
+           sizeof(comp_model_rd[COMPOUND_WEDGE]) * 2);
+  }
+  return 1;
+}
+
 static INLINE int find_interp_filter_in_stats(MACROBLOCK *x,
                                               MB_MODE_INFO *const mbmi) {
   const int comp_idx = mbmi->compound_idx;
@@ -8379,9 +8551,27 @@ static INLINE int find_interp_filter_in_stats(MACROBLOCK *x,
   }
   return -1;  // no match result found
 }
+// Checks if similar compound type search case is accounted earlier
+// If found, returns relevant rd data
+static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi,
+                                        const MACROBLOCK *x,
+                                        const MB_MODE_INFO *const mbmi,
+                                        int32_t *comp_rate, int64_t *comp_dist,
+                                        int64_t *comp_model_rd) {
+  for (int j = 0; j < x->comp_rd_stats_idx; ++j) {
+    if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate,
+                         comp_dist, comp_model_rd)) {
+      return 1;
+    }
+  }
+  return 0;  // no match result found
+}
 
 static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
-                                                  MB_MODE_INFO *const mbmi) {
+                                                  MB_MODE_INFO *const mbmi,
+                                                  int64_t rd, int skip_txfm_sb,
+                                                  int64_t skip_sse_sb,
+                                                  unsigned int pred_sse) {
   const int comp_idx = mbmi->compound_idx;
   const int offset = x->interp_filter_stats_idx[comp_idx];
   if (offset < MAX_INTERP_FILTER_STATS) {
@@ -8389,19 +8579,52 @@ static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
                                         { mbmi->mv[0], mbmi->mv[1] },
                                         { mbmi->ref_frame[0],
                                           mbmi->ref_frame[1] },
-                                        mbmi->interinter_comp.type };
+                                        mbmi->interinter_comp.type,
+                                        rd,
+                                        skip_txfm_sb,
+                                        skip_sse_sb,
+                                        pred_sse };
     x->interp_filter_stats[comp_idx][offset] = stat;
     x->interp_filter_stats_idx[comp_idx]++;
   }
 }
 
+static INLINE void save_comp_rd_search_stat(MACROBLOCK *x,
+                                            const MB_MODE_INFO *const mbmi,
+                                            const int32_t *comp_rate,
+                                            const int64_t *comp_dist,
+                                            const int64_t *comp_model_rd,
+                                            const int_mv *cur_mv) {
+  const int offset = x->comp_rd_stats_idx;
+  if (offset < MAX_COMP_RD_STATS) {
+    COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset;
+    memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate));
+    memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist));
+    memcpy(rd_stats->comp_model_rd, comp_model_rd,
+           sizeof(rd_stats->comp_model_rd));
+    memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv));
+    memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames));
+    rd_stats->mode = mbmi->mode;
+    rd_stats->filter = mbmi->interp_filters;
+    rd_stats->ref_mv_idx = mbmi->ref_mv_idx;
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    for (int i = 0; i < 2; ++i) {
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mbmi->ref_frame[i]];
+      rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype);
+    }
+    ++x->comp_rd_stats_idx;
+  }
+}
+
 static int64_t interpolation_filter_search(
-    MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-    int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
-    BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES],
-    int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
-    int64_t *const skip_sse_sb, const int skip_build_pred,
-    HandleInterModeArgs *args, int64_t ref_best_rd) {
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
+    const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+    InterpFilter (*const single_filter)[REF_FRAMES], int64_t *const rd,
+    int *const switchable_rate, int *const skip_txfm_sb,
+    int64_t *const skip_sse_sb, int *skip_build_pred, HandleInterModeArgs *args,
+    int64_t ref_best_rd) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -8418,12 +8641,23 @@ static int64_t interpolation_filter_search(
   const int ref_frame = xd->mi[0]->ref_frame[0];
 
   (void)single_filter;
-  int match_found = -1;
+  int match_found_idx = -1;
   const InterpFilter assign_filter = cm->interp_filter;
   if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
-    match_found = find_interp_filter_in_stats(x, mbmi);
+    match_found_idx = find_interp_filter_in_stats(x, mbmi);
+  }
+  if (match_found_idx != -1) {
+    const int comp_idx = mbmi->compound_idx;
+    *rd = x->interp_filter_stats[comp_idx][match_found_idx].rd;
+    *skip_txfm_sb =
+        x->interp_filter_stats[comp_idx][match_found_idx].skip_txfm_sb;
+    *skip_sse_sb =
+        x->interp_filter_stats[comp_idx][match_found_idx].skip_sse_sb;
+    x->pred_sse[ref_frame] =
+        x->interp_filter_stats[comp_idx][match_found_idx].pred_sse;
+    return 0;
   }
-  if (!need_search || match_found == -1) {
+  if (!need_search || match_found_idx == -1) {
     set_default_interp_filters(mbmi, assign_filter);
   }
   int switchable_ctx[2];
@@ -8431,13 +8665,16 @@ static int64_t interpolation_filter_search(
   switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
   *switchable_rate =
       get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
-  if (!skip_build_pred)
-    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+  if (!(*skip_build_pred)) {
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 0,
+                                  av1_num_planes(cm) - 1);
+    *skip_build_pred = 1;
+  }
 
 #if CONFIG_COLLECT_RD_STATS == 3
   RD_STATS rd_stats_y;
-  select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
-  PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+  pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+  PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
 #endif  // CONFIG_COLLECT_RD_STATS == 3
   model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
       cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
@@ -8458,7 +8695,7 @@ static int64_t interpolation_filter_search(
   *skip_sse_sb = best_skip_sse_sb[1];
   x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
 
-  if (assign_filter != SWITCHABLE || match_found != -1) {
+  if (assign_filter != SWITCHABLE || match_found_idx != -1) {
     return 0;
   }
   if (!need_search) {
@@ -8493,9 +8730,8 @@ static int64_t interpolation_filter_search(
   const int is_compound = has_second_ref(mbmi);
   assert(is_intrabc_block(mbmi) == 0);
   for (int j = 0; j < 1 + is_compound; ++j) {
-    const RefBuffer *ref_buf =
-        &cm->current_frame.frame_refs[mbmi->ref_frame[j] - LAST_FRAME];
-    const struct scale_factors *const sf = &ref_buf->sf;
+    const struct scale_factors *const sf =
+        get_ref_scale_factors_const(cm, mbmi->ref_frame[j]);
     // TODO(any): Refine skip flag calculation considering scaling
     if (av1_is_scaled(sf)) {
       skip_hor = 0;
@@ -8543,38 +8779,72 @@ static int64_t interpolation_filter_search(
     int best_dual_mode = 0;
     // Find best of {R}x{R,Sm,Sh}
     const int bw = block_size_wide[bsize];
-    int skip_pred = bw <= 4 ? cpi->default_interp_skip_flags : skip_hor;
-    for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
-      if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
-                                  switchable_rate, best_skip_txfm_sb,
-                                  best_skip_sse_sb, dst_bufs, i, switchable_ctx,
-                                  skip_pred, tmp_rate, tmp_dist)) {
-        best_dual_mode = i;
-      }
-      skip_pred = skip_hor;
-    }
-    // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
     const int bh = block_size_high[bsize];
-    skip_pred = bh <= 4 ? cpi->default_interp_skip_flags : skip_ver;
-    assert(filter_set_size == DUAL_FILTER_SET_SIZE);
-    for (i = (best_dual_mode + (SWITCHABLE_FILTERS * 2));
-         i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) {
-      interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
-                              switchable_rate, best_skip_txfm_sb,
-                              best_skip_sse_sb, dst_bufs, i, switchable_ctx,
-                              skip_pred, tmp_rate, tmp_dist);
-      skip_pred = skip_ver;
+    int skip_pred;
+    int bsl, pred_filter_search;
+    InterpFilters af_horiz = SWITCHABLE, af_vert = SWITCHABLE,
+                  lf_horiz = SWITCHABLE, lf_vert = SWITCHABLE, filter_idx = 0;
+    const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+    const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+    bsl = mi_size_wide_log2[bsize];
+    pred_filter_search =
+        cpi->sf.cb_pred_filter_search
+            ? (((mi_row + mi_col) >> bsl) +
+               get_chessboard_index(cm->current_frame.frame_number)) &
+                  0x1
+            : 0;
+    if (above_mbmi && is_inter_block(above_mbmi)) {
+      af_horiz = av1_extract_interp_filter(above_mbmi->interp_filters, 1);
+      af_vert = av1_extract_interp_filter(above_mbmi->interp_filters, 0);
+    }
+    if (left_mbmi && is_inter_block(left_mbmi)) {
+      lf_horiz = av1_extract_interp_filter(left_mbmi->interp_filters, 1);
+      lf_vert = av1_extract_interp_filter(left_mbmi->interp_filters, 0);
+    }
+    pred_filter_search &= !have_newmv_in_inter_mode(mbmi->mode);
+    pred_filter_search &=
+        ((af_horiz == lf_horiz) && (af_horiz != SWITCHABLE)) ||
+        ((af_vert == lf_vert) && (af_vert != SWITCHABLE));
+    if (pred_filter_search) {
+      pred_dual_interp_filter_rd(
+          x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
+          switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs,
+          filter_idx, switchable_ctx, (skip_hor & skip_ver), tmp_rate, tmp_dist,
+          af_horiz, af_vert, lf_horiz, lf_vert);
+    } else {
+      skip_pred = bw <= 4 ? cpi->default_interp_skip_flags : skip_hor;
+      for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
+        if (interpolation_filter_rd(
+                x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
+                switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs,
+                i, switchable_ctx, skip_pred, tmp_rate, tmp_dist)) {
+          best_dual_mode = i;
+        }
+        skip_pred = skip_hor;
+      }
+      // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
+      skip_pred = bh <= 4 ? cpi->default_interp_skip_flags : skip_ver;
+      assert(filter_set_size == DUAL_FILTER_SET_SIZE);
+      for (i = (best_dual_mode + (SWITCHABLE_FILTERS * 2));
+           i >= (best_dual_mode + SWITCHABLE_FILTERS);
+           i -= SWITCHABLE_FILTERS) {
+        interpolation_filter_rd(
+            x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
+            switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, i,
+            switchable_ctx, skip_pred, tmp_rate, tmp_dist);
+        skip_pred = skip_ver;
+      }
     }
   } else if (cm->seq_params.enable_dual_filter == 0) {
     find_best_non_dual_interp_filter(
-        x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+        x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
         best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
         skip_hor, tmp_rate, tmp_dist, filter_set_size);
   } else {
     // EIGHTTAP_REGULAR mode is calculated beforehand
     for (i = 1; i < filter_set_size; ++i) {
-      interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
-                              switchable_rate, best_skip_txfm_sb,
+      interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+                              orig_dst, rd, switchable_rate, best_skip_txfm_sb,
                               best_skip_sse_sb, dst_bufs, i, switchable_ctx,
                               (skip_hor & skip_ver), tmp_rate, tmp_dist);
     }
@@ -8586,7 +8856,8 @@ static int64_t interpolation_filter_search(
     // in either of the directions  Condition below is necessary, but not
     // sufficient
     assert((skip_hor == 1) || (skip_ver == 1));
-    av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                  AOM_PLANE_Y, AOM_PLANE_Y);
   }
   *skip_txfm_sb = best_skip_txfm_sb[1];
   *skip_sse_sb = best_skip_sse_sb[1];
@@ -8594,174 +8865,145 @@ static int64_t interpolation_filter_search(
 
   // save search results
   if (cpi->sf.skip_repeat_interpolation_filter_search) {
-    assert(match_found == -1);
-    save_interp_filter_search_stat(x, mbmi);
+    assert(match_found_idx == -1);
+    save_interp_filter_search_stat(x, mbmi, *rd, *skip_txfm_sb, *skip_sse_sb,
+                                   x->pred_sse[ref_frame]);
   }
   return 0;
 }
 
-static int txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                       int mi_row, int mi_col, RD_STATS *rd_stats,
-                       RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
-                       int mode_rate, int64_t ref_best_rd) {
+static int txfm_search(const AV1_COMP *cpi, const TileDataEnc *tile_data,
+                       MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
+                       RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+                       RD_STATS *rd_stats_uv, int mode_rate,
+                       int64_t ref_best_rd) {
   /*
    * This function combines y and uv planes' transform search processes
-   * together, when the prediction is generated. It first does subtration to
+   * together, when the prediction is generated. It first does subtraction to
    * obtain the prediction error. Then it calls
-   * select_tx_type_yrd/super_block_yrd and inter_block_uvrd sequentially and
-   * handles the early terminations happen in those functions. At the end, it
+   * pick_tx_size_type_yrd/super_block_yrd and super_block_uvrd sequentially and
+   * handles the early terminations happening in those functions. At the end, it
    * computes the rd_stats/_y/_uv accordingly.
    */
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  int skip_txfm_sb = 0;
-  const int num_planes = av1_num_planes(cm);
   const int ref_frame_1 = mbmi->ref_frame[1];
   const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
   const int64_t rd_thresh =
       ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;
   const int skip_ctx = av1_get_skip_context(xd);
+  const int skip_flag_cost[2] = { x->skip_cost[skip_ctx][0],
+                                  x->skip_cost[skip_ctx][1] };
   const int64_t min_header_rate =
-      mode_rate + AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+      mode_rate + AOMMIN(skip_flag_cost[0], skip_flag_cost[1]);
   // Account for minimum skip and non_skip rd.
   // Eventually either one of them will be added to mode_rate
   const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
+  (void)tile_data;
 
   if (min_header_rd_possible > ref_best_rd) {
     av1_invalid_rd_stats(rd_stats_y);
-    av1_invalid_rd_stats(rd_stats);
     return 0;
   }
 
   av1_init_rd_stats(rd_stats);
   av1_init_rd_stats(rd_stats_y);
-  av1_init_rd_stats(rd_stats_uv);
   rd_stats->rate = mode_rate;
 
-  if (!cpi->common.all_lossless)
-    check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
-  if (!skip_txfm_sb) {
-    int64_t non_skip_rdcosty = INT64_MAX;
-    int64_t skip_rdcosty = INT64_MAX;
-    int64_t min_rdcosty = INT64_MAX;
-    int is_cost_valid_uv = 0;
-
-    // cost and distortion
-    av1_subtract_plane(x, bsize, 0);
-    if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
-      // Motion mode
-      select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh);
+  // cost and distortion
+  av1_subtract_plane(x, bsize, 0);
+  if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+    pick_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh);
 #if CONFIG_COLLECT_RD_STATS == 2
-      PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize);
+    PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize);
 #endif  // CONFIG_COLLECT_RD_STATS == 2
-    } else {
-      super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
-      memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
-      for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
-        set_blk_skip(x, 0, i, rd_stats_y->skip);
-    }
-
-    if (rd_stats_y->rate == INT_MAX) {
-      av1_invalid_rd_stats(rd_stats);
-      // TODO(angiebird): check if we need this
-      // restore_dst_buf(xd, *orig_dst, num_planes);
-      mbmi->ref_frame[1] = ref_frame_1;
-      return 0;
-    }
+  } else {
+    super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+    memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+    for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+      set_blk_skip(x, 0, i, rd_stats_y->skip);
+  }
 
-    av1_merge_rd_stats(rd_stats, rd_stats_y);
+  if (rd_stats_y->rate == INT_MAX) {
+    // TODO(angiebird): check if we need this
+    // restore_dst_buf(xd, *orig_dst, num_planes);
+    mbmi->ref_frame[1] = ref_frame_1;
+    return 0;
+  }
 
-    non_skip_rdcosty = RDCOST(
-        x->rdmult, rd_stats->rate + x->skip_cost[skip_ctx][0], rd_stats->dist);
-    skip_rdcosty =
-        RDCOST(x->rdmult, mode_rate + x->skip_cost[skip_ctx][1], rd_stats->sse);
-    min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
+  av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+  const int64_t non_skip_rdcosty =
+      RDCOST(x->rdmult, rd_stats->rate + skip_flag_cost[0], rd_stats->dist);
+  const int64_t skip_rdcosty =
+      RDCOST(x->rdmult, mode_rate + skip_flag_cost[1], rd_stats->sse);
+  const int64_t min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
+  if (min_rdcosty > ref_best_rd) {
+    const int64_t tokenonly_rdy =
+        AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
+               RDCOST(x->rdmult, 0, rd_stats_y->sse));
+    // Invalidate rd_stats_y to skip the rest of the motion modes search
+    if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.prune_motion_mode_level) >
+        rd_thresh)
+      av1_invalid_rd_stats(rd_stats_y);
+    mbmi->ref_frame[1] = ref_frame_1;
+    return 0;
+  }
 
-    if (min_rdcosty > ref_best_rd) {
-      int64_t tokenonly_rdy =
-          AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
-                 RDCOST(x->rdmult, 0, rd_stats_y->sse));
-      // Invalidate rd_stats_y to skip the rest of the motion modes search
-      if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.adaptive_txb_search_level) >
-          rd_thresh)
-        av1_invalid_rd_stats(rd_stats_y);
+  av1_init_rd_stats(rd_stats_uv);
+  const int num_planes = av1_num_planes(cm);
+  if (num_planes > 1) {
+    int64_t ref_best_chroma_rd = ref_best_rd;
+    // Calculate best rd cost possible for chroma
+    if (cpi->sf.perform_best_rd_based_gating_for_chroma &&
+        (ref_best_chroma_rd != INT64_MAX)) {
+      ref_best_chroma_rd =
+          (ref_best_chroma_rd - AOMMIN(non_skip_rdcosty, skip_rdcosty));
+    }
+    const int is_cost_valid_uv =
+        super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd);
+    if (!is_cost_valid_uv) {
       mbmi->ref_frame[1] = ref_frame_1;
       return 0;
     }
+    av1_merge_rd_stats(rd_stats, rd_stats_uv);
+  }
 
-    if (num_planes > 1) {
-      /* clang-format off */
-      is_cost_valid_uv =
-          inter_block_uvrd(cpi, x, rd_stats_uv, bsize,
-                           ref_best_rd - non_skip_rdcosty,
-                           ref_best_rd - skip_rdcosty, FTXS_NONE);
-      if (!is_cost_valid_uv) {
-        mbmi->ref_frame[1] = ref_frame_1;
-        return 0;
-      }
-      /* clang-format on */
-      av1_merge_rd_stats(rd_stats, rd_stats_uv);
-    } else {
-      av1_init_rd_stats(rd_stats_uv);
-    }
-    if (rd_stats->skip) {
-      rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
-      rd_stats_y->rate = 0;
-      rd_stats_uv->rate = 0;
-      rd_stats->rate += x->skip_cost[skip_ctx][1];
-      mbmi->skip = 0;
-      // here mbmi->skip temporarily plays a role as what this_skip2 does
-
-      int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-      if (tmprd > ref_best_rd) {
-        mbmi->ref_frame[1] = ref_frame_1;
-        return 0;
-      }
-#if CONFIG_ONE_PASS_SVM
-      av1_reg_stat_skipmode_update(rd_stats_y, x->rdmult);
-#endif
-    } else if (!xd->lossless[mbmi->segment_id] &&
-               (RDCOST(x->rdmult,
-                       rd_stats_y->rate + rd_stats_uv->rate +
-                           x->skip_cost[skip_ctx][0],
-                       rd_stats->dist) >=
-                RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], rd_stats->sse))) {
-      rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
-      rd_stats->rate += x->skip_cost[skip_ctx][1];
-      rd_stats->dist = rd_stats->sse;
-      rd_stats_y->rate = 0;
-      rd_stats_uv->rate = 0;
-      mbmi->skip = 1;
-#if CONFIG_ONE_PASS_SVM
-      av1_reg_stat_skipmode_update(rd_stats_y, x->rdmult);
-#endif
-    } else {
-      rd_stats->rate += x->skip_cost[skip_ctx][0];
-      mbmi->skip = 0;
-    }
-  } else {
-    x->skip = 1;
-    mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
-    // The cost of skip bit needs to be added.
-    mbmi->skip = 0;
-    rd_stats->rate += x->skip_cost[skip_ctx][1];
-
-    rd_stats->dist = 0;
-    rd_stats->sse = 0;
+  if (rd_stats->skip) {
+    rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
     rd_stats_y->rate = 0;
     rd_stats_uv->rate = 0;
-    rd_stats->skip = 1;
-    int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+    rd_stats->dist = rd_stats->sse;
+    rd_stats_y->dist = rd_stats_y->sse;
+    rd_stats_uv->dist = rd_stats_uv->sse;
+    rd_stats->rate += skip_flag_cost[1];
+    mbmi->skip = 1;
+    // here mbmi->skip temporarily plays a role as what this_skip2 does
+
+    const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
     if (tmprd > ref_best_rd) {
       mbmi->ref_frame[1] = ref_frame_1;
       return 0;
     }
-#if CONFIG_ONE_PASS_SVM
-    av1_add_reg_stat(rd_stats, 0, 0, 0, 0, 0, bsize, bsize);
-    av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
+  } else if (!xd->lossless[mbmi->segment_id] &&
+             (RDCOST(x->rdmult,
+                     rd_stats_y->rate + rd_stats_uv->rate + skip_flag_cost[0],
+                     rd_stats->dist) >=
+              RDCOST(x->rdmult, skip_flag_cost[1], rd_stats->sse))) {
+    rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+    rd_stats->rate += skip_flag_cost[1];
+    rd_stats->dist = rd_stats->sse;
+    rd_stats_y->dist = rd_stats_y->sse;
+    rd_stats_uv->dist = rd_stats_uv->sse;
+    rd_stats_y->rate = 0;
+    rd_stats_uv->rate = 0;
+    mbmi->skip = 1;
+  } else {
+    rd_stats->rate += skip_flag_cost[0];
+    mbmi->skip = 0;
   }
+
   return 1;
 }
 
@@ -8773,18 +9015,30 @@ static INLINE bool enable_wedge_search(MACROBLOCK *const x,
          x->edge_strength > cpi->sf.disable_wedge_search_edge_thresh;
 }
 
+static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interinter_wedge;
+}
+
+static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interintra_wedge &&
+         !cpi->sf.disable_wedge_interintra_search;
+}
+
 static int handle_inter_intra_mode(const AV1_COMP *const cpi,
                                    MACROBLOCK *const x, BLOCK_SIZE bsize,
                                    int mi_row, int mi_col, MB_MODE_INFO *mbmi,
                                    HandleInterModeArgs *args,
                                    int64_t ref_best_rd, int *rate_mv,
-                                   int *tmp_rate2, BUFFER_SET *orig_dst) {
+                                   int *tmp_rate2, const BUFFER_SET *orig_dst) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
 
   INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
-  int64_t rd, best_interintra_rd = INT64_MAX;
+  int64_t rd = INT64_MAX;
+  int64_t best_interintra_rd = INT64_MAX;
   int rmode, rate_sum;
   int64_t dist_sum;
   int tmp_rate_mv = 0;
@@ -8803,60 +9057,118 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi,
   mbmi->ref_frame[1] = NONE_FRAME;
   xd->plane[0].dst.buf = tmp_buf;
   xd->plane[0].dst.stride = bw;
-  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
 
   restore_dst_buf(xd, *orig_dst, num_planes);
   mbmi->ref_frame[1] = INTRA_FRAME;
-  mbmi->use_wedge_interintra = 0;
   best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]];
-  int j = 0;
-  if (cpi->sf.reuse_inter_intra_mode == 0 ||
-      best_interintra_mode == INTERINTRA_MODES) {
-    for (j = 0; j < INTERINTRA_MODES; ++j) {
-      mbmi->interintra_mode = (INTERINTRA_MODE)j;
-      rmode = interintra_mode_cost[mbmi->interintra_mode];
+
+  if (cpi->oxcf.enable_smooth_interintra &&
+      !cpi->sf.disable_smooth_interintra) {
+    mbmi->use_wedge_interintra = 0;
+    int j = 0;
+    if (cpi->sf.reuse_inter_intra_mode == 0 ||
+        best_interintra_mode == INTERINTRA_MODES) {
+      for (j = 0; j < INTERINTRA_MODES; ++j) {
+        if ((!cpi->oxcf.enable_smooth_intra || cpi->sf.disable_smooth_intra) &&
+            (INTERINTRA_MODE)j == II_SMOOTH_PRED)
+          continue;
+        mbmi->interintra_mode = (INTERINTRA_MODE)j;
+        rmode = interintra_mode_cost[mbmi->interintra_mode];
+        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                  intrapred, bw);
+        av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+        model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
+            cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+            &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+        rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
+        if (rd < best_interintra_rd) {
+          best_interintra_rd = rd;
+          best_interintra_mode = mbmi->interintra_mode;
+        }
+      }
+      args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
+    }
+    assert(IMPLIES(!cpi->oxcf.enable_smooth_interintra ||
+                       cpi->sf.disable_smooth_interintra,
+                   best_interintra_mode != II_SMOOTH_PRED));
+    rmode = interintra_mode_cost[best_interintra_mode];
+    if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) {
+      mbmi->interintra_mode = best_interintra_mode;
       av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
                                                 intrapred, bw);
       av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-      model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
-          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
-          &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
-      rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
-      if (rd < best_interintra_rd) {
-        best_interintra_rd = rd;
-        best_interintra_mode = mbmi->interintra_mode;
-      }
-    }
-    args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
-  }
-  if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) {
-    mbmi->interintra_mode = best_interintra_mode;
-    rmode = interintra_mode_cost[mbmi->interintra_mode];
-    av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                              intrapred, bw);
-    av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-  }
-  rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                           &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
-  if (rd != INT64_MAX)
-    rd = RDCOST(x->rdmult, *rate_mv + rmode + rate_sum + rwedge, dist_sum);
-  best_interintra_rd = rd;
-  if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) {
-    return -1;
+    }
+
+    RD_STATS rd_stats;
+    rd = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &rd_stats);
+    if (rd != INT64_MAX) {
+      rd = RDCOST(x->rdmult, *rate_mv + rmode + rd_stats.rate + rwedge,
+                  rd_stats.dist);
+    }
+    best_interintra_rd = rd;
+    if (ref_best_rd < INT64_MAX &&
+        ((best_interintra_rd >> 4) * 9) > ref_best_rd) {
+      return -1;
+    }
   }
   if (is_wedge_used) {
     int64_t best_interintra_rd_nowedge = rd;
     int64_t best_interintra_rd_wedge = INT64_MAX;
     int_mv tmp_mv;
-    if (enable_wedge_search(x, cpi)) {
+    if (enable_wedge_interintra_search(x, cpi)) {
       mbmi->use_wedge_interintra = 1;
 
       rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
                x->wedge_interintra_cost[bsize][1];
 
-      best_interintra_rd_wedge =
-          pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+      if (!cpi->oxcf.enable_smooth_interintra ||
+          cpi->sf.disable_smooth_interintra) {
+        if (best_interintra_mode == INTERINTRA_MODES) {
+          mbmi->interintra_mode = II_SMOOTH_PRED;
+          best_interintra_mode = II_SMOOTH_PRED;
+          av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                    intrapred, bw);
+          best_interintra_rd_wedge =
+              pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+          int j = 0;
+          for (j = 0; j < INTERINTRA_MODES; ++j) {
+            mbmi->interintra_mode = (INTERINTRA_MODE)j;
+            rmode = interintra_mode_cost[mbmi->interintra_mode];
+            av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0,
+                                                      orig_dst, intrapred, bw);
+            av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+            model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
+                cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+                &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+            rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
+            if (rd < best_interintra_rd) {
+              best_interintra_rd_wedge = rd;
+              best_interintra_mode = mbmi->interintra_mode;
+            }
+          }
+          args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
+          mbmi->interintra_mode = best_interintra_mode;
+
+          if (best_interintra_mode != II_SMOOTH_PRED) {
+            av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0,
+                                                      orig_dst, intrapred, bw);
+          }
+        } else {
+          mbmi->interintra_mode = best_interintra_mode;
+          av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                    intrapred, bw);
+          best_interintra_rd_wedge =
+              pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+        }
+      } else {
+        best_interintra_rd_wedge =
+            pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+      }
 
+      rmode = interintra_mode_cost[mbmi->interintra_mode];
       best_interintra_rd_wedge +=
           RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0);
       rd = INT64_MAX;
@@ -8871,8 +9183,8 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi,
                                       0);
         if (mbmi->mv[0].as_int != tmp_mv.as_int) {
           mbmi->mv[0].as_int = tmp_mv.as_int;
-          av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
-                                         bsize);
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                        AOM_PLANE_Y, AOM_PLANE_Y);
           model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
               cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
               &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
@@ -8886,12 +9198,17 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi,
         av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
       }
       // Evaluate closer to true rd
-      rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                               &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
-      if (rd != INT64_MAX)
-        rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
-                    dist_sum);
+      RD_STATS rd_stats;
+      rd = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &rd_stats);
+      if (rd != INT64_MAX) {
+        rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rd_stats.rate,
+                    rd_stats.dist);
+      }
       best_interintra_rd_wedge = rd;
+      if ((!cpi->oxcf.enable_smooth_interintra ||
+           cpi->sf.disable_smooth_interintra) &&
+          best_interintra_rd_wedge == INT64_MAX)
+        return -1;
       if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
         mbmi->use_wedge_interintra = 1;
         mbmi->mv[0].as_int = tmp_mv.as_int;
@@ -8900,33 +9217,133 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi,
       } else {
         mbmi->use_wedge_interintra = 0;
         mbmi->mv[0].as_int = mv0.as_int;
-        av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      AOM_PLANE_Y, AOM_PLANE_Y);
       }
     } else {
+      if (!cpi->oxcf.enable_smooth_interintra ||
+          cpi->sf.disable_smooth_interintra)
+        return -1;
       mbmi->use_wedge_interintra = 0;
     }
-  }  // if (is_interintra_wedge_used(bsize))
+  } else {
+    if (best_interintra_rd == INT64_MAX) return -1;
+  }
   if (num_planes > 1) {
-    av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                  AOM_PLANE_U, num_planes - 1);
+  }
+  return 0;
+}
+
+// If number of valid neighbours is 1,
+// 1) ROTZOOM parameters can be obtained reliably (2 parameters from
+// one neighbouring MV)
+// 2) For IDENTITY/TRANSLATION cases, warp can perform better due to
+// a different interpolation filter being used. However the quality
+// gains (due to the same) may not be much
+// For above 2 cases warp evaluation is skipped
+
+static int check_if_optimal_warp(const AV1_COMP *cpi,
+                                 WarpedMotionParams *wm_params,
+                                 int num_proj_ref) {
+  int is_valid_warp = 1;
+  if (cpi->sf.prune_warp_using_wmtype) {
+    TransformationType wmtype = get_wmtype(wm_params);
+    if (num_proj_ref == 1) {
+      if (wmtype != ROTZOOM) is_valid_warp = 0;
+    } else {
+      if (wmtype < ROTZOOM) is_valid_warp = 0;
+    }
+  }
+  return is_valid_warp;
+}
+
+struct obmc_check_mv_field_ctxt {
+  MB_MODE_INFO *current_mi;
+  int mv_field_check_result;
+};
+
+static INLINE void obmc_check_identical_mv(MACROBLOCKD *xd, int rel_mi_col,
+                                           uint8_t nb_mi_width,
+                                           MB_MODE_INFO *nb_mi, void *fun_ctxt,
+                                           const int num_planes) {
+  (void)xd;
+  (void)rel_mi_col;
+  (void)nb_mi_width;
+  (void)num_planes;
+  struct obmc_check_mv_field_ctxt *ctxt =
+      (struct obmc_check_mv_field_ctxt *)fun_ctxt;
+  const MB_MODE_INFO *current_mi = ctxt->current_mi;
+
+  if (ctxt->mv_field_check_result == 0) return;
+
+  if (nb_mi->ref_frame[0] != current_mi->ref_frame[0] ||
+      nb_mi->mv[0].as_int != current_mi->mv[0].as_int ||
+      nb_mi->interp_filters != current_mi->interp_filters) {
+    ctxt->mv_field_check_result = 0;
+  }
+}
+
+// Check if the neighbors' motions used by obmc have same parameters as for
+// the current block. If all the parameters are identical, obmc will produce
+// the same prediction as from regular bmc, therefore we can skip the
+// overlapping operations for less complexity. The parameters checked include
+// reference frame, motion vector, and interpolation filter.
+int check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                  int mi_row, int mi_col) {
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  struct obmc_check_mv_field_ctxt mv_field_check_ctxt = { xd->mi[0], 1 };
+
+  foreach_overlappable_nb_above(cm, xd, mi_col,
+                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
+                                obmc_check_identical_mv, &mv_field_check_ctxt);
+  foreach_overlappable_nb_left(cm, xd, mi_row,
+                               max_neighbor_obmc[mi_size_high_log2[bsize]],
+                               obmc_check_identical_mv, &mv_field_check_ctxt);
+
+  return mv_field_check_ctxt.mv_field_check_result;
+}
+
+static int skip_interintra_based_on_first_pass_stats(const AV1_COMP *const cpi,
+                                                     MACROBLOCK *const x,
+                                                     BLOCK_SIZE bsize,
+                                                     int mi_row, int mi_col) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  if (cpi->two_pass_partition_search &&
+      cpi->sf.use_first_partition_pass_interintra_stats &&
+      !x->cb_partition_scan) {
+    const int mi_width = mi_size_wide[bsize];
+    const int mi_height = mi_size_high[bsize];
+    // Search in the stats table to see if obmc motion mode was used in the
+    // first pass of partition search.
+    for (int row = mi_row; row < mi_row + mi_width;
+         row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+      for (int col = mi_col; col < mi_col + mi_height;
+           col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+        const int index = av1_first_partition_pass_stats_index(row, col);
+        const FIRST_PARTITION_PASS_STATS *const stats =
+            &x->first_partition_pass_stats[index];
+        if (stats->interintra_motion_mode_count[mbmi->ref_frame[0]]) {
+          return 0;
+        }
+      }
+    }
+    return 1;
   }
   return 0;
 }
 
 // TODO(afergs): Refactor the MBMI references in here - there's four
 // TODO(afergs): Refactor optional args - add them to a struct or remove
-static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                              BLOCK_SIZE bsize, RD_STATS *rd_stats,
-                              RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
-                              int *disable_skip, int mi_row, int mi_col,
-                              HandleInterModeArgs *const args,
-                              int64_t ref_best_rd, const int *refs,
-                              int *rate_mv, BUFFER_SET *orig_dst
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-                              ,
-                              TileDataEnc *tile_data, int64_t *best_est_rd,
-                              int do_tx_search, InterModesInfo *inter_modes_info
-#endif
-) {
+static int64_t motion_mode_rd(
+    const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x,
+    BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col,
+    HandleInterModeArgs *const args, int64_t ref_best_rd, const int *refs,
+    int *rate_mv, const BUFFER_SET *orig_dst, int64_t *best_est_rd,
+    int do_tx_search, InterModesInfo *inter_modes_info) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
@@ -8936,16 +9353,17 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
   const int rate2_nocoeff = rd_stats->rate;
   int best_xskip = 0, best_disable_skip = 0;
   RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
-  MB_MODE_INFO base_mbmi, best_mbmi;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
   const int rate_mv0 = *rate_mv;
-
-  int interintra_allowed = cm->seq_params.enable_interintra_compound &&
-                           is_interintra_allowed(mbmi) && mbmi->compound_idx;
+  int skip_interintra_mode = 0;
+  const int interintra_allowed = cm->seq_params.enable_interintra_compound &&
+                                 is_interintra_allowed(mbmi) &&
+                                 mbmi->compound_idx;
   int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
 
   assert(mbmi->ref_frame[1] != INTRA_FRAME);
   const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+  (void)tile_data;
   av1_invalid_rd_stats(&best_rd_stats);
   aom_clear_system_state();
   mbmi->num_proj_ref = 1;  // assume num_proj_ref >=1
@@ -8957,21 +9375,22 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
   if (last_motion_mode_allowed == WARPED_CAUSAL) {
     mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
   }
-  int total_samples = mbmi->num_proj_ref;
+  const int total_samples = mbmi->num_proj_ref;
   if (total_samples == 0) {
     last_motion_mode_allowed = OBMC_CAUSAL;
   }
-  base_mbmi = *mbmi;
 
-  SimpleRDState *simple_states = &args->simple_rd_state[mbmi->ref_mv_idx];
+  const MB_MODE_INFO base_mbmi = *mbmi;
+  MB_MODE_INFO best_mbmi;
+  SimpleRDState *const simple_states = &args->simple_rd_state[mbmi->ref_mv_idx];
   const int switchable_rate =
       av1_is_interp_needed(xd) ? av1_get_switchable_rate(cm, x, xd) : 0;
   int64_t best_rd = INT64_MAX;
   int best_rate_mv = rate_mv0;
-  int identical_obmc_mv_field_detected =
+  const int identical_obmc_mv_field_detected =
       (cpi->sf.skip_obmc_in_uniform_mv_field ||
        cpi->sf.skip_wm_in_uniform_mv_field)
-          ? av1_check_identical_obmc_mv_field(cm, xd, mi_row, mi_col)
+          ? check_identical_obmc_mv_field(cm, xd, mi_row, mi_col)
           : 0;
   for (int mode_index = (int)SIMPLE_TRANSLATION;
        mode_index <= (int)last_motion_mode_allowed + interintra_allowed;
@@ -8980,10 +9399,8 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
     if (cpi->sf.prune_single_motion_modes_by_simple_trans &&
         args->single_ref_first_pass && mode_index)
       break;
-    int64_t tmp_rd = INT64_MAX;
     int tmp_rate2 = rate2_nocoeff;
-    int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
-    int skip_txfm_sb = 0;
+    const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
     int tmp_rate_mv = rate_mv0;
 
     *mbmi = base_mbmi;
@@ -8994,6 +9411,9 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
       assert(mbmi->ref_frame[1] != INTRA_FRAME);
     }
 
+    if (cpi->oxcf.enable_obmc == 0 && mbmi->motion_mode == OBMC_CAUSAL)
+      continue;
+
     if (identical_obmc_mv_field_detected) {
       if (cpi->sf.skip_obmc_in_uniform_mv_field &&
           mbmi->motion_mode == OBMC_CAUSAL)
@@ -9007,28 +9427,29 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
       // SIMPLE_TRANSLATION mode: no need to recalculate.
       // The prediction is calculated before motion_mode_rd() is called in
       // handle_inter_mode()
-      if (cpi->sf.prune_single_motion_modes_by_simple_trans &&
-          args->single_ref_first_pass == 0 && !is_comp_pred) {
-        if (simple_states->early_skipped) {
-          assert(simple_states->rd_stats.rdcost == INT64_MAX);
-          return INT64_MAX;
-        }
-        if (simple_states->rd_stats.rdcost != INT64_MAX) {
-          best_rd = simple_states->rd_stats.rdcost;
-          best_rd_stats = simple_states->rd_stats;
-          best_rd_stats_y = simple_states->rd_stats_y;
-          best_rd_stats_uv = simple_states->rd_stats_uv;
-          memcpy(best_blk_skip, simple_states->blk_skip,
-                 sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
-          best_xskip = simple_states->skip;
-          best_disable_skip = simple_states->disable_skip;
-          best_mbmi = *mbmi;
+      if (cpi->sf.prune_single_motion_modes_by_simple_trans && !is_comp_pred) {
+        if (args->single_ref_first_pass == 0) {
+          if (simple_states->early_skipped) {
+            assert(simple_states->rd_stats.rdcost == INT64_MAX);
+            return INT64_MAX;
+          }
+          if (simple_states->rd_stats.rdcost != INT64_MAX) {
+            best_rd = simple_states->rd_stats.rdcost;
+            best_rd_stats = simple_states->rd_stats;
+            best_rd_stats_y = simple_states->rd_stats_y;
+            best_rd_stats_uv = simple_states->rd_stats_uv;
+            memcpy(best_blk_skip, simple_states->blk_skip,
+                   sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+            best_xskip = simple_states->skip;
+            best_disable_skip = simple_states->disable_skip;
+            best_mbmi = *mbmi;
+          }
+          continue;
         }
-        continue;
+        simple_states->early_skipped = 0;
       }
-      simple_states->early_skipped = 0;
     } else if (mbmi->motion_mode == OBMC_CAUSAL) {
-      uint32_t cur_mv = mbmi->mv[0].as_int;
+      const uint32_t cur_mv = mbmi->mv[0].as_int;
       assert(!is_comp_pred);
       if (have_newmv_in_inter_mode(this_mode)) {
         single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv);
@@ -9041,7 +9462,8 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
         tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
       }
       if (mbmi->mv[0].as_int != cur_mv) {
-        av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      0, av1_num_planes(cm) - 1);
       }
       av1_build_obmc_inter_prediction(
           cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
@@ -9069,7 +9491,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
         if (have_newmv_in_inter_mode(this_mode)) {
           const int_mv mv0 = mbmi->mv[0];
           const WarpedMotionParams wm_params0 = mbmi->wm_params;
-          int num_proj_ref0 = mbmi->num_proj_ref;
+          const int num_proj_ref0 = mbmi->num_proj_ref;
+
+          if (cpi->sf.prune_warp_using_wmtype) {
+            TransformationType wmtype = get_wmtype(&mbmi->wm_params);
+            if (wmtype < ROTZOOM) continue;
+          }
 
           // Refine MV in a small range.
           av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0,
@@ -9098,24 +9525,27 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
             mbmi->wm_params = wm_params0;
             mbmi->num_proj_ref = num_proj_ref0;
           }
+        } else {
+          if (!check_if_optimal_warp(cpi, &mbmi->wm_params, mbmi->num_proj_ref))
+            continue;
         }
 
-        av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                      av1_num_planes(cm) - 1);
       } else {
         continue;
       }
     } else if (is_interintra_mode) {
+      skip_interintra_mode = skip_interintra_based_on_first_pass_stats(
+          cpi, x, bsize, mi_row, mi_col);
+      if (skip_interintra_mode) continue;
       const int ret = handle_inter_intra_mode(
           cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv,
           &tmp_rate2, orig_dst);
       if (ret < 0) continue;
     }
 
-    if (!cpi->common.all_lossless)
-      check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
-
     x->skip = 0;
-
     rd_stats->dist = 0;
     rd_stats->sse = 0;
     rd_stats->skip = 1;
@@ -9146,85 +9576,93 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
       }
     }
 
-    if (!skip_txfm_sb) {
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-      int64_t est_rd = 0;
-      int est_skip = 0;
-      if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
-          cm->tile_rows == 1) {
-        InterModeRdModel *md = &tile_data->inter_mode_rd_models[mbmi->sb_type];
-        if (md->ready) {
-          const int64_t curr_sse = get_sse(cpi, x);
-          est_rd = get_est_rd(tile_data, mbmi->sb_type, x->rdmult, curr_sse,
-                              rd_stats->rate);
-          est_skip = est_rd * 0.8 > *best_est_rd;
-          if (est_skip) {
-            mbmi->ref_frame[1] = ref_frame_1;
-            continue;
-          } else {
-            if (est_rd < *best_est_rd) {
-              *best_est_rd = est_rd;
-            }
-          }
-        }
+    if (cpi->sf.model_based_motion_mode_rd_breakout && do_tx_search) {
+      int model_rate;
+      int64_t model_dist;
+      model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD](
+          cpi, mbmi->sb_type, x, xd, 0, num_planes - 1, mi_row, mi_col,
+          &model_rate, &model_dist, NULL, NULL, NULL, NULL, NULL);
+      const int64_t est_rd =
+          RDCOST(x->rdmult, rd_stats->rate + model_rate, model_dist);
+      if ((est_rd >> 3) * 6 > ref_best_rd) {
+        mbmi->ref_frame[1] = ref_frame_1;
+        continue;
       }
-#endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
     }
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
     if (!do_tx_search) {
-      const int64_t curr_sse = get_sse(cpi, x);
+      int64_t curr_sse = -1;
       int est_residue_cost = 0;
       int64_t est_dist = 0;
-      const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
-                                               &est_residue_cost, &est_dist);
-      (void)has_est_rd;
-      assert(has_est_rd);
+      int64_t est_rd = 0;
+      if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+        curr_sse = get_sse(cpi, x);
+        const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
+                                                 &est_residue_cost, &est_dist);
+        (void)has_est_rd;
+        assert(has_est_rd);
+      } else if (cpi->sf.inter_mode_rd_model_estimation == 2 ||
+                 cpi->sf.use_nonrd_pick_mode) {
+        model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD](
+            cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col,
+            &est_residue_cost, &est_dist, NULL, &curr_sse, NULL, NULL, NULL);
+      }
+      est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist);
+      if (est_rd * 0.8 > *best_est_rd) {
+        mbmi->ref_frame[1] = ref_frame_1;
+        continue;
+      }
       const int mode_rate = rd_stats->rate;
       rd_stats->rate += est_residue_cost;
       rd_stats->dist = est_dist;
-      rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+      rd_stats->rdcost = est_rd;
+      *best_est_rd = AOMMIN(*best_est_rd, rd_stats->rdcost);
       if (cm->current_frame.reference_mode == SINGLE_REFERENCE) {
         if (!is_comp_pred) {
+          assert(curr_sse >= 0);
           inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
-                                rd_stats->rdcost, mbmi);
+                                rd_stats->rdcost, false, NULL, rd_stats,
+                                rd_stats_y, rd_stats_uv, mbmi);
         }
       } else {
+        assert(curr_sse >= 0);
         inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
-                              rd_stats->rdcost, mbmi);
+                              rd_stats->rdcost, false, NULL, rd_stats,
+                              rd_stats_y, rd_stats_uv, mbmi);
       }
     } else {
-#endif
-      if (!txfm_search(cpi, x, bsize, mi_row, mi_col, rd_stats, rd_stats_y,
-                       rd_stats_uv, rd_stats->rate, ref_best_rd)) {
+      if (!txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, rd_stats,
+                       rd_stats_y, rd_stats_uv, rd_stats->rate, ref_best_rd)) {
         if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
-          simple_states->early_skipped = 1;
+          if (cpi->sf.prune_single_motion_modes_by_simple_trans &&
+              !is_comp_pred) {
+            simple_states->early_skipped = 1;
+          }
           return INT64_MAX;
         }
         continue;
       }
-      if (!skip_txfm_sb) {
-        const int64_t curr_rd =
-            RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-        if (curr_rd < ref_best_rd) {
-          ref_best_rd = curr_rd;
-        }
-        *disable_skip = 0;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-        if (cpi->sf.inter_mode_rd_model_estimation) {
-          const int skip_ctx = av1_get_skip_context(xd);
-          inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
-                               rd_stats->dist,
-                               rd_stats_y->rate + rd_stats_uv->rate +
-                                   x->skip_cost[skip_ctx][mbmi->skip]);
-        }
-#endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
-      } else {
-        *disable_skip = 1;
+
+      const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+      ref_best_rd = AOMMIN(ref_best_rd, curr_rd);
+      *disable_skip = 0;
+      if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+        const int skip_ctx = av1_get_skip_context(xd);
+        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
+                             rd_stats->dist,
+                             rd_stats_y->rate + rd_stats_uv->rate +
+                                 x->skip_cost[skip_ctx][mbmi->skip]);
+      }
+
+      // 2 means to both do the tx search and also update the inter_modes_info
+      // structure, since some modes will be conditionally TX searched.
+      if (do_tx_search == 2) {
+        rd_stats->rdcost = curr_rd;
+        inter_modes_info_push(inter_modes_info, rd_stats->rate, rd_stats->sse,
+                              curr_rd, true, x->blk_skip, rd_stats, rd_stats_y,
+                              rd_stats_uv, mbmi);
       }
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
     }
-#endif
 
     if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
       if (is_nontrans_global_motion(xd, xd->mi[0])) {
@@ -9233,7 +9671,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
       }
     }
 
-    tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+    const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
     if (mode_index == 0) {
       args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
       if (!is_comp_pred) {
@@ -9247,7 +9685,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
         simple_states->disable_skip = *disable_skip;
       }
     }
-    if ((mode_index == 0) || (tmp_rd < best_rd)) {
+    if (mode_index == 0 || tmp_rd < best_rd) {
       best_mbmi = *mbmi;
       best_rd = tmp_rd;
       best_rd_stats = *rd_stats;
@@ -9283,11 +9721,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
 
 static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
                             MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row,
-                            int mi_col, BUFFER_SET *const orig_dst) {
+                            int mi_col, const BUFFER_SET *const orig_dst) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
-  av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 0,
+                                av1_num_planes(cm) - 1);
 
   int64_t total_sse = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
@@ -9299,44 +9738,8 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
     const int bh = block_size_high[plane_bsize];
 
     av1_subtract_plane(x, bsize, plane);
-    int64_t sse;
-#if CONFIG_ONE_PASS_SVM
-    if (plane == AOM_PLANE_Y && bsize >= BLOCK_8X8 && bw == bh) {
-      rd_stats->sse_0 = aom_sum_squares_2d_i16(p->src_diff, bw, bw / 2, bh / 2)
-                        << 4;
-      rd_stats->sse_1 =
-          aom_sum_squares_2d_i16(p->src_diff + bw / 2, bw, bw / 2, bh / 2) << 4;
-      rd_stats->sse_2 =
-          aom_sum_squares_2d_i16(p->src_diff + bh / 2 * bw, bw, bw / 2, bh / 2)
-          << 4;
-      rd_stats->sse_3 =
-          aom_sum_squares_2d_i16(p->src_diff + bh / 2 * bw + bw / 2, bw, bw / 2,
-                                 bh / 2)
-          << 4;
-
-      sse =
-          rd_stats->sse_0 + rd_stats->sse_1 + rd_stats->sse_2 + rd_stats->sse_3;
-      total_sse += sse;
-
-      const int scaling_factor = MAX_MIB_SIZE * MAX_MIB_SIZE;
-      rd_stats->sse = sse;
-      rd_stats->sse_0 = rd_stats->sse_0 * scaling_factor;
-      rd_stats->sse_1 = rd_stats->sse_1 * scaling_factor;
-      rd_stats->sse_2 = rd_stats->sse_2 * scaling_factor;
-      rd_stats->sse_3 = rd_stats->sse_3 * scaling_factor;
-      rd_stats->y_sse = sse;
-      // TODO(chiyotsai@google.com): Don't manually set the flags
-      av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-    } else {
-      sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh);
-      sse = sse << 4;
-      total_sse += sse;
-    }
-#else
-    sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh);
-    sse = sse << 4;
+    int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh) << 4;
     total_sse += sse;
-#endif
   }
   const int skip_mode_ctx = av1_get_skip_mode_context(xd);
   rd_stats->dist = rd_stats->sse = total_sse;
@@ -9456,25 +9859,20 @@ typedef struct {
   uint8_t *tmp_best_mask_buf;  // backup of the best segmentation mask
 } CompoundTypeRdBuffers;
 
-static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                            BLOCK_SIZE bsize, int mi_col, int mi_row,
-                            int_mv *cur_mv, int masked_compound_used,
-                            BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst,
-                            CompoundTypeRdBuffers *buffers, int *rate_mv,
-                            int64_t *rd, RD_STATS *rd_stats,
-                            int64_t ref_best_rd) {
+static int compound_type_rd(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_col,
+    int mi_row, int_mv *cur_mv, int mode_search_mask, int masked_compound_used,
+    const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst,
+    CompoundTypeRdBuffers *buffers, int *rate_mv, int64_t *rd,
+    RD_STATS *rd_stats, int64_t ref_best_rd, int *is_luma_interp_done) {
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   const PREDICTION_MODE this_mode = mbmi->mode;
   const int bw = block_size_wide[bsize];
-  int rate_sum, rs2;
-  int64_t dist_sum;
-
+  int rs2;
   int_mv best_mv[2];
   int best_tmp_rate_mv = *rate_mv;
-  int tmp_skip_txfm_sb;
-  int64_t tmp_skip_sse_sb;
   INTERINTER_COMPOUND_DATA best_compound_data;
   best_compound_data.type = COMPOUND_AVERAGE;
   uint8_t *preds0[1] = { buffers->pred0 };
@@ -9486,56 +9884,214 @@ static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   COMPOUND_TYPE cur_type;
   int best_compmode_interinter_cost = 0;
   int calc_pred_masked_compound = 1;
+  int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+                                        INT64_MAX };
+  int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+  int64_t comp_model_rd[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+                                            INT64_MAX };
+  const int match_found =
+      find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rd);
 
   best_mv[0].as_int = cur_mv[0].as_int;
   best_mv[1].as_int = cur_mv[1].as_int;
   *rd = INT64_MAX;
+  int rate_sum, tmp_skip_txfm_sb;
+  int64_t dist_sum, tmp_skip_sse_sb;
+  int64_t comp_best_model_rd = INT64_MAX;
+  // Special handling if both compound_average and compound_distwtd
+  // are to be searched. In this case, first estimate between the two
+  // modes and then call estimate_yrd_for_sb() only for the better of
+  // the two.
+  const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE));
+  const int try_distwtd_comp =
+      ((mode_search_mask & (1 << COMPOUND_DISTWTD)) &&
+       cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 &&
+       cpi->sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
+  const int try_average_and_distwtd_comp =
+      try_average_comp && try_distwtd_comp &&
+      comp_rate[COMPOUND_AVERAGE] == INT_MAX &&
+      comp_rate[COMPOUND_DISTWTD] == INT_MAX;
   for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
-    if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
+    if (((1 << cur_type) & mode_search_mask) == 0) {
+      if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+      continue;
+    }
     if (!is_interinter_compound_used(cur_type, bsize)) continue;
+    if (cur_type >= COMPOUND_WEDGE && !masked_compound_used) break;
+    if (cur_type == COMPOUND_DISTWTD && !try_distwtd_comp) continue;
+    if (cur_type == COMPOUND_AVERAGE && try_average_and_distwtd_comp) continue;
+
+    int64_t comp_model_rd_cur = INT64_MAX;
     tmp_rate_mv = *rate_mv;
     int64_t best_rd_cur = INT64_MAX;
-    mbmi->interinter_comp.type = cur_type;
-    int masked_type_cost = 0;
-
     const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
     const int comp_index_ctx = get_comp_index_context(cm, xd);
-    mbmi->compound_idx = 1;
-    if (cur_type == COMPOUND_AVERAGE) {
+
+    if (cur_type == COMPOUND_DISTWTD && try_average_and_distwtd_comp) {
+      int est_rate[2];
+      int64_t est_dist[2], est_rd[2];
+
+      int masked_type_cost[2] = { 0, 0 };
       mbmi->comp_group_idx = 0;
+
+      // First find the modeled rd cost for COMPOUND_AVERAGE
+      mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+      mbmi->compound_idx = 1;
       if (masked_compound_used) {
-        masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0];
-      }
-      masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
-      rs2 = masked_type_cost;
-      const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
-      if (mode_rd < ref_best_rd) {
-        av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
-        int64_t est_rd =
-            estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                                &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
-        if (est_rd != INT64_MAX)
-          best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
-      }
-      // use spare buffer for following compound type try
+        masked_type_cost[COMPOUND_AVERAGE] +=
+            x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
+      }
+      masked_type_cost[COMPOUND_AVERAGE] +=
+          x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+      *is_luma_interp_done = 1;
+      model_rd_sb_fn[MODELRD_CURVFIT](
+          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &est_rate[COMPOUND_AVERAGE],
+          &est_dist[COMPOUND_AVERAGE], NULL, NULL, NULL, NULL, NULL);
+      est_rate[COMPOUND_AVERAGE] += masked_type_cost[COMPOUND_AVERAGE];
+      est_rd[COMPOUND_AVERAGE] =
+          RDCOST(x->rdmult, est_rate[COMPOUND_AVERAGE] + *rate_mv,
+                 est_dist[COMPOUND_AVERAGE]);
       restore_dst_buf(xd, *tmp_dst, 1);
+
+      // Next find the modeled rd cost for COMPOUND_DISTWTD
+      mbmi->interinter_comp.type = COMPOUND_DISTWTD;
+      mbmi->compound_idx = 0;
+      if (masked_compound_used) {
+        masked_type_cost[COMPOUND_DISTWTD] +=
+            x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
+      }
+      masked_type_cost[COMPOUND_DISTWTD] +=
+          x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+      model_rd_sb_fn[MODELRD_CURVFIT](
+          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &est_rate[COMPOUND_DISTWTD],
+          &est_dist[COMPOUND_DISTWTD], NULL, NULL, NULL, NULL, NULL);
+      est_rate[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_DISTWTD];
+      est_rd[COMPOUND_DISTWTD] =
+          RDCOST(x->rdmult, est_rate[COMPOUND_DISTWTD] + *rate_mv,
+                 est_dist[COMPOUND_DISTWTD]);
+
+      // Choose the better of the two based on modeled cost and call
+      // estimate_yrd_for_sb() for that one.
+      if (est_rd[COMPOUND_AVERAGE] <= est_rd[COMPOUND_DISTWTD]) {
+        mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+        mbmi->compound_idx = 1;
+        restore_dst_buf(xd, *orig_dst, 1);
+        RD_STATS est_rd_stats;
+        const int64_t est_rd_ =
+            estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+        rs2 = masked_type_cost[COMPOUND_AVERAGE];
+        if (est_rd_ != INT64_MAX) {
+          best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
+                               est_rd_stats.dist);
+          restore_dst_buf(xd, *tmp_dst, 1);
+          comp_rate[COMPOUND_AVERAGE] = est_rd_stats.rate;
+          comp_dist[COMPOUND_AVERAGE] = est_rd_stats.dist;
+          comp_model_rd[COMPOUND_AVERAGE] = est_rd[COMPOUND_AVERAGE];
+          comp_model_rd_cur = est_rd[COMPOUND_AVERAGE];
+        }
+        restore_dst_buf(xd, *tmp_dst, 1);
+      } else {
+        RD_STATS est_rd_stats;
+        const int64_t est_rd_ =
+            estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+        rs2 = masked_type_cost[COMPOUND_DISTWTD];
+        if (est_rd_ != INT64_MAX) {
+          best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
+                               est_rd_stats.dist);
+          comp_rate[COMPOUND_DISTWTD] = est_rd_stats.rate;
+          comp_dist[COMPOUND_DISTWTD] = est_rd_stats.dist;
+          comp_model_rd[COMPOUND_DISTWTD] = est_rd[COMPOUND_DISTWTD];
+          comp_model_rd_cur = est_rd[COMPOUND_DISTWTD];
+        }
+      }
     } else {
-      mbmi->comp_group_idx = 1;
-      masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
-      masked_type_cost += x->compound_type_cost[bsize][cur_type - 1];
-      rs2 = masked_type_cost;
-      if (enable_wedge_search(x, cpi) && *rd / 3 < ref_best_rd) {
-        best_rd_cur = build_and_cost_compound_type(
-            cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
-            &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10,
-            strides, mi_row, mi_col, rd_stats->rate, ref_best_rd,
-            &calc_pred_masked_compound);
+      mbmi->interinter_comp.type = cur_type;
+      int masked_type_cost = 0;
+      if (cur_type == COMPOUND_AVERAGE || cur_type == COMPOUND_DISTWTD) {
+        mbmi->comp_group_idx = 0;
+        mbmi->compound_idx = (cur_type == COMPOUND_AVERAGE);
+        if (masked_compound_used) {
+          masked_type_cost +=
+              x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
+        }
+        masked_type_cost +=
+            x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
+        rs2 = masked_type_cost;
+        const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+        if (mode_rd < ref_best_rd) {
+          // Reuse data if matching record is found
+          if (comp_rate[cur_type] == INT_MAX) {
+            av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst,
+                                          bsize, AOM_PLANE_Y, AOM_PLANE_Y);
+            if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
+            RD_STATS est_rd_stats;
+            const int64_t est_rd =
+                estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+            if (comp_rate[cur_type] != INT_MAX) {
+              assert(comp_rate[cur_type] == est_rd_stats.rate);
+              assert(comp_dist[cur_type] == est_rd_stats.dist);
+            }
+            if (est_rd != INT64_MAX) {
+              best_rd_cur =
+                  RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
+                         est_rd_stats.dist);
+              model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+                  cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+                  &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+              comp_model_rd_cur =
+                  RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+
+              // Backup rate and distortion for future reuse
+              comp_rate[cur_type] = est_rd_stats.rate;
+              comp_dist[cur_type] = est_rd_stats.dist;
+              comp_model_rd[cur_type] = comp_model_rd_cur;
+            }
+          } else {
+            // Calculate RD cost based on stored stats
+            assert(comp_dist[cur_type] != INT64_MAX);
+            best_rd_cur =
+                RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type],
+                       comp_dist[cur_type]);
+            comp_model_rd_cur = comp_model_rd[cur_type];
+          }
+        }
+        // use spare buffer for following compound type try
+        if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+      } else {
+        mbmi->comp_group_idx = 1;
+        mbmi->compound_idx = 1;
+        masked_type_cost +=
+            x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
+        masked_type_cost +=
+            x->compound_type_cost[bsize][cur_type - COMPOUND_WEDGE];
+        rs2 = masked_type_cost;
+
+        if (((*rd / cpi->max_comp_type_rd_threshold_div) *
+             cpi->max_comp_type_rd_threshold_mul) < ref_best_rd) {
+          const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+
+          if (!((compound_type == COMPOUND_WEDGE &&
+                 !enable_wedge_interinter_search(x, cpi)) ||
+                (compound_type == COMPOUND_DIFFWTD &&
+                 !cpi->oxcf.enable_diff_wtd_comp)))
+            best_rd_cur = build_and_cost_compound_type(
+                cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
+                &tmp_rate_mv, preds0, preds1, buffers->residual1,
+                buffers->diff10, strides, mi_row, mi_col, rd_stats->rate,
+                ref_best_rd, &calc_pred_masked_compound, comp_rate, comp_dist,
+                comp_model_rd, comp_best_model_rd, &comp_model_rd_cur);
+        }
       }
     }
     if (best_rd_cur < *rd) {
       *rd = best_rd_cur;
+      comp_best_model_rd = comp_model_rd_cur;
       best_compound_data = mbmi->interinter_comp;
-      if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) {
+      if (masked_compound_used && cur_type >= COMPOUND_WEDGE) {
         memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len);
       }
       best_compmode_interinter_cost = rs2;
@@ -9555,8 +10111,8 @@ static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
     mbmi->mv[1].as_int = cur_mv[1].as_int;
   }
   if (mbmi->interinter_comp.type != best_compound_data.type) {
-    mbmi->comp_group_idx =
-        (best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1;
+    mbmi->comp_group_idx = (best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1;
+    mbmi->compound_idx = !(best_compound_data.type == COMPOUND_DISTWTD);
     mbmi->interinter_comp = best_compound_data;
     memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len);
   }
@@ -9569,6 +10125,9 @@ static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
   }
   restore_dst_buf(xd, *orig_dst, 1);
+  if (!match_found)
+    save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rd,
+                             cur_mv);
   return best_compmode_interinter_cost;
 }
 
@@ -9609,20 +10168,13 @@ typedef struct {
   int_mv mv;
 } inter_mode_info;
 
-static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                 BLOCK_SIZE bsize, RD_STATS *rd_stats,
-                                 RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
-                                 int *disable_skip, int mi_row, int mi_col,
-                                 HandleInterModeArgs *args, int64_t ref_best_rd,
-                                 uint8_t *const tmp_buf,
-                                 CompoundTypeRdBuffers *rd_buffers
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-                                 ,
-                                 TileDataEnc *tile_data, int64_t *best_est_rd,
-                                 const int do_tx_search,
-                                 InterModesInfo *inter_modes_info
-#endif
-) {
+static int64_t handle_inter_mode(
+    AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x,
+    BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col,
+    HandleInterModeArgs *args, int64_t ref_best_rd, uint8_t *const tmp_buf,
+    CompoundTypeRdBuffers *rd_buffers, int64_t *best_est_rd,
+    const int do_tx_search, InterModesInfo *inter_modes_info) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
@@ -9642,7 +10194,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   // one for future predictions. In the end, copy from tmp_buf to
   // dst if necessary.
   struct macroblockd_plane *p = xd->plane;
-  BUFFER_SET orig_dst = {
+  const BUFFER_SET orig_dst = {
     { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
     { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
   };
@@ -9668,11 +10220,20 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   int backup_rate_mv = 0;
   inter_mode_info mode_info[MAX_REF_MV_SERCH];
 
-  int comp_idx;
-  const int search_jnt_comp = is_comp_pred &
-                              cm->seq_params.order_hint_info.enable_jnt_comp &
-                              (mbmi->mode != GLOBAL_GLOBALMV) &
-                              (cpi->sf.use_jnt_comp_flag != JNT_COMP_DISABLED);
+  int mode_search_mask[2];
+  const int do_two_loop_comp_search =
+      is_comp_pred && cpi->sf.two_loop_comp_search;
+  if (do_two_loop_comp_search) {
+    // TODO(debargha): Change this to try alternate ways of splitting
+    // modes while doing two pass compound_mode search.
+    mode_search_mask[0] = (1 << COMPOUND_AVERAGE);
+  } else {
+    mode_search_mask[0] = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
+                          (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
+  }
+  mode_search_mask[1] = ((1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
+                         (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD)) -
+                        mode_search_mask[0];
 
   // TODO(jingning): This should be deprecated shortly.
   const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
@@ -9729,42 +10290,35 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
 
     const RD_STATS backup_rd_stats = *rd_stats;
-    // If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
-    for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
+
+    for (int comp_loop_idx = 0; comp_loop_idx <= do_two_loop_comp_search;
+         ++comp_loop_idx) {
       int rs = 0;
       int compmode_interinter_cost = 0;
-      mbmi->compound_idx = comp_idx;
-      if (is_comp_pred && comp_idx == 0) {
-        *rd_stats = backup_rd_stats;
-        mbmi->interinter_comp.type = COMPOUND_AVERAGE;
-        mbmi->num_proj_ref = 0;
-        mbmi->motion_mode = SIMPLE_TRANSLATION;
-        mbmi->comp_group_idx = 0;
 
-        const int comp_index_ctx = get_comp_index_context(cm, xd);
-        compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0];
-      }
+      if (is_comp_pred && comp_loop_idx == 1) *rd_stats = backup_rd_stats;
 
       int_mv cur_mv[2];
       if (!build_cur_mv(cur_mv, this_mode, cm, x)) {
         continue;
       }
       if (have_newmv_in_inter_mode(this_mode)) {
-        if (comp_idx == 0) {
+        if (comp_loop_idx == 1) {
           cur_mv[0] = backup_mv[0];
           cur_mv[1] = backup_mv[1];
           rate_mv = backup_rate_mv;
         }
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        start_timing(cpi, handle_newmv_time);
+#endif
         if (cpi->sf.prune_single_motion_modes_by_simple_trans &&
             args->single_ref_first_pass == 0 && !is_comp_pred) {
           const int ref0 = mbmi->ref_frame[0];
           newmv_ret_val = args->single_newmv_valid[ref_mv_idx][ref0] ? 0 : 1;
           cur_mv[0] = args->single_newmv[ref_mv_idx][ref0];
           rate_mv = args->single_newmv_rate[ref_mv_idx][ref0];
-        } else if (!(search_jnt_comp &&
-                     (cpi->sf.use_jnt_comp_flag == JNT_COMP_SKIP_MV_SEARCH) &&
-                     comp_idx == 0)) {
+        } else if (comp_loop_idx == 0) {
           newmv_ret_val = handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col,
                                        &rate_mv, args);
 
@@ -9774,6 +10328,9 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
           backup_mv[1] = cur_mv[1];
           backup_rate_mv = rate_mv;
         }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        end_timing(cpi, handle_newmv_time);
+#endif
 
         if (newmv_ret_val != 0) {
           continue;
@@ -9817,7 +10374,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                       best_rd = RDCOST(x->rdmult, best_rd_stats.rate,
                                        best_rd_stats.dist);
                       if (best_rd < ref_best_rd) ref_best_rd = best_rd;
-
                       skip = 1;
                       break;
                     }
@@ -9869,46 +10425,90 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
         continue;
       }
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, compound_type_rd_time);
+#endif
       int skip_build_pred = 0;
-      if (is_comp_pred && comp_idx) {
-        // Find matching interp filter or set to default interp filter
-        const int need_search =
-            av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
-        int match_found = -1;
-        const InterpFilter assign_filter = cm->interp_filter;
-        if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
-          match_found = find_interp_filter_in_stats(x, mbmi);
-        }
-        if (!need_search || match_found == -1) {
-          set_default_interp_filters(mbmi, assign_filter);
-        }
+      if (is_comp_pred) {
+        if (mode_search_mask[comp_loop_idx] == (1 << COMPOUND_AVERAGE)) {
+          // Only compound_average
+          mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+          mbmi->num_proj_ref = 0;
+          mbmi->motion_mode = SIMPLE_TRANSLATION;
+          mbmi->comp_group_idx = 0;
+          mbmi->compound_idx = 1;
+          const int comp_index_ctx = get_comp_index_context(cm, xd);
+          compmode_interinter_cost +=
+              x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
+        } else if (mode_search_mask[comp_loop_idx] == (1 << COMPOUND_DISTWTD)) {
+          // Only compound_distwtd
+          if (!cm->seq_params.order_hint_info.enable_dist_wtd_comp ||
+              cpi->sf.use_dist_wtd_comp_flag == DIST_WTD_COMP_DISABLED ||
+              (do_two_loop_comp_search && mbmi->mode == GLOBAL_GLOBALMV))
+            continue;
+          mbmi->interinter_comp.type = COMPOUND_DISTWTD;
+          mbmi->num_proj_ref = 0;
+          mbmi->motion_mode = SIMPLE_TRANSLATION;
+          mbmi->comp_group_idx = 0;
+          mbmi->compound_idx = 0;
+          const int comp_index_ctx = get_comp_index_context(cm, xd);
+          compmode_interinter_cost +=
+              x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
+        } else {
+          // Find matching interp filter or set to default interp filter
+          const int need_search =
+              av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
+          int match_found = -1;
+          const InterpFilter assign_filter = cm->interp_filter;
+          int is_luma_interp_done = 0;
+          if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
+            match_found = find_interp_filter_in_stats(x, mbmi);
+          }
+          if (!need_search || match_found == -1) {
+            set_default_interp_filters(mbmi, assign_filter);
+          }
 
-        int64_t best_rd_compound;
-        compmode_interinter_cost = compound_type_rd(
-            cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used,
-            &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
-            rd_stats, ref_best_rd);
-        if (ref_best_rd < INT64_MAX &&
-            (best_rd_compound >> 3) * 6 > ref_best_rd) {
-          restore_dst_buf(xd, orig_dst, num_planes);
-          continue;
-        }
-        // No need to call av1_build_inter_predictors_sby if
-        // COMPOUND_AVERAGE is selected because it is the first
-        // candidate in compound_type_rd, and the following
-        // compound types searching uses tmp_dst buffer
-        if (mbmi->interinter_comp.type == COMPOUND_AVERAGE) {
-          if (num_planes > 1)
-            av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, &orig_dst,
-                                            bsize);
-          skip_build_pred = 1;
+          int64_t best_rd_compound;
+          compmode_interinter_cost = compound_type_rd(
+              cpi, x, bsize, mi_col, mi_row, cur_mv,
+              mode_search_mask[comp_loop_idx], masked_compound_used, &orig_dst,
+              &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound, rd_stats,
+              ref_best_rd, &is_luma_interp_done);
+          if (ref_best_rd < INT64_MAX &&
+              (best_rd_compound >> 4) * (11 + 2 * do_two_loop_comp_search) >
+                  ref_best_rd) {
+            restore_dst_buf(xd, orig_dst, num_planes);
+            continue;
+          }
+          // No need to call av1_enc_build_inter_predictor for luma if
+          // COMPOUND_AVERAGE is selected because it is the first
+          // candidate in compound_type_rd, and the following
+          // compound types searching uses tmp_dst buffer
+
+          if (mbmi->interinter_comp.type == COMPOUND_AVERAGE &&
+              is_luma_interp_done) {
+            if (num_planes > 1) {
+              av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+                                            bsize, AOM_PLANE_U, num_planes - 1);
+            }
+            skip_build_pred = 1;
+          }
         }
       }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, compound_type_rd_time);
+#endif
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, interpolation_filter_search_time);
+#endif
       ret_val = interpolation_filter_search(
-          x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
+          x, cpi, tile_data, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
           args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb,
-          skip_build_pred, args, ref_best_rd);
+          &skip_build_pred, args, ref_best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, interpolation_filter_search_time);
+#endif
       if (args->modelled_rd != NULL && !is_comp_pred) {
         args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
       }
@@ -9939,8 +10539,12 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
         }
       }
       rd_stats->rate += compmode_interinter_cost;
+      if (skip_build_pred != 1) {
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize,
+                                      0, av1_num_planes(cm) - 1);
+      }
 
-      if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
+      if (cpi->sf.second_loop_comp_fast_tx_search && comp_loop_idx == 1) {
         // TODO(chengchen): this speed feature introduces big loss.
         // Need better estimation of rate distortion.
         int dummy_rate;
@@ -9949,7 +10553,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
         int64_t plane_sse[MAX_MB_PLANE] = { 0 };
         int64_t plane_dist[MAX_MB_PLANE] = { 0 };
 
-        model_rd_sb_fn[MODELRD_TYPE_JNT_COMPOUND](
+        model_rd_sb_fn[MODELRD_TYPE_DIST_WTD_COMPOUND](
             cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate,
             &dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse,
             plane_dist);
@@ -9965,15 +10569,15 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
         rd_stats_y->dist = plane_dist[0];
         rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
       } else {
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-        ret_val = motion_mode_rd(
-            cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip,
-            mi_row, mi_col, args, ref_best_rd, refs, &rate_mv, &orig_dst,
-            tile_data, best_est_rd, do_tx_search, inter_modes_info);
-#else
-        ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y,
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        start_timing(cpi, motion_mode_rd_time);
+#endif
+        ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y,
                                  rd_stats_uv, disable_skip, mi_row, mi_col,
-                                 args, ref_best_rd, refs, &rate_mv, &orig_dst);
+                                 args, ref_best_rd, refs, &rate_mv, &orig_dst,
+                                 best_est_rd, do_tx_search, inter_modes_info);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        end_timing(cpi, motion_mode_rd_time);
 #endif
       }
       mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
@@ -10019,10 +10623,10 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 }
 
 static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
-                                       RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                       RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                        int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
-  if (!av1_allow_intrabc(cm)) return INT64_MAX;
+  if (!av1_allow_intrabc(cm) || !cpi->oxcf.enable_intrabc) return INT64_MAX;
   const int num_planes = av1_num_planes(cm);
 
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -10074,7 +10678,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
   };
 
   MB_MODE_INFO best_mbmi = *mbmi;
-  RD_STATS best_rdcost = *rd_cost;
+  RD_STATS best_rdstats = *rd_stats;
   int best_skip = x->skip;
 
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
@@ -10118,17 +10722,18 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
     MV mvp_full = dv_ref.as_mv;
     mvp_full.col >>= 3;
     mvp_full.row >>= 3;
-    int sadpb = x->sadperbit16;
+    const int sadpb = x->sadperbit16;
     int cost_list[5];
-    int bestsme = av1_full_pixel_search(
+    const int bestsme = av1_full_pixel_search(
         cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
         sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
-        (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1);
+        (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1,
+        &cpi->ss_cfg[SS_CFG_LOOKAHEAD]);
 
     x->mv_limits = tmp_mv_limits;
     if (bestsme == INT_MAX) continue;
     mvp_full = x->best_mv.as_mv;
-    MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 };
+    const MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 };
     if (mv_check_bounds(&x->mv_limits, &dv)) continue;
     if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
                          cm->seq_params.mib_size_log2))
@@ -10147,74 +10752,39 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
     mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
     mbmi->skip = 0;
     x->skip = 0;
-    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                  av1_num_planes(cm) - 1);
 
     int *dvcost[2] = { (int *)&cpi->dv_cost[0][MV_MAX],
                        (int *)&cpi->dv_cost[1][MV_MAX] };
     // TODO(aconverse@google.com): The full motion field defining discount
     // in MV_COST_WEIGHT is too large. Explore other values.
-    int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost,
-                                  dvcost, MV_COST_WEIGHT_SUB);
+    const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost,
+                                        dvcost, MV_COST_WEIGHT_SUB);
     const int rate_mode = x->intrabc_cost[1];
-    RD_STATS rd_stats, rd_stats_uv;
-    av1_subtract_plane(x, bsize, 0);
-    if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
-      // Intrabc
-      select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
-    } else {
-      super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
-      memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
-      for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
-        set_blk_skip(x, 0, i, rd_stats.skip);
-    }
-    if (num_planes > 1) {
-      super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-      av1_merge_rd_stats(&rd_stats, &rd_stats_uv);
-    }
-#if CONFIG_RD_DEBUG
-    mbmi->rd_stats = rd_stats;
-#endif
-
-    const int skip_ctx = av1_get_skip_context(xd);
-
-    RD_STATS rdc_noskip;
-    av1_init_rd_stats(&rdc_noskip);
-    rdc_noskip.rate =
-        rate_mode + rate_mv + rd_stats.rate + x->skip_cost[skip_ctx][0];
-    rdc_noskip.dist = rd_stats.dist;
-    rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist);
-    if (rdc_noskip.rdcost < best_rd) {
-      best_rd = rdc_noskip.rdcost;
+    RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv;
+    if (!txfm_search(cpi, NULL, x, bsize, mi_row, mi_col, &rd_stats_yuv,
+                     &rd_stats_y, &rd_stats_uv, rate_mode + rate_mv, INT64_MAX))
+      continue;
+    rd_stats_yuv.rdcost =
+        RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist);
+    if (rd_stats_yuv.rdcost < best_rd) {
+      best_rd = rd_stats_yuv.rdcost;
       best_mbmi = *mbmi;
-      best_skip = x->skip;
-      best_rdcost = rdc_noskip;
+      best_skip = mbmi->skip;
+      best_rdstats = rd_stats_yuv;
       memcpy(best_blk_skip, x->blk_skip,
              sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
     }
-
-    if (!xd->lossless[mbmi->segment_id]) {
-      x->skip = 1;
-      mbmi->skip = 1;
-      RD_STATS rdc_skip;
-      av1_init_rd_stats(&rdc_skip);
-      rdc_skip.rate = rate_mode + rate_mv + x->skip_cost[skip_ctx][1];
-      rdc_skip.dist = rd_stats.sse;
-      rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist);
-      if (rdc_skip.rdcost < best_rd) {
-        best_rd = rdc_skip.rdcost;
-        best_mbmi = *mbmi;
-        best_skip = x->skip;
-        best_rdcost = rdc_skip;
-        memcpy(best_blk_skip, x->blk_skip,
-               sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
-      }
-    }
   }
   *mbmi = best_mbmi;
-  *rd_cost = best_rdcost;
+  *rd_stats = best_rdstats;
   x->skip = best_skip;
   memcpy(x->blk_skip, best_blk_skip,
          sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+#if CONFIG_RD_DEBUG
+  mbmi->rd_stats = *rd_stats;
+#endif
   return best_rd;
 }
 
@@ -10340,15 +10910,6 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
                                       int above_stride, const uint8_t *left,
                                       int left_stride);
 
-static const int ref_frame_flag_list[REF_FRAMES] = { 0,
-                                                     AOM_LAST_FLAG,
-                                                     AOM_LAST2_FLAG,
-                                                     AOM_LAST3_FLAG,
-                                                     AOM_GOLD_FLAG,
-                                                     AOM_BWD_FLAG,
-                                                     AOM_ALT2_FLAG,
-                                                     AOM_ALT_FLAG };
-
 static void rd_pick_skip_mode(RD_STATS *rd_cost,
                               InterModeSearchState *search_state,
                               const AV1_COMP *const cpi, MACROBLOCK *const x,
@@ -10381,6 +10942,10 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
     return;
   }
 
+  if (!cpi->oxcf.enable_onesided_comp && cpi->all_one_sided_refs) {
+    return;
+  }
+
   mbmi->mode = this_mode;
   mbmi->uv_mode = UV_DC_PRED;
   mbmi->ref_frame[0] = ref_frame;
@@ -10437,7 +11002,8 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
                    rd_cost->dist)
           : INT64_MAX;
 
-  if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost) {
+  if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost &&
+      (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) {
     assert(mode_index != -1);
     search_state->best_mbmode.skip_mode = 1;
     search_state->best_mbmode = *mbmi;
@@ -10483,13 +11049,6 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
     rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist;
     rd_cost->rdcost = skip_mode_rd_stats.rdcost;
 
-#if CONFIG_ONE_PASS_SVM
-    if (bsize >= BLOCK_8X8 &&
-        block_size_high[bsize] == block_size_wide[bsize]) {
-      av1_copy_reg_stat(rd_cost, &skip_mode_rd_stats);
-    }
-#endif
-
     search_state->best_rd = rd_cost->rdcost;
     search_state->best_skip2 = 1;
     search_state->best_mode_skippable = 1;
@@ -10539,15 +11098,15 @@ static void sf_refine_fast_tx_type_search(
     }
 
     if (is_inter_mode(mbmi->mode)) {
-      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                    av1_num_planes(cm) - 1);
       if (mbmi->motion_mode == OBMC_CAUSAL)
         av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
 
       av1_subtract_plane(x, bsize, 0);
       if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
-        // av1_rd_pick_inter_mode_sb
-        select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col,
-                           INT64_MAX);
+        pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col,
+                              INT64_MAX);
         assert(rd_stats_y.rate != INT_MAX);
       } else {
         super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
@@ -10555,19 +11114,14 @@ static void sf_refine_fast_tx_type_search(
         for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
           set_blk_skip(x, 0, i, rd_stats_y.skip);
       }
-      if (num_planes > 1) {
-        inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, INT64_MAX,
-                         FTXS_NONE);
-      } else {
-        av1_init_rd_stats(&rd_stats_uv);
-      }
     } else {
       super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-      if (num_planes > 1) {
-        super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-      } else {
-        av1_init_rd_stats(&rd_stats_uv);
-      }
+    }
+
+    if (num_planes > 1) {
+      super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+    } else {
+      av1_init_rd_stats(&rd_stats_uv);
     }
 
     if (RDCOST(x->rdmult,
@@ -10602,13 +11156,193 @@ static void sf_refine_fast_tx_type_search(
   }
 }
 
+typedef struct {
+  // Mask for each reference frame, specifying which prediction modes to NOT try
+  // during search.
+  uint32_t pred_modes[REF_FRAMES];
+  // If ref_combo[i][j + 1] is true, do NOT try prediction using combination of
+  // reference frames (i, j).
+  // Note: indexing with 'j + 1' is due to the fact that 2nd reference can be -1
+  // (NONE_FRAME).
+  bool ref_combo[REF_FRAMES][REF_FRAMES + 1];
+} mode_skip_mask_t;
+
+// Update 'ref_combo' mask to disable given 'ref' in single and compound modes.
+static void disable_reference(MV_REFERENCE_FRAME ref,
+                              bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
+  for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) {
+    ref_combo[ref][ref2 + 1] = true;
+  }
+}
+
+// Update 'ref_combo' mask to disable all inter references except ALTREF.
+static void disable_inter_references_except_altref(
+    bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
+  disable_reference(LAST_FRAME, ref_combo);
+  disable_reference(LAST2_FRAME, ref_combo);
+  disable_reference(LAST3_FRAME, ref_combo);
+  disable_reference(GOLDEN_FRAME, ref_combo);
+  disable_reference(BWDREF_FRAME, ref_combo);
+  disable_reference(ALTREF2_FRAME, ref_combo);
+}
+
+static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = {
+  { LAST_FRAME, NONE_FRAME },     { ALTREF_FRAME, NONE_FRAME },
+  { LAST_FRAME, ALTREF_FRAME },   { GOLDEN_FRAME, NONE_FRAME },
+  { INTRA_FRAME, NONE_FRAME },    { GOLDEN_FRAME, ALTREF_FRAME },
+  { LAST_FRAME, GOLDEN_FRAME },   { LAST_FRAME, INTRA_FRAME },
+  { LAST_FRAME, BWDREF_FRAME },   { LAST_FRAME, LAST3_FRAME },
+  { GOLDEN_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, INTRA_FRAME },
+  { BWDREF_FRAME, NONE_FRAME },   { BWDREF_FRAME, ALTREF_FRAME },
+  { ALTREF_FRAME, INTRA_FRAME },  { BWDREF_FRAME, INTRA_FRAME },
+};
+
+static const MV_REFERENCE_FRAME real_time_ref_combos[][2] = {
+  { LAST_FRAME, NONE_FRAME },
+  { ALTREF_FRAME, NONE_FRAME },
+  { GOLDEN_FRAME, NONE_FRAME },
+  { INTRA_FRAME, NONE_FRAME }
+};
+
+typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET;
+
+static void default_skip_mask(mode_skip_mask_t *mask, REF_SET ref_set) {
+  if (ref_set == REF_SET_FULL) {
+    // Everything available by default.
+    memset(mask, 0, sizeof(*mask));
+  } else {
+    // All modes available by default.
+    memset(mask->pred_modes, 0, sizeof(mask->pred_modes));
+    // All references disabled first.
+    for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) {
+      for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) {
+        mask->ref_combo[ref1][ref2 + 1] = true;
+      }
+    }
+    const MV_REFERENCE_FRAME(*ref_set_combos)[2];
+    int num_ref_combos;
+
+    // Then enable reduced set of references explicitly.
+    switch (ref_set) {
+      case REF_SET_REDUCED:
+        ref_set_combos = reduced_ref_combos;
+        num_ref_combos =
+            (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]);
+        break;
+      case REF_SET_REALTIME:
+        ref_set_combos = real_time_ref_combos;
+        num_ref_combos =
+            (int)sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]);
+        break;
+      default: assert(0); num_ref_combos = 0;
+    }
+
+    for (int i = 0; i < num_ref_combos; ++i) {
+      const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i];
+      mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false;
+    }
+  }
+}
+
+static void init_mode_skip_mask(mode_skip_mask_t *mask, const AV1_COMP *cpi,
+                                MACROBLOCK *x, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  unsigned char segment_id = mbmi->segment_id;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  REF_SET ref_set = REF_SET_FULL;
+
+  if (sf->use_real_time_ref_set)
+    ref_set = REF_SET_REALTIME;
+  else if (cpi->oxcf.enable_reduced_reference_set)
+    ref_set = REF_SET_REDUCED;
+
+  default_skip_mask(mask, ref_set);
+
+  int min_pred_mv_sad = INT_MAX;
+  MV_REFERENCE_FRAME ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+    min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) {
+      // Skip checking missing reference in both single and compound reference
+      // modes.
+      disable_reference(ref_frame, mask->ref_combo);
+    } else {
+      // Skip fixed mv modes for poor references
+      if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) {
+        mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+      }
+    }
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+      // Reference not used for the segment.
+      disable_reference(ref_frame, mask->ref_combo);
+    }
+  }
+  // Note: We use the following drop-out only if the SEG_LVL_REF_FRAME feature
+  // is disabled for this segment. This is to prevent the possibility that we
+  // end up unable to pick any mode.
+  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame,
+    // unless ARNR filtering is enabled in which case we want
+    // an unfiltered alternative. We allow near/nearest as well
+    // because they may result in zero-zero MVs but be cheaper.
+    if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+      disable_inter_references_except_altref(mask->ref_combo);
+
+      mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+      const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
+      int_mv near_mv, nearest_mv, global_mv;
+      get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+      get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+      get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+
+      if (near_mv.as_int != global_mv.as_int)
+        mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV);
+      if (nearest_mv.as_int != global_mv.as_int)
+        mask->pred_modes[ALTREF_FRAME] |= (1 << NEARESTMV);
+    }
+  }
+
+  if (cpi->rc.is_src_frame_alt_ref) {
+    if (sf->alt_ref_search_fp) {
+      assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+      mask->pred_modes[ALTREF_FRAME] = 0;
+      disable_inter_references_except_altref(mask->ref_combo);
+      disable_reference(INTRA_FRAME, mask->ref_combo);
+    }
+  }
+
+  if (sf->alt_ref_search_fp)
+    if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
+      if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+        mask->pred_modes[ALTREF_FRAME] |= INTER_ALL;
+
+  if (sf->adaptive_mode_search) {
+    if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
+        cpi->rc.frames_since_golden >= 3)
+      if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME])
+        mask->pred_modes[GOLDEN_FRAME] |= INTER_ALL;
+  }
+
+  if (bsize > sf->max_intra_bsize) {
+    disable_reference(INTRA_FRAME, mask->ref_combo);
+  }
+
+  mask->pred_modes[INTRA_FRAME] |=
+      ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+}
+
 // Please add/modify parameter setting in this function, making it consistent
 // and easy to read and maintain.
 static void set_params_rd_pick_inter_mode(
     const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
-    BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2],
-    uint32_t mode_skip_mask[REF_FRAMES], int skip_ref_frame_mask,
-    unsigned int ref_costs_single[REF_FRAMES],
+    BLOCK_SIZE bsize, int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask,
+    int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES],
     unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
     struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -10616,8 +11350,6 @@ static void set_params_rd_pick_inter_mode(
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  const struct segmentation *const seg = &cm->seg;
-  const SPEED_FEATURES *const sf = &cpi->sf;
   unsigned char segment_id = mbmi->segment_id;
   int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
@@ -10629,7 +11361,7 @@ static void set_params_rd_pick_inter_mode(
   for (int i = 0; i < MB_MODE_COUNT; ++i)
     for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE;
 
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     int len = sizeof(uint16_t);
     args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
     args->above_pred_buf[1] =
@@ -10659,9 +11391,8 @@ static void set_params_rd_pick_inter_mode(
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     x->mbmi_ext->mode_context[ref_frame] = 0;
-    x->mbmi_ext->compound_mode_context[ref_frame] = 0;
     mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
-    if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) {
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
       if (mbmi->partition != PARTITION_NONE &&
           mbmi->partition != PARTITION_SPLIT) {
         if (skip_ref_frame_mask & (1 << ref_frame)) {
@@ -10678,7 +11409,7 @@ static void set_params_rd_pick_inter_mode(
           if (skip) continue;
         }
       }
-      assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+      assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
       setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
                                  yv12_mb);
     }
@@ -10688,8 +11419,8 @@ static void set_params_rd_pick_inter_mode(
     x->mbmi_ext->mode_context[ref_frame] = 0;
     mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
     const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
-    if (!((cpi->ref_frame_flags & ref_frame_flag_list[rf[0]]) &&
-          (cpi->ref_frame_flags & ref_frame_flag_list[rf[1]]))) {
+    if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) &&
+          (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) {
       continue;
     }
 
@@ -10722,93 +11453,122 @@ static void set_params_rd_pick_inter_mode(
         args->left_pred_stride[0]);
   }
 
-  int min_pred_mv_sad = INT_MAX;
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
-    min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+  init_mode_skip_mask(mode_skip_mask, cpi, x, bsize);
 
-  for (int i = 0; i < 2; ++i) {
-    ref_frame_skip_mask[i] = 0;
-  }
-  memset(mode_skip_mask, 0, REF_FRAMES * sizeof(*mode_skip_mask));
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])) {
-      // Skip checking missing references in both single and compound reference
-      // modes. Note that a mode will be skipped iff both reference frames
-      // are masked out.
-      ref_frame_skip_mask[0] |= (1 << ref_frame);
-      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-    } else {
-      // Skip fixed mv modes for poor references
-      if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) {
-        mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
-      }
-    }
-    // If the segment reference frame feature is enabled....
-    // then do nothing if the current ref frame is not allowed..
-    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
-      ref_frame_skip_mask[0] |= (1 << ref_frame);
-      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-    }
+  if (cpi->sf.tx_type_search.fast_intra_tx_type_search ||
+      cpi->oxcf.use_intra_default_tx_only)
+    x->use_default_intra_tx_type = 1;
+  else
+    x->use_default_intra_tx_type = 0;
+
+  if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
+    x->use_default_inter_tx_type = 1;
+  else
+    x->use_default_inter_tx_type = 0;
+  if (cpi->sf.skip_repeat_interpolation_filter_search) {
+    x->interp_filter_stats_idx[0] = 0;
+    x->interp_filter_stats_idx[1] = 0;
   }
+  x->comp_rd_stats_idx = 0;
+}
 
-  // Disable this drop out case if the ref frame
-  // segment level feature is enabled for this segment. This is to
-  // prevent the possibility that we end up unable to pick any mode.
-  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
-    // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame,
-    // unless ARNR filtering is enabled in which case we want
-    // an unfiltered alternative. We allow near/nearest as well
-    // because they may result in zero-zero MVs but be cheaper.
-    if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-      ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << LAST2_FRAME) |
-                               (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) |
-                               (1 << ALTREF2_FRAME) | (1 << GOLDEN_FRAME);
-      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
-      // TODO(zoeliu): To further explore whether following needs to be done for
-      //               BWDREF_FRAME as well.
-      mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
-      const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
-      int_mv near_mv, nearest_mv, global_mv;
-      get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
-      get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
-      get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+// TODO(kyslov): now this is very similar to set_params_rd_pick_inter_mode
+// (except that doesn't set ALTREF parameters)
+//               consider passing a flag to select non-rd path (similar to
+//               encode_sb_row)
+static void set_params_nonrd_pick_inter_mode(
+    const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+    BLOCK_SIZE bsize, int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask,
+    int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES],
+    unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
+    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  unsigned char segment_id = mbmi->segment_id;
+  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+                                   MAX_SB_SIZE >> 1 };
+  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+                                    MAX_SB_SIZE >> 1 };
+  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-      if (near_mv.as_int != global_mv.as_int)
-        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
-      if (nearest_mv.as_int != global_mv.as_int)
-        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
-    }
-  }
+  for (int i = 0; i < MB_MODE_COUNT; ++i)
+    for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE;
 
-  if (cpi->rc.is_src_frame_alt_ref) {
-    if (sf->alt_ref_search_fp) {
-      assert(cpi->ref_frame_flags & ref_frame_flag_list[ALTREF_FRAME]);
-      mode_skip_mask[ALTREF_FRAME] = 0;
-      ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
-      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
-    }
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    int len = sizeof(uint16_t);
+    args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
+    args->above_pred_buf[1] =
+        CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+    args->above_pred_buf[2] =
+        CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
+    args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
+    args->left_pred_buf[1] =
+        CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+    args->left_pred_buf[2] =
+        CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
+  } else {
+    args->above_pred_buf[0] = x->above_pred_buf;
+    args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1);
+    args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE;
+    args->left_pred_buf[0] = x->left_pred_buf;
+    args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1);
+    args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE;
   }
 
-  if (sf->alt_ref_search_fp)
-    if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
-      if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
-        mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+  av1_collect_neighbors_ref_counts(xd);
 
-  if (sf->adaptive_mode_search) {
-    if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
-        cpi->rc.frames_since_golden >= 3)
-      if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME])
-        mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
-  }
+  estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+                           ref_costs_comp);
 
-  if (bsize > sf->max_intra_bsize) {
-    ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
-    ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
+  MV_REFERENCE_FRAME ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    x->pred_mv_sad[ref_frame] = INT_MAX;
+    x->mbmi_ext->mode_context[ref_frame] = 0;
+    mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      if (mbmi->partition != PARTITION_NONE &&
+          mbmi->partition != PARTITION_SPLIT) {
+        if (skip_ref_frame_mask & (1 << ref_frame)) {
+          int skip = 1;
+          for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+            if (!(skip_ref_frame_mask & (1 << r))) {
+              const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+              if (rf[0] == ref_frame || rf[1] == ref_frame) {
+                skip = 0;
+                break;
+              }
+            }
+          }
+          if (skip) continue;
+        }
+      }
+      assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
+      setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+                                 yv12_mb);
+    }
   }
+  av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
 
-  mode_skip_mask[INTRA_FRAME] |=
-      ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+  if (check_num_overlappable_neighbors(mbmi) &&
+      is_motion_variation_allowed_bsize(bsize)) {
+    av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
+                                        args->above_pred_buf, dst_width1,
+                                        dst_height1, args->above_pred_stride);
+    av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
+                                       args->left_pred_buf, dst_width2,
+                                       dst_height2, args->left_pred_stride);
+    av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col,
+                         0, num_planes);
+    calc_target_weighted_pred(
+        cm, x, xd, mi_row, mi_col, args->above_pred_buf[0],
+        args->above_pred_stride[0], args->left_pred_buf[0],
+        args->left_pred_stride[0]);
+  }
+  init_mode_skip_mask(mode_skip_mask, cpi, x, bsize);
 
   if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
     x->use_default_intra_tx_type = 1;
@@ -10900,9 +11660,6 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
     rate2 -= rd_stats_y.rate;
     if (num_planes > 1) rate2 -= search_state->rate_uv_tokenonly[uv_tx];
     rate2 += x->skip_cost[av1_get_skip_context(xd)][1];
-#if CONFIG_ONE_PASS_SVM
-    av1_reg_stat_skipmode_update(&rd_stats_y, x->rdmult);
-#endif
   } else {
     rate2 += x->skip_cost[av1_get_skip_context(xd)][0];
   }
@@ -10919,9 +11676,6 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
     search_state->best_mode_skippable = skippable;
     memcpy(ctx->blk_skip, x->blk_skip,
            sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-#if CONFIG_ONE_PASS_SVM
-    av1_copy_reg_stat(rd_cost, &rd_stats_y);
-#endif
   }
 }
 
@@ -11016,32 +11770,89 @@ static void init_inter_mode_search_state(InterModeSearchState *search_state,
   av1_zero(search_state->single_state_modelled_cnt);
 }
 
+bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask,
+                    const MV_REFERENCE_FRAME *ref_frame,
+                    const PREDICTION_MODE this_mode) {
+  if (mode_skip_mask->pred_modes[ref_frame[0]] & (1 << this_mode)) {
+    return true;
+  }
+
+  return mode_skip_mask->ref_combo[ref_frame[0]][ref_frame[1] + 1];
+}
+
+static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                      BLOCK_SIZE bsize, int mode_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+  const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
+  const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+  const CurrentFrame *const current_frame = &cm->current_frame;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const unsigned char segment_id = mbmi->segment_id;
+  const int comp_pred = ref_frame[1] > INTRA_FRAME;
+
+  if (comp_pred) {
+    if (frame_is_intra_only(cm)) return 1;
+
+    if (current_frame->reference_mode == SINGLE_REFERENCE) return 1;
+
+    // Skip compound inter modes if ARF is not available.
+    if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[1]]))
+      return 1;
+
+    // Do not allow compound prediction if the segment level reference frame
+    // feature is in use as in this case there can only be one reference.
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
+
+    if (!is_comp_ref_allowed(bsize)) return 1;
+  }
+
+  if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) {
+    // Mode must be compatible
+    if (!is_interintra_allowed_mode(this_mode)) return 1;
+    if (!is_interintra_allowed_bsize(bsize)) return 1;
+  }
+
+  return 0;
+}
+
+static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x,
+                                        BLOCK_SIZE bsize, int mib_size,
+                                        int mi_row, int mi_col) {
+  const int sb_size_mask = mib_size - 1;
+  const int mi_row_in_sb = mi_row & sb_size_mask;
+  const int mi_col_in_sb = mi_col & sb_size_mask;
+  const int mi_w = mi_size_wide[bsize];
+  const int mi_h = mi_size_high[bsize];
+  int picked_ref_frames_mask = 0;
+  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_h; ++i) {
+    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_w; ++j) {
+      picked_ref_frames_mask |= x->picked_ref_frames_mask[i * 32 + j];
+    }
+  }
+  return picked_ref_frames_mask;
+}
+
 // Case 1: return 0, means don't skip this mode
 // Case 2: return 1, means skip this mode completely
 // Case 3: return 2, means skip compound only, but still try single motion modes
 static int inter_mode_search_order_independent_skip(
-    const AV1_COMP *cpi, const PICK_MODE_CONTEXT *ctx, const MACROBLOCK *x,
-    BLOCK_SIZE bsize, int mode_index, int mi_row, int mi_col,
-    uint32_t *mode_skip_mask, uint16_t *ref_frame_skip_mask,
-    InterModeSearchState *search_state) {
+    const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index,
+    int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask,
+    InterModeSearchState *search_state, int skip_ref_frame_mask) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   const AV1_COMMON *const cm = &cpi->common;
-  const struct segmentation *const seg = &cm->seg;
   const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
   const CurrentFrame *const current_frame = &cm->current_frame;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
-  const unsigned char segment_id = mbmi->segment_id;
   const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
   const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+  const int comp_pred = ref_frame[1] > INTRA_FRAME;
   int skip_motion_mode = 0;
 
-  if (mode_skip_mask[ref_frame[0]] & (1 << this_mode)) {
-    return 1;
-  }
-
-  if ((ref_frame_skip_mask[0] & (1 << ref_frame[0])) &&
-      (ref_frame_skip_mask[1] & (1 << AOMMAX(0, ref_frame[1])))) {
+  if (mask_says_skip(mode_skip_mask, ref_frame, this_mode)) {
     return 1;
   }
 
@@ -11053,14 +11864,14 @@ static int inter_mode_search_order_independent_skip(
 
   if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
     const int ref_type = av1_ref_frame_type(ref_frame);
-    int skip_ref = ctx->skip_ref_frame_mask & (1 << ref_type);
+    int skip_ref = skip_ref_frame_mask & (1 << ref_type);
     if (ref_type <= ALTREF_FRAME && skip_ref) {
       // Since the compound ref modes depends on the motion estimation result of
       // two single ref modes( best mv of single ref modes as the start point )
       // If current single ref mode is marked skip, we need to check if it will
       // be used in compound ref modes.
       for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
-        if (!(ctx->skip_ref_frame_mask & (1 << r))) {
+        if (!(skip_ref_frame_mask & (1 << r))) {
           const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
           if (rf[0] == ref_type || rf[1] == ref_type) {
             // Found a not skipped compound ref mode which contains current
@@ -11077,8 +11888,7 @@ static int inter_mode_search_order_independent_skip(
     if (skip_ref) return 1;
   }
 
-  if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
-      !x->cb_partition_scan) {
+  if (cpi->two_pass_partition_search && !x->cb_partition_scan) {
     const int mi_width = mi_size_wide[bsize];
     const int mi_height = mi_size_high[bsize];
     int found = 0;
@@ -11101,12 +11911,6 @@ static int inter_mode_search_order_independent_skip(
     if (!found) return 1;
   }
 
-  if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) {
-    // Mode must by compatible
-    if (!is_interintra_allowed_mode(this_mode)) return 1;
-    if (!is_interintra_allowed_bsize(bsize)) return 1;
-  }
-
   // This is only used in motion vector unit test.
   if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME)
     return 1;
@@ -11121,22 +11925,6 @@ static int inter_mode_search_order_independent_skip(
           x->source_variance < skip_intra_var_thresh)
         return 1;
     }
-  } else {
-    if (!is_comp_ref_allowed(bsize) && ref_frame[1] > INTRA_FRAME) return 1;
-  }
-
-  const int comp_pred = ref_frame[1] > INTRA_FRAME;
-  if (comp_pred) {
-    if (!cpi->allow_comp_inter_inter) return 1;
-
-    if (current_frame->reference_mode == SINGLE_REFERENCE) return 1;
-
-    // Skip compound inter modes if ARF is not available.
-    if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]])) return 1;
-
-    // Do not allow compound prediction if the segment level reference frame
-    // feature is in use as in this case there can only be one reference.
-    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
   }
 
   if (sf->selective_ref_frame) {
@@ -11176,8 +11964,7 @@ static int inter_mode_search_order_independent_skip(
   if ((sf->selective_ref_frame >= 2) && comp_pred && !cpi->all_one_sided_refs) {
     unsigned int ref_offsets[2];
     for (int i = 0; i < 2; ++i) {
-      const RefCntBuffer *const buf =
-          cm->current_frame.frame_refs[ref_frame[i] - LAST_FRAME].buf;
+      const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame[i]);
       assert(buf != NULL);
       ref_offsets[i] = buf->order_hint;
     }
@@ -11192,12 +11979,57 @@ static int inter_mode_search_order_independent_skip(
       return 1;
   }
 
+  if (sf->selective_ref_frame >= 4 && comp_pred) {
+    // Check if one of the reference is ALTREF2_FRAME and BWDREF_FRAME is a
+    // valid reference.
+    if ((ref_frame[0] == ALTREF2_FRAME || ref_frame[1] == ALTREF2_FRAME) &&
+        (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) {
+      // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references.
+      if ((get_relative_dist(
+               order_hint_info,
+               cm->cur_frame->ref_order_hints[ALTREF2_FRAME - LAST_FRAME],
+               current_frame->order_hint) > 0) &&
+          (get_relative_dist(
+               order_hint_info,
+               cm->cur_frame->ref_order_hints[BWDREF_FRAME - LAST_FRAME],
+               current_frame->order_hint) > 0)) {
+        // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer
+        // reference to the current frame than ALTREF2_FRAME
+        if (get_relative_dist(
+                order_hint_info,
+                cm->cur_frame->ref_order_hints[ALTREF2_FRAME - LAST_FRAME],
+                cm->cur_frame->ref_order_hints[BWDREF_FRAME - LAST_FRAME]) >=
+            0) {
+          const RefCntBuffer *const buf_arf2 =
+              get_ref_frame_buf(cm, ALTREF2_FRAME);
+          assert(buf_arf2 != NULL);
+          const RefCntBuffer *const buf_bwd =
+              get_ref_frame_buf(cm, BWDREF_FRAME);
+          assert(buf_bwd != NULL);
+          (void)buf_arf2;
+          (void)buf_bwd;
+          return 1;
+        }
+      }
+    }
+  }
+
   if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) {
     return 1;
   }
   if (skip_motion_mode) {
     return 2;
   }
+
+  if (!cpi->oxcf.enable_global_motion &&
+      (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) {
+    return 1;
+  }
+
+  if (!cpi->oxcf.enable_onesided_comp && comp_pred && cpi->all_one_sided_refs) {
+    return 1;
+  }
+
   return 0;
 }
 
@@ -11233,6 +12065,7 @@ static int64_t handle_intra_mode(InterModeSearchState *search_state,
   assert(mbmi->ref_frame[0] == INTRA_FRAME);
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int try_palette =
+      cpi->oxcf.enable_palette &&
       av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
   const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
   const int intra_cost_penalty = av1_get_intra_cost_penalty(
@@ -11255,14 +12088,14 @@ static int64_t handle_intra_mode(InterModeSearchState *search_state,
 
   TX_SIZE uv_tx;
   int is_directional_mode = av1_is_directional_mode(mbmi->mode);
-  if (is_directional_mode && av1_use_angle_delta(bsize)) {
+  if (is_directional_mode && av1_use_angle_delta(bsize) &&
+      cpi->oxcf.enable_angle_delta) {
     int rate_dummy;
     int64_t model_rd = INT64_MAX;
     if (sf->intra_angle_estimation && !search_state->angle_stats_ready) {
       const int src_stride = x->plane[0].src.stride;
       const uint8_t *src = x->plane[0].src.buf;
-      angle_estimation(src, src_stride, rows, cols, bsize,
-                       xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH,
+      angle_estimation(src, src_stride, rows, cols, bsize, is_cur_buf_hbd(xd),
                        search_state->directional_mode_skip_mask);
       search_state->angle_stats_ready = 1;
     }
@@ -11795,6 +12628,16 @@ static void release_compound_type_rd_buffers(
   av1_zero(*bufs);  // Set all pointers to NULL for safety.
 }
 
+// Enables do_tx_search on a per-mode basis.
+int do_tx_search_mode(int do_tx_search_global, int midx, int adaptive) {
+  if (!adaptive || do_tx_search_global) {
+    return do_tx_search_global;
+  }
+  // A value of 2 indicates it is being turned on conditionally
+  // for the mode. Turn it on for the first 7 modes.
+  return midx < 7 ? 2 : 0;
+}
+
 void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
                                MACROBLOCK *x, int mi_row, int mi_col,
                                RD_STATS *rd_cost, BLOCK_SIZE bsize,
@@ -11805,6 +12648,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const int try_palette =
+      cpi->oxcf.enable_palette &&
       av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const struct segmentation *const seg = &cm->seg;
@@ -11815,16 +12659,8 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
   unsigned int ref_costs_single[REF_FRAMES];
   unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
   int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
-  int *mode_map = tile_data->mode_map[bsize];
-  uint32_t mode_skip_mask[REF_FRAMES];
-  uint16_t ref_frame_skip_mask[2];
+  mode_skip_mask_t mode_skip_mask;
   uint8_t motion_mode_skip_mask = 0;  // second pass of single ref modes
-#if CONFIG_ONE_PASS_SVM
-  int temp_y_eob = 0, temp_y_eob_0 = 0, temp_y_eob_1 = 0, temp_y_eob_2 = 0,
-      temp_y_eob_3 = 0;
-  int64_t temp_y_rd = 0, temp_y_rd_0 = 0, temp_y_rd_1 = 0, temp_y_rd_2 = 0,
-          temp_y_rd_3 = 0;
-#endif
 
   InterModeSearchState search_state;
   init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
@@ -11847,23 +12683,42 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
 
   av1_invalid_rd_stats(rd_cost);
 
+  // Ref frames that are selected by square partition blocks.
+  int picked_ref_frames_mask = 0;
+  if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+      mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+    // prune_ref_frame_for_rect_partitions = 1 implies prune only extended
+    // partition blocks. prune_ref_frame_for_rect_partitions >=2
+    // implies prune for vert, horiz and extended partition blocks.
+    if ((mbmi->partition != PARTITION_VERT &&
+         mbmi->partition != PARTITION_HORZ) ||
+        cpi->sf.prune_ref_frame_for_rect_partitions >= 2) {
+      picked_ref_frames_mask = fetch_picked_ref_frames_mask(
+          x, bsize, cm->seq_params.mib_size, mi_row, mi_col);
+    }
+  }
+
+  // Skip ref frames that never selected by square blocks.
+  const int skip_ref_frame_mask =
+      picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
+
   // init params, set frame modes, speed features
-  set_params_rd_pick_inter_mode(
-      cpi, x, &args, bsize, mi_row, mi_col, ref_frame_skip_mask, mode_skip_mask,
-      ctx->skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb);
+  set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col,
+                                &mode_skip_mask, skip_ref_frame_mask,
+                                ref_costs_single, ref_costs_comp, yv12_mb);
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
   int64_t best_est_rd = INT64_MAX;
   // TODO(angiebird): Turn this on when this speed feature is well tested
-#if 1
   const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
-  const int do_tx_search = !md->ready;
-#else
-  const int do_tx_search = 1;
-#endif
+  // If do_tx_search_global is 0, only estimated RD should be computed.
+  // If do_tx_search_global is 1, all modes have TX search performed.
+  // If do_tx_search_global is 2, some modes will have TX search performed.
+  const int do_tx_search_global =
+      !((cpi->sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
+        (cpi->sf.inter_mode_rd_model_estimation == 2 &&
+         x->source_variance < 512));
   InterModesInfo *inter_modes_info = x->inter_modes_info;
   inter_modes_info->num = 0;
-#endif
 
   int intra_mode_num = 0;
   int intra_mode_idx_ls[MAX_MODES];
@@ -11876,8 +12731,9 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
   alloc_compound_type_rd_buffers(cm, &rd_buffers);
 
   for (int midx = 0; midx < MAX_MODES; ++midx) {
-    int mode_index = mode_map[midx];
-    const MODE_DEFINITION *mode_order = &av1_mode_order[mode_index];
+    const int do_tx_search = do_tx_search_mode(
+        do_tx_search_global, midx, sf->inter_mode_rd_model_estimation_adaptive);
+    const MODE_DEFINITION *mode_order = &av1_mode_order[midx];
     this_mode = mode_order->mode;
     const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
     const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
@@ -11899,8 +12755,8 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
         if (args.single_ref_first_pass) {
           // clear stats
           for (int k = 0; k < MAX_REF_MV_SERCH; ++k) {
-            x->simple_rd_state[mode_index][k].rd_stats.rdcost = INT64_MAX;
-            x->simple_rd_state[mode_index][k].early_skipped = 0;
+            x->simple_rd_state[midx][k].rd_stats.rdcost = INT64_MAX;
+            x->simple_rd_state[midx][k].early_skipped = 0;
           }
         } else {
           if (motion_mode_skip_mask & (1 << ref_frame)) {
@@ -11923,14 +12779,16 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     int skippable = 0;
     int this_skip2 = 0;
 
-    init_mbmi(mbmi, mode_index, cm);
+    init_mbmi(mbmi, midx, cm);
 
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
+    if (inter_mode_compatible_skip(cpi, x, bsize, midx)) continue;
+
     const int ret = inter_mode_search_order_independent_skip(
-        cpi, ctx, x, bsize, mode_index, mi_row, mi_col, mode_skip_mask,
-        ref_frame_skip_mask, &search_state);
+        cpi, x, bsize, midx, mi_row, mi_col, &mode_skip_mask, &search_state,
+        skip_ref_frame_mask);
     if (ret == 1) continue;
     args.skip_motion_mode = (ret == 2);
 
@@ -11940,8 +12798,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
       }
     }
 
-    if (search_state.best_rd < search_state.mode_threshold[mode_index])
-      continue;
+    if (search_state.best_rd < search_state.mode_threshold[midx]) continue;
 
     if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
       if (compound_skip_by_single_states(cpi, &search_state, this_mode,
@@ -11967,7 +12824,12 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     }
 
     if (ref_frame == INTRA_FRAME) {
-      if (sf->adaptive_mode_search)
+      if ((!cpi->oxcf.enable_smooth_intra || sf->disable_smooth_intra) &&
+          (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+           mbmi->mode == SMOOTH_V_PRED))
+        continue;
+      if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
+      if (sf->adaptive_mode_search > 1)
         if ((x->source_variance << num_pels_log2_lookup[bsize]) >
             search_state.best_pred_sse)
           continue;
@@ -11995,7 +12857,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     }
 
     if (ref_frame == INTRA_FRAME) {
-      intra_mode_idx_ls[intra_mode_num++] = mode_index;
+      intra_mode_idx_ls[intra_mode_num++] = midx;
       continue;
     } else {
       mbmi->angle_delta[PLANE_TYPE_Y] = 0;
@@ -12014,30 +12876,25 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
         args.single_newmv_valid = search_state.single_newmv_valid;
         args.single_comp_cost = real_compmode_cost;
         args.ref_frame_cost = ref_frame_cost;
-        if (mode_index < MAX_SINGLE_REF_MODES) {
-          args.simple_rd_state = x->simple_rd_state[mode_index];
+        if (midx < MAX_SINGLE_REF_MODES) {
+          args.simple_rd_state = x->simple_rd_state[midx];
         }
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        start_timing(cpi, handle_inter_mode_time);
+#endif
         this_rd = handle_inter_mode(
-            cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip,
-            mi_row, mi_col, &args, ref_best_rd, tmp_buf, &rd_buffers, tile_data,
-            &best_est_rd, do_tx_search, inter_modes_info);
-#else
-        this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
-                                    &rd_stats_uv, &disable_skip, mi_row, mi_col,
-                                    &args, ref_best_rd, tmp_buf, &rd_buffers);
+            cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+            &disable_skip, mi_row, mi_col, &args, ref_best_rd, tmp_buf,
+            &rd_buffers, &best_est_rd, do_tx_search, inter_modes_info);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        end_timing(cpi, handle_inter_mode_time);
 #endif
         rate2 = rd_stats.rate;
         skippable = rd_stats.skip;
         distortion2 = rd_stats.dist;
         rate_y = rd_stats_y.rate;
         rate_uv = rd_stats_uv.rate;
-#if CONFIG_ONE_PASS_SVM
-        av1_unpack_reg_stat(&rd_stats_y, &temp_y_eob, &temp_y_eob_0,
-                            &temp_y_eob_1, &temp_y_eob_2, &temp_y_eob_3,
-                            &temp_y_rd, &temp_y_rd_0, &temp_y_rd_1,
-                            &temp_y_rd_2, &temp_y_rd_3);
-#endif
       }
 
       if (sf->prune_comp_search_by_single_result > 0 &&
@@ -12063,7 +12920,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
       }
       if (!mode_excluded) {
         // Note index of best mode so far
-        search_state.best_mode_index = mode_index;
+        search_state.best_mode_index = midx;
 
         if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
@@ -12079,7 +12936,6 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
         search_state.best_mbmode = *mbmi;
         search_state.best_skip2 = this_skip2;
         search_state.best_mode_skippable = skippable;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
         if (do_tx_search) {
           // When do_tx_search == 0, handle_inter_mode won't provide correct
           // rate_y and rate_uv because txfm_search process is replaced by
@@ -12090,24 +12946,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
               rate_y +
               x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
           search_state.best_rate_uv = rate_uv;
-
-#if CONFIG_ONE_PASS_SVM
-          av1_set_reg_stat(rd_cost, temp_y_eob, temp_y_eob_0, temp_y_eob_1,
-                           temp_y_eob_2, temp_y_eob_3, temp_y_rd, temp_y_rd_0,
-                           temp_y_rd_1, temp_y_rd_2, temp_y_rd_3);
-#endif
         }
-#else  // CONFIG_COLLECT_INTER_MODE_RD_STATS
-        search_state.best_rate_y =
-            rate_y +
-            x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
-        search_state.best_rate_uv = rate_uv;
-#if CONFIG_ONE_PASS_SVM
-        av1_set_reg_stat(rd_cost, temp_y_eob, temp_y_eob_0, temp_y_eob_1,
-                         temp_y_eob_2, temp_y_eob_3, temp_y_rd, temp_y_rd_0,
-                         temp_y_rd_1, temp_y_rd_2, temp_y_rd_3);
-#endif
-#endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
         memcpy(ctx->blk_skip, x->blk_skip,
                sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
       }
@@ -12148,51 +12987,67 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
 
   release_compound_type_rd_buffers(&rd_buffers);
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-  if (!do_tx_search) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, do_tx_search_time);
+#endif
+  if (do_tx_search_global != 1) {
     inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
     search_state.best_rd = INT64_MAX;
 
     int64_t top_est_rd =
-        inter_modes_info->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx];
+        inter_modes_info->num > 0
+            ? inter_modes_info
+                  ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]
+            : INT64_MAX;
     for (int j = 0; j < inter_modes_info->num; ++j) {
       const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
       *mbmi = inter_modes_info->mbmi_arr[data_idx];
       int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
-      if (curr_est_rd * 0.9 > top_est_rd) {
-        continue;
-      }
-      const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
-
-      x->skip = 0;
-      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-
-      // Select prediction reference frames.
-      const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
-      for (i = 0; i < num_planes; i++) {
-        xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
-        if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
-      }
+      if (curr_est_rd * 0.80 > top_est_rd) break;
 
       RD_STATS rd_stats;
       RD_STATS rd_stats_y;
       RD_STATS rd_stats_uv;
 
-      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
-      if (mbmi->motion_mode == OBMC_CAUSAL)
-        av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-
-      if (!txfm_search(cpi, x, bsize, mi_row, mi_col, &rd_stats, &rd_stats_y,
-                       &rd_stats_uv, mode_rate, search_state.best_rd)) {
-        continue;
+      bool true_rd = inter_modes_info->true_rd_arr[data_idx];
+      if (true_rd) {
+        rd_stats = inter_modes_info->rd_cost_arr[data_idx];
+        rd_stats_y = inter_modes_info->rd_cost_y_arr[data_idx];
+        rd_stats_uv = inter_modes_info->rd_cost_uv_arr[data_idx];
+        memcpy(x->blk_skip, inter_modes_info->blk_skip_arr[data_idx],
+               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
       } else {
-        const int skip_ctx = av1_get_skip_context(xd);
-        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
-                             rd_stats.dist,
-                             rd_stats_y.rate + rd_stats_uv.rate +
-                                 x->skip_cost[skip_ctx][mbmi->skip]);
+        const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+
+        x->skip = 0;
+        set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+        // Select prediction reference frames.
+        const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+        for (i = 0; i < num_planes; i++) {
+          xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+          if (is_comp_pred)
+            xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+        }
+
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                      av1_num_planes(cm) - 1);
+        if (mbmi->motion_mode == OBMC_CAUSAL)
+          av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+        if (!txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, &rd_stats,
+                         &rd_stats_y, &rd_stats_uv, mode_rate,
+                         search_state.best_rd)) {
+          continue;
+        } else if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+          const int skip_ctx = av1_get_skip_context(xd);
+          inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+                               rd_stats.dist,
+                               rd_stats_y.rate + rd_stats_uv.rate +
+                                   x->skip_cost[skip_ctx][mbmi->skip]);
+        }
+        rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
       }
-      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
 
       if (rd_stats.rdcost < search_state.best_rd) {
         search_state.best_rd = rd_stats.rdcost;
@@ -12211,14 +13066,16 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
         search_state.best_rate_uv = rd_stats_uv.rate;
         memcpy(ctx->blk_skip, x->blk_skip,
                sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-#if CONFIG_ONE_PASS_SVM
-        av1_copy_reg_stat(rd_cost, &rd_stats_y);
-#endif
       }
     }
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, do_tx_search_time);
 #endif
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, handle_intra_mode_time);
+#endif
   for (int j = 0; j < intra_mode_num; ++j) {
     const int mode_index = intra_mode_idx_ls[j];
     const MV_REFERENCE_FRAME ref_frame =
@@ -12256,11 +13113,11 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
       search_state.best_rate_uv = intra_rd_stats_uv.rate;
       memcpy(ctx->blk_skip, x->blk_skip,
              sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-#if CONFIG_ONE_PASS_SVM
-      av1_copy_reg_stat(rd_cost, &intra_rd_stats_y);
-#endif
     }
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, handle_intra_mode_time);
+#endif
 
   // In effect only when speed >= 2.
   sf_refine_fast_tx_type_search(
@@ -12273,7 +13130,6 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi,
                         ref_costs_single, &search_state);
   }
-
   search_state.best_mbmode.skip_mode = 0;
   if (cm->current_frame.skip_mode_info.skip_mode_flag &&
       !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
@@ -12351,6 +13207,496 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
   }
 }
 
+// TODO(kyslov): now this is very similar to av1_rd_pick_inter_mode_sb except:
+//                 it only checks non-compound mode and
+//                 it doesn't check palette mode
+//                 it doesn't refine tx search
+//               this function is likely to be heavily modified with nonrd mode
+//               decision
+void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
+                                  MACROBLOCK *x, int mi_row, int mi_col,
+                                  RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  int64_t best_rd_so_far) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const struct segmentation *const seg = &cm->seg;
+  PREDICTION_MODE this_mode;
+  unsigned char segment_id = mbmi->segment_id;
+  int i;
+  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+  unsigned int ref_costs_single[REF_FRAMES];
+  unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+  mode_skip_mask_t mode_skip_mask;
+  uint8_t motion_mode_skip_mask = 0;  // second pass of single ref modes
+
+  InterModeSearchState search_state;
+  init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
+                               best_rd_so_far);
+  INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+    INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+    INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+  };
+  HandleInterModeArgs args = {
+    { NULL },  { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+    { NULL },  { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 },
+    NULL,      NULL,
+    NULL,      search_state.modelled_rd,
+    { { 0 } }, INT_MAX,
+    INT_MAX,   search_state.simple_rd,
+    0,         interintra_modes,
+    1,         NULL
+  };
+  for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+  av1_invalid_rd_stats(rd_cost);
+
+  // Ref frames that are selected by square partition blocks.
+  int picked_ref_frames_mask = 0;
+  if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+      mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+    // Don't enable for vert and horz partition blocks if current frame
+    // will be used as bwd or arf2.
+    if ((!cpi->refresh_bwd_ref_frame && !cpi->refresh_alt2_ref_frame) ||
+        (mbmi->partition != PARTITION_VERT &&
+         mbmi->partition != PARTITION_HORZ)) {
+      picked_ref_frames_mask = fetch_picked_ref_frames_mask(
+          x, bsize, cm->seq_params.mib_size, mi_row, mi_col);
+    }
+  }
+
+  // Skip ref frames that never selected by square blocks.
+  const int skip_ref_frame_mask =
+      picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
+
+  // init params, set frame modes, speed features
+  set_params_nonrd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col,
+                                   &mode_skip_mask, skip_ref_frame_mask,
+                                   ref_costs_single, ref_costs_comp, yv12_mb);
+
+  int64_t best_est_rd = INT64_MAX;
+  InterModesInfo *inter_modes_info = x->inter_modes_info;
+  inter_modes_info->num = 0;
+
+  int intra_mode_num = 0;
+  int intra_mode_idx_ls[MAX_MODES];
+  int reach_first_comp_mode = 0;
+
+  // Temporary buffers used by handle_inter_mode().
+  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_obmc_bufs[0]);
+
+  CompoundTypeRdBuffers rd_buffers;
+  alloc_compound_type_rd_buffers(cm, &rd_buffers);
+
+  for (int midx = 0; midx < MAX_MODES; ++midx) {
+    const MODE_DEFINITION *mode_order = &av1_mode_order[midx];
+    this_mode = mode_order->mode;
+    const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
+    const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
+    const int comp_pred = second_ref_frame > INTRA_FRAME;
+
+    if (second_ref_frame != NONE_FRAME) continue;
+
+    // When single ref motion search ends:
+    // 1st pass: To evaluate single ref RD results and rewind to the beginning;
+    // 2nd pass: To continue with compound ref search.
+    if (sf->prune_single_motion_modes_by_simple_trans) {
+      if (comp_pred && args.single_ref_first_pass) {
+        args.single_ref_first_pass = 0;
+        // Reach the first comp ref mode
+        // Reset midx to start the 2nd pass for single ref motion search
+        midx = -1;
+        motion_mode_skip_mask = analyze_simple_trans_states(cpi, x);
+        continue;
+      }
+      if (!comp_pred && ref_frame != INTRA_FRAME) {  // single ref mode
+        if (args.single_ref_first_pass) {
+          // clear stats
+          for (int k = 0; k < MAX_REF_MV_SERCH; ++k) {
+            x->simple_rd_state[midx][k].rd_stats.rdcost = INT64_MAX;
+            x->simple_rd_state[midx][k].early_skipped = 0;
+          }
+        } else {
+          if (motion_mode_skip_mask & (1 << ref_frame)) {
+            continue;
+          }
+        }
+      }
+    }
+
+    // Reach the first compound prediction mode
+    if (sf->prune_comp_search_by_single_result > 0 && comp_pred &&
+        reach_first_comp_mode == 0) {
+      analyze_single_states(cpi, &search_state);
+      reach_first_comp_mode = 1;
+    }
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0;
+    int rate2 = 0;
+    int64_t distortion2 = 0;
+    int skippable = 0;
+    int this_skip2 = 0;
+
+    init_mbmi(mbmi, midx, cm);
+
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    if (inter_mode_compatible_skip(cpi, x, bsize, midx)) continue;
+
+    const int ret = inter_mode_search_order_independent_skip(
+        cpi, x, bsize, midx, mi_row, mi_col, &mode_skip_mask, &search_state,
+        skip_ref_frame_mask);
+    if (ret == 1) continue;
+    args.skip_motion_mode = (ret == 2);
+
+    if (sf->drop_ref && comp_pred) {
+      if (sf_check_is_drop_ref(mode_order, &search_state)) {
+        continue;
+      }
+    }
+
+    if (search_state.best_rd < search_state.mode_threshold[midx]) continue;
+
+    if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
+      if (compound_skip_by_single_states(cpi, &search_state, this_mode,
+                                         ref_frame, second_ref_frame, x))
+        continue;
+    }
+
+    const int ref_frame_cost = comp_pred
+                                   ? ref_costs_comp[ref_frame][second_ref_frame]
+                                   : ref_costs_single[ref_frame];
+    const int compmode_cost =
+        is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0;
+    const int real_compmode_cost =
+        cm->current_frame.reference_mode == REFERENCE_MODE_SELECT
+            ? compmode_cost
+            : 0;
+
+    if (comp_pred) {
+      if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+          search_state.best_mode_index >= 0 &&
+          search_state.best_mbmode.ref_frame[0] == INTRA_FRAME)
+        continue;
+    }
+
+    if (ref_frame == INTRA_FRAME) {
+      if (!cpi->oxcf.enable_smooth_intra &&
+          (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+           mbmi->mode == SMOOTH_V_PRED))
+        continue;
+      if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
+      if (sf->adaptive_mode_search > 1)
+        if ((x->source_variance << num_pels_log2_lookup[bsize]) >
+            search_state.best_pred_sse)
+          continue;
+
+      if (this_mode != DC_PRED) {
+        // Only search the oblique modes if the best so far is
+        // one of the neighboring directional modes
+        if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+            (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+          if (search_state.best_mode_index >= 0 &&
+              search_state.best_mbmode.ref_frame[0] > INTRA_FRAME)
+            continue;
+        }
+        if (sf->mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+          if (conditional_skipintra(this_mode, search_state.best_intra_mode))
+            continue;
+        }
+      }
+    }
+
+    // Select prediction reference frames.
+    for (i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+    }
+
+    if (ref_frame == INTRA_FRAME) {
+      intra_mode_idx_ls[intra_mode_num++] = midx;
+      continue;
+    } else {
+      mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+      mbmi->filter_intra_mode_info.use_filter_intra = 0;
+      mbmi->ref_mv_idx = 0;
+      int64_t ref_best_rd = search_state.best_rd;
+      {
+        RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
+        av1_init_rd_stats(&rd_stats);
+        rd_stats.rate = rate2;
+
+        // Point to variables that are maintained between loop iterations
+        args.single_newmv = search_state.single_newmv;
+        args.single_newmv_rate = search_state.single_newmv_rate;
+        args.single_newmv_valid = search_state.single_newmv_valid;
+        args.single_comp_cost = real_compmode_cost;
+        args.ref_frame_cost = ref_frame_cost;
+        if (midx < MAX_SINGLE_REF_MODES) {
+          args.simple_rd_state = x->simple_rd_state[midx];
+        }
+        this_rd = handle_inter_mode(
+            cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+            &disable_skip, mi_row, mi_col, &args, ref_best_rd, tmp_buf,
+            &rd_buffers, &best_est_rd, 0, inter_modes_info);
+        rate2 = rd_stats.rate;
+        skippable = rd_stats.skip;
+        distortion2 = rd_stats.dist;
+      }
+
+      if (sf->prune_comp_search_by_single_result > 0 &&
+          is_inter_singleref_mode(this_mode) && args.single_ref_first_pass) {
+        collect_single_states(x, &search_state, mbmi);
+      }
+
+      if (this_rd == INT64_MAX) continue;
+
+      this_skip2 = mbmi->skip;
+      this_rd = RDCOST(x->rdmult, rate2, distortion2);
+    }
+
+    // Did this mode help.. i.e. is it the new best mode
+    if (this_rd < search_state.best_rd || x->skip) {
+      int mode_excluded = 0;
+      if (comp_pred) {
+        mode_excluded = cm->current_frame.reference_mode == SINGLE_REFERENCE;
+      }
+      if (!mode_excluded) {
+        // Note index of best mode so far
+        search_state.best_mode_index = midx;
+
+        if (ref_frame == INTRA_FRAME) {
+          /* required for left and above block mv */
+          mbmi->mv[0].as_int = 0;
+        } else {
+          search_state.best_pred_sse = x->pred_sse[ref_frame];
+        }
+
+        rd_cost->rate = rate2;
+        rd_cost->dist = distortion2;
+        rd_cost->rdcost = this_rd;
+        search_state.best_rd = this_rd;
+        search_state.best_mbmode = *mbmi;
+        search_state.best_skip2 = this_skip2;
+        search_state.best_mode_skippable = skippable;
+        memcpy(ctx->blk_skip, x->blk_skip,
+               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      }
+    }
+
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && ref_frame != INTRA_FRAME) {
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
+
+      single_rd = RDCOST(x->rdmult, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2);
+
+      if (!comp_pred) {
+        if (single_rd < search_state.best_pred_rd[SINGLE_REFERENCE])
+          search_state.best_pred_rd[SINGLE_REFERENCE] = single_rd;
+      } else {
+        if (single_rd < search_state.best_pred_rd[COMPOUND_REFERENCE])
+          search_state.best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+      }
+      if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
+        search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+    }
+    if (sf->drop_ref && second_ref_frame == NONE_FRAME) {
+      // Collect data from single ref mode, and analyze data.
+      sf_drop_ref_analyze(&search_state, mode_order, distortion2);
+    }
+
+    if (x->skip && !comp_pred) break;
+  }
+
+  release_compound_type_rd_buffers(&rd_buffers);
+
+  inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+  search_state.best_rd = INT64_MAX;
+
+  if (inter_modes_info->num > 0) {
+    const int data_idx = inter_modes_info->rd_idx_pair_arr[0].idx;
+    *mbmi = inter_modes_info->mbmi_arr[data_idx];
+    const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+    // Select prediction reference frames.
+    const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+    for (i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+      if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+    }
+
+    RD_STATS rd_stats;
+    RD_STATS rd_stats_y;
+    RD_STATS rd_stats_uv;
+
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                  av1_num_planes(cm) - 1);
+    if (mbmi->motion_mode == OBMC_CAUSAL)
+      av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+    if (txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, &rd_stats,
+                    &rd_stats_y, &rd_stats_uv, mode_rate,
+                    search_state.best_rd)) {
+      if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+        const int skip_ctx = av1_get_skip_context(xd);
+        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+                             rd_stats.dist,
+                             rd_stats_y.rate + rd_stats_uv.rate +
+                                 x->skip_cost[skip_ctx][mbmi->skip]);
+      }
+      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+
+      if (rd_stats.rdcost < search_state.best_rd) {
+        search_state.best_rd = rd_stats.rdcost;
+        // Note index of best mode so far
+        const int mode_index = get_prediction_mode_idx(
+            mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+        search_state.best_mode_index = mode_index;
+        *rd_cost = rd_stats;
+        search_state.best_rd = rd_stats.rdcost;
+        search_state.best_mbmode = *mbmi;
+        search_state.best_skip2 = mbmi->skip;
+        search_state.best_mode_skippable = rd_stats.skip;
+        search_state.best_rate_y =
+            rd_stats_y.rate +
+            x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip];
+        search_state.best_rate_uv = rd_stats_uv.rate;
+        memcpy(ctx->blk_skip, x->blk_skip,
+               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      }
+    }
+  }
+
+  for (int j = 0; j < intra_mode_num; ++j) {
+    const int mode_index = intra_mode_idx_ls[j];
+    const MV_REFERENCE_FRAME ref_frame =
+        av1_mode_order[mode_index].ref_frame[0];
+    assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME);
+    assert(ref_frame == INTRA_FRAME);
+    if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break;
+    init_mbmi(mbmi, mode_index, cm);
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME);
+
+    // Select prediction reference frames.
+    for (i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+    }
+
+    RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
+
+    const int ref_frame_cost = ref_costs_single[ref_frame];
+    intra_rd_stats.rdcost = handle_intra_mode(
+        &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0,
+        &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
+    if (intra_rd_stats.rdcost < search_state.best_rd) {
+      search_state.best_rd = intra_rd_stats.rdcost;
+      // Note index of best mode so far
+      search_state.best_mode_index = mode_index;
+      *rd_cost = intra_rd_stats;
+      search_state.best_rd = intra_rd_stats.rdcost;
+      search_state.best_mbmode = *mbmi;
+      search_state.best_skip2 = 0;
+      search_state.best_mode_skippable = intra_rd_stats.skip;
+      search_state.best_rate_y =
+          intra_rd_stats_y.rate +
+          x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip];
+      search_state.best_rate_uv = intra_rd_stats_uv.rate;
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+    }
+  }
+
+  search_state.best_mbmode.skip_mode = 0;
+  if (cm->current_frame.skip_mode_info.skip_mode_flag &&
+      !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+      is_comp_ref_allowed(bsize)) {
+    rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, mi_row, mi_col,
+                      yv12_mb);
+  }
+
+  // Make sure that the ref_mv_idx is only nonzero when we're
+  // using a mode which can support ref_mv_idx
+  if (search_state.best_mbmode.ref_mv_idx != 0 &&
+      !(search_state.best_mbmode.mode == NEWMV ||
+        search_state.best_mbmode.mode == NEW_NEWMV ||
+        have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) {
+    search_state.best_mbmode.ref_mv_idx = 0;
+  }
+
+  if (search_state.best_mode_index < 0 ||
+      search_state.best_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+
+  assert(
+      (cm->interp_filter == SWITCHABLE) ||
+      (cm->interp_filter ==
+       av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 0)) ||
+      !is_inter_block(&search_state.best_mbmode));
+  assert(
+      (cm->interp_filter == SWITCHABLE) ||
+      (cm->interp_filter ==
+       av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 1)) ||
+      !is_inter_block(&search_state.best_mbmode));
+
+  if (!cpi->rc.is_src_frame_alt_ref)
+    av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+                              sf->adaptive_rd_thresh, bsize,
+                              search_state.best_mode_index);
+
+  // macroblock modes
+  *mbmi = search_state.best_mbmode;
+  x->skip |= search_state.best_skip2;
+
+  // Note: this section is needed since the mode may have been forced to
+  // GLOBALMV by the all-zero mode handling of ref-mv.
+  if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) {
+    // Correct the interp filters for GLOBALMV
+    if (is_nontrans_global_motion(xd, xd->mi[0])) {
+      assert(mbmi->interp_filters ==
+             av1_broadcast_interp_filter(
+                 av1_unswitchable_filter(cm->interp_filter)));
+    }
+  }
+
+  for (i = 0; i < REFERENCE_MODES; ++i) {
+    if (search_state.best_pred_rd[i] == INT64_MAX)
+      search_state.best_pred_diff[i] = INT_MIN;
+    else
+      search_state.best_pred_diff[i] =
+          search_state.best_rd - search_state.best_pred_rd[i];
+  }
+
+  x->skip |= search_state.best_mode_skippable;
+
+  assert(search_state.best_mode_index >= 0);
+
+  store_coding_context(x, ctx, search_state.best_mode_index,
+                       search_state.best_pred_diff,
+                       search_state.best_mode_skippable);
+}
+
 void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
                                         TileDataEnc *tile_data, MACROBLOCK *x,
                                         int mi_row, int mi_col,
@@ -12494,7 +13840,7 @@ static INLINE void calc_target_weighted_pred_above(
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
   int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE);
   const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hbd = is_cur_buf_hbd(xd);
 
   if (!is_hbd) {
     for (int row = 0; row < ctxt->overlap; ++row) {
@@ -12540,7 +13886,7 @@ static INLINE void calc_target_weighted_pred_left(
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
   int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
   const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hbd = is_cur_buf_hbd(xd);
 
   if (!is_hbd) {
     for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
@@ -12622,7 +13968,7 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
   int32_t *mask_buf = x->mask_buf;
   int32_t *wsrc_buf = x->wsrc_buf;
 
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hbd = is_cur_buf_hbd(xd);
   const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
 
   // plane 0 should not be subsampled
@@ -12741,12 +14087,14 @@ void gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
   }
 }
 
-static uint16_t edge_probability(const uint8_t *input, int w, int h,
+static EdgeInfo edge_probability(const uint8_t *input, int w, int h,
                                  bool high_bd, int bd) {
   // The probability of an edge in the whole image is the same as the highest
   // probability of an edge for any individual pixel. Use Sobel as the metric
   // for finding an edge.
   uint16_t highest = 0;
+  uint16_t highest_x = 0;
+  uint16_t highest_y = 0;
   // Ignore the 1 pixel border around the image for the computation.
   for (int j = 1; j < h - 1; ++j) {
     for (int i = 1; i < w - 1; ++i) {
@@ -12756,18 +14104,22 @@ static uint16_t edge_probability(const uint8_t *input, int w, int h,
       int16_t g_y = g.y >> (bd - 8);
       uint16_t magnitude = (uint16_t)sqrt(g_x * g_x + g_y * g_y);
       highest = AOMMAX(highest, magnitude);
+      highest_x = AOMMAX(highest_x, g_x);
+      highest_y = AOMMAX(highest_y, g_y);
     }
   }
-  return highest;
+  EdgeInfo ei = { .magnitude = highest, .x = highest_x, .y = highest_y };
+  return ei;
 }
 
 /* Uses most of the Canny edge detection algorithm to find if there are any
  * edges in the image.
  */
-uint16_t av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
+EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
                          bool high_bd, int bd) {
   if (w < 3 || h < 3) {
-    return 0;
+    EdgeInfo n = { .magnitude = 0, .x = 0, .y = 0 };
+    return n;
   }
   uint8_t *blurred;
   if (high_bd) {
@@ -12780,7 +14132,7 @@ uint16_t av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
   // want a probability of an edge existing in the buffer, which is determined
   // by the strongest edge in it -- we don't need to eliminate the weaker
   // edges. Use Sobel for the edge detection.
-  uint16_t prob = edge_probability(blurred, w, h, high_bd, bd);
+  EdgeInfo prob = edge_probability(blurred, w, h, high_bd, bd);
   if (high_bd) {
     aom_free(CONVERT_TO_SHORTPTR(blurred));
   } else {
diff --git a/libaom/av1/encoder/rdopt.h b/libaom/av1/encoder/rdopt.h
index 5ff2df3..7ba1b18 100644
--- a/libaom/av1/encoder/rdopt.h
+++ b/libaom/av1/encoder/rdopt.h
@@ -123,18 +123,33 @@ void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
                                struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
 
+void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+                                  struct TileDataEnc *tile_data,
+                                  struct macroblock *x, int mi_row, int mi_col,
+                                  struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  int64_t best_rd_so_far);
+
 void av1_rd_pick_inter_mode_sb_seg_skip(
     const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
     struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
     BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
 
+// The best edge strength seen in the block, as well as the best x and y
+// components of edge strength seen.
+typedef struct {
+  uint16_t magnitude;
+  uint16_t x;
+  uint16_t y;
+} EdgeInfo;
+
 /** Returns an integer indicating the strength of the edge.
  * 0 means no edge found, 556 is the strength of a solid black/white edge,
  * and the number may range higher if the signal is even stronger (e.g., on a
  * corner). high_bd is a bool indicating the source should be treated
  * as a 16-bit array. bd is the bit depth.
  */
-uint16_t av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
+EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
                          bool high_bd, int bd);
 
 /** Applies a Gaussian blur with sigma = 1.3. Used by av1_edge_exists and
@@ -151,10 +166,8 @@ typedef struct {
 
 sobel_xy sobel(const uint8_t *input, int stride, int i, int j, bool high_bd);
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
 void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
 void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
-#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/reconinter_enc.c b/libaom/av1/encoder/reconinter_enc.c
index 1100222..4b477ce 100644
--- a/libaom/av1/encoder/reconinter_enc.c
+++ b/libaom/av1/encoder/reconinter_enc.c
@@ -138,27 +138,28 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
         assert(bw < 8 || bh < 8);
         ConvolveParams conv_params = get_conv_params_no_round(
             0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
-        conv_params.use_jnt_comp_avg = 0;
+        conv_params.use_dist_wtd_comp_avg = 0;
         struct buf_2d *const dst_buf = &pd->dst;
         uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
 
         ref = 0;
-        const RefBuffer *ref_buf =
-            &cm->current_frame
-                 .frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+        const RefCntBuffer *ref_buf =
+            get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
+        const struct scale_factors *ref_scale_factors =
+            get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
 
-        pd->pre[ref].buf0 = (plane == 1) ? ref_buf->buf->buf.u_buffer
-                                         : ref_buf->buf->buf.v_buffer;
+        pd->pre[ref].buf0 =
+            (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer;
         pd->pre[ref].buf =
-            pd->pre[ref].buf0 +
-            scaled_buffer_offset(pre_x, pre_y, ref_buf->buf->buf.uv_stride,
-                                 &ref_buf->sf);
-        pd->pre[ref].width = ref_buf->buf->buf.uv_crop_width;
-        pd->pre[ref].height = ref_buf->buf->buf.uv_crop_height;
-        pd->pre[ref].stride = ref_buf->buf->buf.uv_stride;
+            pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+                                                     ref_buf->buf.uv_stride,
+                                                     ref_scale_factors);
+        pd->pre[ref].width = ref_buf->buf.uv_crop_width;
+        pd->pre[ref].height = ref_buf->buf.uv_crop_height;
+        pd->pre[ref].stride = ref_buf->buf.uv_stride;
 
         const struct scale_factors *const sf =
-            is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+            is_intrabc ? &cm->sf_identity : ref_scale_factors;
         struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
 
         const MV mv = this_mbmi->mv[ref].as_mv;
@@ -195,15 +196,15 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
   {
     ConvolveParams conv_params = get_conv_params_no_round(
         0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
-    av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
-                               &conv_params.bck_offset,
-                               &conv_params.use_jnt_comp_avg, is_compound);
+    av1_dist_wtd_comp_weight_assign(
+        cm, mi, 0, &conv_params.fwd_offset, &conv_params.bck_offset,
+        &conv_params.use_dist_wtd_comp_avg, is_compound);
 
     struct buf_2d *const dst_buf = &pd->dst;
     uint8_t *const dst = dst_buf->buf;
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       const struct scale_factors *const sf =
-          is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+          is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
       struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
       const MV mv = mi->mv[ref].as_mv;
 
@@ -236,46 +237,19 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
   }
 }
 
-static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
-                                              MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                                              int mi_row, int mi_col,
-                                              int plane_from, int plane_to) {
-  int plane;
+static void build_inter_predictors_for_plane(const AV1_COMMON *cm,
+                                             MACROBLOCKD *xd, int mi_row,
+                                             int mi_col, const BUFFER_SET *ctx,
+                                             BLOCK_SIZE bsize, int plane_idx) {
+  const struct macroblockd_plane *pd = &xd->plane[plane_idx];
+  if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                           pd->subsampling_y))
+    return;
+
   const int mi_x = mi_col * MI_SIZE;
   const int mi_y = mi_row * MI_SIZE;
-  for (plane = plane_from; plane <= plane_to; ++plane) {
-    const struct macroblockd_plane *pd = &xd->plane[plane];
-    const int bw = pd->width;
-    const int bh = pd->height;
-
-    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                             pd->subsampling_y))
-      continue;
-
-    build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
-  }
-}
-
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize) {
-  av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, 0);
-}
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize) {
-  for (int plane_idx = 1; plane_idx < MAX_MB_PLANE; plane_idx++) {
-    av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize,
-                                   plane_idx);
-  }
-}
-
-void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize, int plane_idx) {
-  build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, plane_idx,
-                                    plane_idx);
+  build_inter_predictors(cm, xd, plane_idx, xd->mi[0], 0, pd->width, pd->height,
+                         mi_x, mi_y);
 
   if (is_interintra_pred(xd->mi[0])) {
     BUFFER_SET default_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } };
@@ -290,13 +264,14 @@ void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
   }
 }
 
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   int mi_row, int mi_col, BUFFER_SET *ctx,
-                                   BLOCK_SIZE bsize) {
-  const int num_planes = av1_num_planes(cm);
-  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
-  if (num_planes > 1)
-    av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                   int mi_row, int mi_col,
+                                   const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+                                   int plane_from, int plane_to) {
+  for (int plane_idx = plane_from; plane_idx <= plane_to; ++plane_idx) {
+    build_inter_predictors_for_plane(cm, xd, mi_row, mi_col, ctx, bsize,
+                                     plane_idx);
+  }
 }
 
 // TODO(sarahparker):
@@ -309,7 +284,7 @@ void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
                                InterpFilters interp_filters,
                                const WarpTypesAllowed *warp_types, int p_col,
                                int p_row, int plane, int ref,
-                               enum mv_precision precision, int x, int y,
+                               mv_precision precision, int x, int y,
                                const MACROBLOCKD *xd, int can_use_previous) {
   const int is_q4 = precision == MV_PRECISION_Q4;
   const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
@@ -452,7 +427,7 @@ void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
   int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
     int len = sizeof(uint16_t);
     dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
     dst_buf1[1] =
@@ -493,7 +468,7 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const MB_MODE_INFO *mi = xd->mi[0];
 
-  const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+  const struct scale_factors *const sf = xd->block_ref_scale_factors[ref];
   struct buf_2d *const pre_buf = &pd->pre[ref];
   uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
   const MV mv = mi->mv[ref].as_mv;
@@ -575,37 +550,41 @@ static void build_wedge_inter_predictor_from_buf(
   uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
   mbmi->interinter_comp.seg_mask = xd->seg_mask;
   const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
+  const int is_hbd = is_cur_buf_hbd(xd);
 
   if (is_compound && is_masked_compound_type(comp_data->type)) {
     if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      if (is_hbd) {
         av1_build_compound_diffwtd_mask_highbd(
             comp_data->seg_mask, comp_data->mask_type,
             CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
             CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
-      else
+      } else {
         av1_build_compound_diffwtd_mask(
             comp_data->seg_mask, comp_data->mask_type, ext_dst0,
             ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
+      }
     }
 
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    if (is_hbd) {
       build_masked_compound_highbd(
           dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
           CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
           mbmi->sb_type, h, w, xd->bd);
-    else
+    } else {
       build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
                             ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
                             h, w);
+    }
   } else {
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    if (is_hbd) {
       aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
                                dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
                                xd->bd);
-    else
+    } else {
       aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
                         0, NULL, 0, w, h);
+    }
   }
 }
 
diff --git a/libaom/av1/encoder/reconinter_enc.h b/libaom/av1/encoder/reconinter_enc.h
index 10d5e8c..5687168 100644
--- a/libaom/av1/encoder/reconinter_enc.h
+++ b/libaom/av1/encoder/reconinter_enc.h
@@ -23,21 +23,10 @@
 extern "C" {
 #endif
 
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize, int plane_idx);
-
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   int mi_row, int mi_col, BUFFER_SET *ctx,
-                                   BLOCK_SIZE bsize);
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                   int mi_row, int mi_col,
+                                   const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+                                   int plane_from, int plane_to);
 
 void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
                                int dst_stride, const MV *src_mv,
@@ -46,7 +35,7 @@ void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
                                InterpFilters interp_filters,
                                const WarpTypesAllowed *warp_types, int p_col,
                                int p_row, int plane, int ref,
-                               enum mv_precision precision, int x, int y,
+                               mv_precision precision, int x, int y,
                                const MACROBLOCKD *xd, int can_use_previous);
 
 // Detect if the block have sub-pixel level motion vectors
diff --git a/libaom/av1/encoder/speed_features.c b/libaom/av1/encoder/speed_features.c
index fd0368e..5dfc585 100644
--- a/libaom/av1/encoder/speed_features.c
+++ b/libaom/av1/encoder/speed_features.c
@@ -17,13 +17,9 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 
-// Setting this to 1 will disable trellis optimization completely.
-// Setting this to 2 will disable trellis optimization within the
-// transform search. Trellis optimization will still be applied
-// in the final encode.
-#define DISABLE_TRELLISQ_SEARCH 0
-
 #define MAX_MESH_SPEED 5  // Max speed setting for mesh motion method
+// Max speed setting for tx domain evaluation
+#define MAX_TX_DOMAIN_EVAL_SPEED 5
 static MESH_PATTERN
     good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
       { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
@@ -50,6 +46,22 @@ static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
 static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100,
                                                             25,  25,  10 };
 
+// Threshold values to be used for pruning the txfm_domain_distortion
+// based on block MSE
+// TODO(any): Experiment the threshold logic based on variance metric
+static unsigned int tx_domain_dist_thresholds[MAX_TX_DOMAIN_EVAL_SPEED + 1] = {
+  UINT_MAX, 162754, 22026, 22026, 22026, 0
+};
+// Threshold values to be used for disabling coeff RD-optimization
+// based on block MSE
+// TODO(any): Experiment the threshold logic based on variance metric
+static unsigned int coeff_opt_dist_thresholds[5] = { UINT_MAX, 162754, 162754,
+                                                     22026, 22026 };
+// scaling values to be used for gating wedge/compound segment based on best
+// approximate rd
+static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 };
+static int comp_type_rd_threshold_div[3] = { 3, 16, 16 };
+
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
 static int frame_is_boosted(const AV1_COMP *cpi) {
@@ -62,7 +74,7 @@ static int frame_is_boosted(const AV1_COMP *cpi) {
 // partly on the screen area that over which they propogate. Propogation is
 // limited by transform block size but the screen area take up by a given block
 // size will be larger for a small image format stretched to full screen.
-static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) {
+static BLOCK_SIZE set_partition_min_limit(const AV1_COMMON *const cm) {
   unsigned int screen_area = (cm->width * cm->height);
 
   // Select block size based on image format size.
@@ -78,24 +90,21 @@ static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) {
   }
 }
 
-// Do we have an internal image edge (e.g. formatting bars).
-static int has_internal_image_edge(const AV1_COMP *cpi) {
-  return (cpi->oxcf.pass == 2) &&
-         ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
-          (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
-}
-
-static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
-                                                       SPEED_FEATURES *sf,
-                                                       int speed) {
-  AV1_COMMON *const cm = &cpi->common;
+static void set_good_speed_feature_framesize_dependent(
+    const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+  const AV1_COMMON *const cm = &cpi->common;
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
 
   if (is_480p_or_larger) {
     sf->use_square_partition_only_threshold = BLOCK_128X128;
+    if (is_720p_or_larger)
+      sf->auto_max_partition_based_on_simple_motion = ADAPT_PRED;
+    else
+      sf->auto_max_partition_based_on_simple_motion = RELAXED_PRED;
   } else {
     sf->use_square_partition_only_threshold = BLOCK_64X64;
+    sf->auto_max_partition_based_on_simple_motion = DIRECT_PRED;
   }
 
   // TODO(huisu@google.com): train models for 720P and above.
@@ -107,6 +116,11 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
     sf->ml_partition_search_breakout_thresh[4] = -1;   // BLOCK_128X128
   }
 
+  if (is_720p_or_larger && speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL_START &&
+      speed < CONFIG_2PASS_PARTITION_SEARCH_LVL_END) {
+    sf->two_pass_partition_search = 1;
+  }
+
   if (speed >= 1) {
     if (is_720p_or_larger) {
       sf->use_square_partition_only_threshold = BLOCK_128X128;
@@ -122,18 +136,28 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
       sf->ml_partition_search_breakout_thresh[2] = 300;  // BLOCK_32X32
       sf->ml_partition_search_breakout_thresh[3] = 300;  // BLOCK_64X64
       sf->ml_partition_search_breakout_thresh[4] = -1;   // BLOCK_128X128
+
+      sf->firstpass_simple_motion_search_early_term = 1;
     }
   }
 
   if (speed >= 2) {
     if (is_720p_or_larger) {
-      sf->disable_split_mask =
-          cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+      sf->use_square_partition_only_threshold = BLOCK_64X64;
+    } else if (is_480p_or_larger) {
+      sf->use_square_partition_only_threshold = BLOCK_32X32;
+    } else {
+      // TODO(chiyotsai@google.com): Setting the threshold to BLOCK_16X16 incurs
+      // a large loss (about 0.584%). Try increasing the threshold on boosted
+      // frame and see if it improves the performance.
+      sf->use_square_partition_only_threshold = BLOCK_32X32;
+    }
+
+    if (is_720p_or_larger) {
       sf->adaptive_pred_interp_filter = 0;
       sf->partition_search_breakout_dist_thr = (1 << 24);
       sf->partition_search_breakout_rate_thr = 120;
     } else {
-      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
       sf->partition_search_breakout_dist_thr = (1 << 22);
       sf->partition_search_breakout_rate_thr = 100;
     }
@@ -142,24 +166,15 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
 
   if (speed >= 3) {
     if (is_720p_or_larger) {
-      sf->disable_split_mask = DISABLE_ALL_SPLIT;
       sf->partition_search_breakout_dist_thr = (1 << 25);
       sf->partition_search_breakout_rate_thr = 200;
     } else {
       sf->max_intra_bsize = BLOCK_32X32;
-      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
       sf->partition_search_breakout_dist_thr = (1 << 23);
       sf->partition_search_breakout_rate_thr = 120;
     }
-  }
-
-  // If this is a two pass clip that fits the criteria for animated or
-  // graphics content then reset disable_split_mask for speeds 2+.
-  // Also if the image edge is internal to the coded area.
-  if ((speed >= 2) && (cpi->oxcf.pass == 2) &&
-      ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
-       (has_internal_image_edge(cpi)))) {
-    sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+    sf->use_first_partition_pass_interintra_stats =
+        sf->two_pass_partition_search;
   }
 
   if (speed >= 4) {
@@ -168,15 +183,15 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
     } else {
       sf->partition_search_breakout_dist_thr = (1 << 24);
     }
-    sf->disable_split_mask = DISABLE_ALL_SPLIT;
   }
 }
 
-static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
-                                                          SPEED_FEATURES *sf,
-                                                          int speed) {
-  AV1_COMMON *const cm = &cpi->common;
+static void set_good_speed_features_framesize_independent(
+    const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+  const AV1_COMMON *const cm = &cpi->common;
   const int boosted = frame_is_boosted(cpi);
+  const int is_boosted_arf2_bwd_type =
+      boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame;
 
   // Speed 0 for all speed features that give neutral coding performance change.
   sf->reduce_inter_modes = 1;
@@ -184,16 +199,22 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
   sf->ml_prune_rect_partition = 1;
   sf->ml_prune_ab_partition = 1;
   sf->ml_prune_4_partition = 1;
+  sf->simple_motion_search_prune_rect = 1;
   sf->adaptive_txb_search_level = 1;
-  sf->use_jnt_comp_flag = JNT_COMP_SKIP_MV_SEARCH;
+  sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
   sf->model_based_prune_tx_search_level = 1;
   sf->model_based_post_interp_filter_breakout = 1;
+  sf->model_based_motion_mode_rd_breakout = 1;
+
+  // TODO(debargha): Test, tweak and turn on either 1 or 2
   sf->inter_mode_rd_model_estimation = 1;
+  sf->inter_mode_rd_model_estimation_adaptive = 0;
+
+  sf->two_loop_comp_search = 0;
   sf->prune_ref_frame_for_rect_partitions =
-      !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame);
-  sf->prune_ref_mode_for_partitions = sf->prune_ref_frame_for_rect_partitions;
+      boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2);
   sf->less_rectangular_check_level = 1;
-  sf->gm_search_type = GM_REDUCED_REF_SEARCH;
+  sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
   sf->gm_disable_recode = 1;
   sf->use_fast_interpolation_filter_search = 1;
   sf->intra_tx_size_search_init_depth_sqr = 1;
@@ -202,28 +223,250 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
   sf->prune_wedge_pred_diff_based = 1;
   sf->disable_wedge_search_var_thresh = 0;
   sf->disable_wedge_search_edge_thresh = 0;
+  sf->prune_motion_mode_level = 1;
+  sf->cb_pred_filter_search = 0;
+  sf->use_nonrd_pick_mode = 0;
+  sf->use_real_time_ref_set = 0;
 
   if (speed >= 1) {
     sf->gm_erroradv_type = GM_ERRORADV_TR_1;
     sf->selective_ref_frame = 2;
 
+    sf->intra_tx_size_search_init_depth_rect = 1;
+    sf->tx_size_search_lgr_block = 1;
+
+    sf->prune_ext_partition_types_search_level = 2;
+    sf->skip_repeat_interpolation_filter_search = 1;
+    sf->tx_type_search.skip_tx_search = 1;
+    sf->tx_type_search.ml_tx_split_thresh = 40;
+    sf->model_based_prune_tx_search_level = 0;
+    sf->adaptive_txb_search_level = 2;
+    sf->use_intra_txb_hash = 1;
+    sf->optimize_b_precheck = 1;
+    sf->dual_sgr_penalty_level = 1;
+    sf->use_accurate_subpel_search = USE_4_TAPS;
+    sf->reuse_inter_intra_mode = 1;
+    sf->prune_comp_search_by_single_result = 1;
+    sf->skip_repeated_newmv = 1;
+    sf->obmc_full_pixel_search_level = 1;
+    // TODO(anyone): Following speed feature will be further explored to
+    // identify the appropriate tradeoff between encoder performance and its
+    // speed.
+    sf->prune_single_motion_modes_by_simple_trans = 1;
+
+    sf->simple_motion_search_split_only = 1;
+    sf->simple_motion_search_early_term_none = 1;
+
+    sf->disable_wedge_search_var_thresh = 0;
+    sf->disable_wedge_search_edge_thresh = 0;
+    sf->disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
+    sf->prune_comp_type_by_comp_avg = 1;
+    sf->prune_motion_mode_level = 2;
+    sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
+    sf->cb_pred_filter_search = 1;
+    sf->use_transform_domain_distortion = boosted ? 0 : 1;
+    sf->perform_coeff_opt = boosted ? 0 : 1;
+    sf->use_inter_txb_hash = 0;
+  }
+
+  if (speed >= 2) {
+    sf->gm_erroradv_type = GM_ERRORADV_TR_2;
+
+    sf->selective_ref_frame = 3;
     sf->inter_tx_size_search_init_depth_rect = 1;
     sf->inter_tx_size_search_init_depth_sqr = 1;
+
+    sf->fast_cdef_search = 1;
+
+    sf->adaptive_rd_thresh = 1;
+    sf->mv.auto_mv_step_size = 1;
+    sf->mv.subpel_iters_per_step = 1;
+    sf->disable_filter_search_var_thresh = 100;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
+
+    sf->partition_search_breakout_rate_thr = 80;
+    sf->allow_partition_search_skip = 1;
+    sf->disable_wedge_search_var_thresh = 100;
+    sf->disable_wedge_search_edge_thresh = 0;
+    sf->disable_interinter_wedge_newmv_search = 1;
+    sf->fast_wedge_sign_estimate = 1;
+    sf->disable_dual_filter = 1;
+    sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+    sf->prune_comp_type_by_comp_avg = 2;
+    // TODO(Sachin): Enable/Enhance this speed feature for speed 2 & 3
+    sf->cb_pred_filter_search = 0;
+    sf->adaptive_interp_filter_search = 1;
+    sf->perform_coeff_opt = boosted ? 0 : 2;
+  }
+
+  if (speed >= 3) {
+    sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
+    sf->less_rectangular_check_level = 2;
+    sf->adaptive_pred_interp_filter = 1;
+    // adaptive_motion_search breaks encoder multi-thread tests.
+    // The values in x->pred_mv[] differ for single and multi-thread cases.
+    // See aomedia:1778.
+    // sf->adaptive_motion_search = 1;
+    sf->recode_loop = ALLOW_RECODE_KFARFGF;
+    sf->use_transform_domain_distortion = boosted ? 1 : 2;
+    sf->use_accurate_subpel_search = USE_2_TAPS;
+    sf->adaptive_rd_thresh = 2;
+    if (cpi->oxcf.enable_smooth_interintra) {
+      sf->disable_smooth_interintra =
+          (boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame)
+              ? 0
+              : 1;
+    }
+    sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
+    sf->gm_search_type = GM_DISABLE_SEARCH;
+    sf->prune_comp_search_by_single_result = 2;
+    sf->prune_motion_mode_level = boosted ? 2 : 3;
+    sf->prune_warp_using_wmtype = 1;
+    // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
+    // it with cpi->sf.disable_wedge_search_var_thresh.
+    sf->disable_wedge_interintra_search = 1;
+    // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2
+    // and clean-up the speed feature
+    sf->perform_best_rd_based_gating_for_chroma = 1;
+    sf->prune_ref_frame_for_rect_partitions =
+        frame_is_intra_only(&cpi->common) ? 0 : (boosted ? 1 : 2);
+    sf->perform_coeff_opt = is_boosted_arf2_bwd_type ? 2 : 3;
+    sf->prune_comp_type_by_model_rd = boosted ? 0 : 1;
+    // TODO(Venkat): Clean-up frame type dependency for
+    // simple_motion_search_split_only in partition search function and set the
+    // speed feature accordingly
+    // TODO(Venkat): Evaluate this speed feature for speed 1 & 2
+    sf->simple_motion_search_split_only =
+        cm->allow_screen_content_tools ? 1 : 2;
+    sf->disable_smooth_intra =
+        !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key != 1);
+  }
+
+  if (speed >= 4) {
+    sf->use_intra_txb_hash = 0;
+    sf->tx_type_search.fast_intra_tx_type_search = 1;
+    sf->disable_loop_restoration_chroma =
+        (boosted || cm->allow_screen_content_tools) ? 0 : 1;
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
+    sf->adaptive_pred_interp_filter = 0;
+    sf->cb_pred_filter_search = 1;
+    sf->adaptive_mode_search = 1;
+    sf->alt_ref_search_fp = 1;
+    sf->skip_sharp_interp_filter_search = 1;
+    sf->perform_coeff_opt = is_boosted_arf2_bwd_type ? 2 : 4;
+    sf->adaptive_txb_search_level = boosted ? 2 : 3;
+  }
+
+  if (speed >= 5) {
+    sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+    sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
+    sf->tx_size_search_method = USE_LARGESTALL;
+    sf->mv.search_method = BIGDIA;
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_search_skip_flags =
+        (cm->current_frame.frame_type == KEY_FRAME)
+            ? 0
+            : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+                  FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
+                  FLAG_EARLY_TERMINATE;
+    sf->disable_filter_search_var_thresh = 200;
+    sf->use_fast_coef_costing = 1;
+    sf->partition_search_breakout_rate_thr = 300;
+    sf->use_transform_domain_distortion = 2;
+  }
+
+  if (speed >= 6) {
+    int i;
+    sf->optimize_coefficients = NO_TRELLIS_OPT;
+    sf->mv.search_method = HEX;
+    sf->disable_filter_search_var_thresh = 500;
+    for (i = 0; i < TX_SIZES; ++i) {
+      sf->intra_y_mode_mask[i] = INTRA_DC;
+      sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
+    }
+    sf->partition_search_breakout_rate_thr = 500;
+    sf->mv.reduce_first_step_size = 1;
+    sf->simple_model_rd_from_var = 1;
+  }
+  if (speed >= 7) {
+    sf->default_max_partition_size = BLOCK_32X32;
+    sf->default_min_partition_size = BLOCK_8X8;
+    sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+    sf->frame_parameter_update = 0;
+    sf->mv.search_method = FAST_HEX;
+    sf->partition_search_type = REFERENCE_PARTITION;
+    sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+    // TODO(any): evaluate adaptive_mode_search=1 for speed 7 & 8
+    sf->adaptive_mode_search = 2;
+  }
+  if (speed >= 8) {
+    sf->mv.search_method = FAST_DIAMOND;
+    sf->mv.subpel_force_stop = HALF_PEL;
+    sf->lpf_pick = LPF_PICK_FROM_Q;
+  }
+}
+
+// TODO(kyslov): now this is very similar to
+// set_good_speed_features_framesize_independent
+//               except it sets non-rd flag on speed8. This function will likely
+//               be modified in the future with RT-specific speed features
+static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
+                                                        SPEED_FEATURES *sf,
+                                                        int speed) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int boosted = frame_is_boosted(cpi);
+
+  // Speed 0 for all speed features that give neutral coding performance change.
+  sf->reduce_inter_modes = 1;
+  sf->prune_ext_partition_types_search_level = 1;
+  sf->ml_prune_rect_partition = 1;
+  sf->ml_prune_ab_partition = 1;
+  sf->ml_prune_4_partition = 1;
+  sf->adaptive_txb_search_level = 1;
+  sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
+  sf->model_based_prune_tx_search_level = 1;
+  sf->model_based_post_interp_filter_breakout = 1;
+  sf->model_based_motion_mode_rd_breakout = 1;
+
+  // TODO(debargha): Test, tweak and turn on either 1 or 2
+  sf->inter_mode_rd_model_estimation = 0;
+  sf->inter_mode_rd_model_estimation_adaptive = 0;
+  sf->two_loop_comp_search = 0;
+
+  sf->prune_ref_frame_for_rect_partitions = !boosted;
+  sf->less_rectangular_check_level = 1;
+  sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
+  sf->gm_disable_recode = 1;
+  sf->use_fast_interpolation_filter_search = 1;
+  sf->intra_tx_size_search_init_depth_sqr = 1;
+  sf->intra_angle_estimation = 1;
+  sf->selective_ref_frame = 1;
+  sf->prune_wedge_pred_diff_based = 1;
+  sf->disable_wedge_search_var_thresh = 0;
+  sf->disable_wedge_search_edge_thresh = 0;
+  sf->prune_motion_mode_level = 1;
+  sf->cb_pred_filter_search = 0;
+  sf->use_nonrd_pick_mode = 0;
+  sf->use_real_time_ref_set = 0;
+
+  if (speed >= 1) {
+    sf->gm_erroradv_type = GM_ERRORADV_TR_1;
+    sf->selective_ref_frame = 2;
+
     sf->intra_tx_size_search_init_depth_rect = 1;
     sf->tx_size_search_lgr_block = 1;
-    if (speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL) {
-      sf->two_pass_partition_search = 1;
-      sf->mode_pruning_based_on_two_pass_partition_search = 1;
-    }
     sf->prune_ext_partition_types_search_level = 2;
     sf->skip_repeat_interpolation_filter_search = 1;
     sf->tx_type_search.skip_tx_search = 1;
     sf->tx_type_search.ml_tx_split_thresh = 40;
     sf->model_based_prune_tx_search_level = 0;
-    sf->model_based_post_interp_filter_breakout = 0;
-    // TODO(angiebird): Re-evaluate the impact of inter_mode_rd_model_estimation
-    // on speed 1
-    sf->inter_mode_rd_model_estimation = 0;
     sf->adaptive_txb_search_level = 2;
     sf->use_intra_txb_hash = 1;
     sf->optimize_b_precheck = 1;
@@ -238,15 +481,23 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     // speed.
     sf->prune_single_motion_modes_by_simple_trans = 1;
 
-    sf->full_pixel_motion_search_based_split = 1;
+    sf->simple_motion_search_prune_rect = 1;
+
     sf->disable_wedge_search_var_thresh = 0;
     sf->disable_wedge_search_edge_thresh = 0;
+    sf->prune_comp_type_by_comp_avg = 1;
+    sf->prune_motion_mode_level = 2;
+    sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
+    sf->cb_pred_filter_search = 1;
+    sf->use_transform_domain_distortion = boosted ? 0 : 1;
   }
 
   if (speed >= 2) {
     sf->gm_erroradv_type = GM_ERRORADV_TR_2;
 
     sf->selective_ref_frame = 3;
+    sf->inter_tx_size_search_init_depth_rect = 1;
+    sf->inter_tx_size_search_init_depth_sqr = 1;
     sf->fast_cdef_search = 1;
 
     sf->adaptive_rd_thresh = 1;
@@ -256,18 +507,19 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
 
     sf->partition_search_breakout_rate_thr = 80;
-    // Note: This speed feature is disable as it seems to be worse in
-    // compression/quality and is also slower.
-    // sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->allow_partition_search_skip = 1;
     sf->disable_wedge_search_var_thresh = 100;
     sf->disable_wedge_search_edge_thresh = 0;
     sf->fast_wedge_sign_estimate = 1;
     sf->disable_dual_filter = 1;
-    sf->use_jnt_comp_flag = JNT_COMP_DISABLED;
+    sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+    sf->prune_comp_type_by_comp_avg = 2;
+    sf->cb_pred_filter_search = 0;
+    sf->adaptive_interp_filter_search = 1;
   }
 
   if (speed >= 3) {
+    sf->selective_ref_frame = 4;
     sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
     sf->less_rectangular_check_level = 2;
     sf->adaptive_pred_interp_filter = 1;
@@ -282,22 +534,23 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
     sf->gm_search_type = GM_DISABLE_SEARCH;
     sf->prune_comp_search_by_single_result = 2;
+    sf->prune_motion_mode_level = boosted ? 2 : 3;
+    sf->prune_warp_using_wmtype = 1;
+    // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
+    // it with cpi->sf.disable_wedge_search_var_thresh.
+    sf->disable_wedge_interintra_search = 1;
   }
 
   if (speed >= 4) {
     sf->use_intra_txb_hash = 0;
-    sf->use_inter_txb_hash = 0;
     sf->use_mb_rd_hash = 0;
     sf->tx_type_search.fast_intra_tx_type_search = 1;
     sf->tx_type_search.fast_inter_tx_type_search = 1;
-    sf->use_square_partition_only_threshold =
-        boosted ? BLOCK_128X128 : BLOCK_4X4;
     sf->tx_size_search_method =
         frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
     sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
     sf->adaptive_pred_interp_filter = 0;
     sf->adaptive_mode_search = 1;
-    sf->cb_partition_search = !boosted;
     sf->alt_ref_search_fp = 1;
     sf->skip_sharp_interp_filter_search = 1;
   }
@@ -310,7 +563,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
     sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
     sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
-    sf->use_square_partition_only_threshold = BLOCK_4X4;
     sf->tx_size_search_method = USE_LARGESTALL;
     sf->mv.search_method = BIGDIA;
     sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
@@ -352,30 +604,25 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
   }
   if (speed >= 8) {
     sf->mv.search_method = FAST_DIAMOND;
-    sf->mv.subpel_force_stop = 2;
-    sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
+    sf->lpf_pick = LPF_PICK_FROM_Q;
+    sf->default_max_partition_size = BLOCK_128X128;
+    sf->default_min_partition_size = BLOCK_8X8;
+    sf->partition_search_type = VAR_BASED_PARTITION;
+    sf->use_real_time_ref_set = 1;
+    // Can't use LARGEST TX mode with pre-calculated partition
+    // and disabled TX64
+    if (!cpi->oxcf.enable_tx64) sf->tx_size_search_method = USE_FAST_RD;
+    sf->use_nonrd_pick_mode = 1;
+    sf->inter_mode_rd_model_estimation = 2;
   }
 }
 
-void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
+void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
   SPEED_FEATURES *const sf = &cpi->sf;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  RD_OPT *const rd = &cpi->rd;
-  int i;
 
   if (oxcf->mode == GOOD) {
-    set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
-  }
-
-  if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
-    sf->adaptive_pred_interp_filter = 0;
-  }
-
-  // Check for masked out split cases.
-  for (i = 0; i < MAX_REFS; ++i) {
-    if (sf->disable_split_mask & (1 << i)) {
-      rd->thresh_mult_sub8x8[i] = INT_MAX;
-    }
+    set_good_speed_feature_framesize_dependent(cpi, sf, speed);
   }
 
   // This is only used in motion vector unit test.
@@ -385,7 +632,7 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
     cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
 }
 
-void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
+void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
   AV1_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCK *const x = &cpi->td.mb;
@@ -398,25 +645,33 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->recode_loop = ALLOW_RECODE;
   sf->mv.subpel_search_method = SUBPEL_TREE;
   sf->mv.subpel_iters_per_step = 2;
-  sf->mv.subpel_force_stop = 0;
-#if DISABLE_TRELLISQ_SEARCH == 2
-  sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
-                                  ? FINAL_PASS_TRELLIS_OPT
-                                  : NO_TRELLIS_OPT;
-#elif DISABLE_TRELLISQ_SEARCH == 1
-  sf->optimize_coefficients = NO_TRELLIS_OPT;
-#else
-  if (is_lossless_requested(&cpi->oxcf))
+  sf->mv.subpel_force_stop = EIGHTH_PEL;
+  if (cpi->oxcf.disable_trellis_quant == 3) {
+    sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+                                    ? NO_ESTIMATE_YRD_TRELLIS_OPT
+                                    : NO_TRELLIS_OPT;
+  } else if (cpi->oxcf.disable_trellis_quant == 2) {
+    sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+                                    ? FINAL_PASS_TRELLIS_OPT
+                                    : NO_TRELLIS_OPT;
+  } else if (cpi->oxcf.disable_trellis_quant == 0) {
+    if (is_lossless_requested(&cpi->oxcf))
+      sf->optimize_coefficients = NO_TRELLIS_OPT;
+    else
+      sf->optimize_coefficients = FULL_TRELLIS_OPT;
+  } else if (cpi->oxcf.disable_trellis_quant == 1) {
     sf->optimize_coefficients = NO_TRELLIS_OPT;
-  else
-    sf->optimize_coefficients = FULL_TRELLIS_OPT;
-#endif  // DISABLE_TRELLISQ_SEARCH
+  } else {
+    assert(0 && "Invalid disable_trellis_quant value");
+  }
   sf->gm_erroradv_type = GM_ERRORADV_TR_0;
   sf->mv.reduce_first_step_size = 0;
   sf->mv.auto_mv_step_size = 0;
   sf->comp_inter_joint_search_thresh = BLOCK_4X4;
   sf->adaptive_rd_thresh = 0;
-  sf->tx_size_search_method = USE_FULL_RD;
+  // TODO(sarahparker) Pair this with a speed setting once experiments are done
+  sf->trellis_eob_fast = 0;
+  sf->tx_size_search_method = cpi->oxcf.tx_size_search_method;
   sf->inter_tx_size_search_init_depth_sqr = 0;
   sf->inter_tx_size_search_init_depth_rect = 0;
   sf->intra_tx_size_search_init_depth_rect = 0;
@@ -424,12 +679,12 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->tx_size_search_lgr_block = 0;
   sf->model_based_prune_tx_search_level = 0;
   sf->model_based_post_interp_filter_breakout = 0;
+  sf->model_based_motion_mode_rd_breakout = 0;
   sf->reduce_inter_modes = 0;
   sf->selective_ref_gm = 1;
   sf->adaptive_motion_search = 0;
   sf->adaptive_pred_interp_filter = 0;
   sf->adaptive_mode_search = 0;
-  sf->cb_partition_search = 0;
   sf->alt_ref_search_fp = 0;
   sf->partition_search_type = SEARCH_PARTITION;
   sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE;
@@ -442,19 +697,20 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->less_rectangular_check_level = 0;
   sf->use_square_partition_only_threshold = BLOCK_128X128;
   sf->prune_ref_frame_for_rect_partitions = 0;
-  sf->prune_ref_mode_for_partitions = 0;
-  sf->auto_min_max_partition_size = NOT_IN_USE;
+  sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+  sf->auto_min_partition_based_on_simple_motion = 0;
   sf->rd_auto_partition_min_limit = BLOCK_4X4;
   sf->default_max_partition_size = BLOCK_LARGEST;
   sf->default_min_partition_size = BLOCK_4X4;
   sf->adjust_partitioning_from_last_frame = 0;
-  sf->disable_split_mask = 0;
   sf->mode_search_skip_flags = 0;
   sf->disable_filter_search_var_thresh = 0;
   sf->allow_partition_search_skip = 0;
   sf->use_accurate_subpel_search = USE_8_TAPS;
   sf->disable_wedge_search_edge_thresh = 0;
+  sf->use_first_partition_pass_interintra_stats = 0;
   sf->disable_wedge_search_var_thresh = 0;
+  sf->disable_loop_restoration_chroma = 0;
   sf->fast_wedge_sign_estimate = 0;
   sf->prune_wedge_pred_diff_based = 0;
   sf->drop_ref = 0;
@@ -462,17 +718,19 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->txb_split_cap = 1;
   sf->adaptive_txb_search_level = 0;
   sf->two_pass_partition_search = 0;
-  sf->mode_pruning_based_on_two_pass_partition_search = 0;
+  sf->firstpass_simple_motion_search_early_term = 0;
   sf->use_intra_txb_hash = 0;
   sf->use_inter_txb_hash = 1;
   sf->use_mb_rd_hash = 1;
   sf->optimize_b_precheck = 0;
-  sf->jnt_comp_fast_tx_search = 0;
-  sf->use_jnt_comp_flag = JNT_COMP_ENABLED;
+  sf->two_loop_comp_search = 1;
+  sf->second_loop_comp_fast_tx_search = 0;
+  sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED;
   sf->reuse_inter_intra_mode = 0;
   sf->intra_angle_estimation = 0;
   sf->skip_obmc_in_uniform_mv_field = 0;
   sf->skip_wm_in_uniform_mv_field = 0;
+  sf->adaptive_interp_filter_search = 0;
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
@@ -497,7 +755,9 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   for (i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
     sf->ml_partition_search_breakout_thresh[i] = -1;  // -1 means not enabled.
   }
-  sf->full_pixel_motion_search_based_split = 0;
+  sf->simple_motion_search_split_only = 0;
+  sf->simple_motion_search_prune_rect = 0;
+  sf->simple_motion_search_early_term_none = 0;
 
   // Set this at the appropriate speed levels
   sf->use_transform_domain_distortion = 0;
@@ -514,12 +774,29 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   // Set decoder side speed feature to use less dual sgr modes
   sf->dual_sgr_penalty_level = 0;
 
+  // TODO(angiebird, debargha): Re-evaluate the impact of
+  // inter_mode_rd_model_estimation in conjunction with
+  // model_based_motion_mode_rd_breakout
   sf->inter_mode_rd_model_estimation = 0;
+  sf->inter_mode_rd_model_estimation_adaptive = 0;
+
   sf->obmc_full_pixel_search_level = 0;
   sf->skip_sharp_interp_filter_search = 0;
+  sf->prune_comp_type_by_comp_avg = 0;
+  sf->disable_interinter_wedge_newmv_search = 0;
+  sf->disable_smooth_interintra = 0;
+  sf->prune_motion_mode_level = 0;
+  sf->prune_warp_using_wmtype = 0;
+  sf->disable_wedge_interintra_search = 0;
+  sf->perform_coeff_opt = 0;
+  sf->prune_comp_type_by_model_rd = 0;
+  sf->disable_smooth_intra = 0;
+  sf->perform_best_rd_based_gating_for_chroma = 0;
 
   if (oxcf->mode == GOOD)
-    set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed);
+    set_good_speed_features_framesize_independent(cpi, sf, speed);
+  else if (oxcf->mode == REALTIME)
+    set_rt_speed_features_framesize_independent(cpi, sf, speed);
 
   if (!cpi->seq_params_locked) {
     cpi->common.seq_params.enable_dual_filter &= !sf->disable_dual_filter;
@@ -534,39 +811,44 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   cpi->diamond_search_sad = av1_diamond_search_sad;
 
   sf->allow_exhaustive_searches = 1;
-  int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+
+  const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED);
   if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
     sf->exhaustive_searches_thresh = (1 << 24);
   else
     sf->exhaustive_searches_thresh = (1 << 25);
-  sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
-  if (speed > 0)
+  sf->max_exaustive_pct = good_quality_max_mesh_pct[mesh_speed];
+  if (mesh_speed > 0)
     sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
 
   for (i = 0; i < MAX_MESH_STEP; ++i) {
-    sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range;
+    sf->mesh_patterns[i].range =
+        good_quality_mesh_patterns[mesh_speed][i].range;
     sf->mesh_patterns[i].interval =
-        good_quality_mesh_patterns[speed][i].interval;
+        good_quality_mesh_patterns[mesh_speed][i].interval;
   }
   if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) &&
       (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
        cpi->oxcf.content == AOM_CONTENT_SCREEN)) {
     for (i = 0; i < MAX_MESH_STEP; ++i) {
-      sf->mesh_patterns[i].range = intrabc_mesh_patterns[speed][i].range;
-      sf->mesh_patterns[i].interval = intrabc_mesh_patterns[speed][i].interval;
+      sf->mesh_patterns[i].range = intrabc_mesh_patterns[mesh_speed][i].range;
+      sf->mesh_patterns[i].interval =
+          intrabc_mesh_patterns[mesh_speed][i].interval;
     }
-    sf->max_exaustive_pct = intrabc_max_mesh_pct[speed];
+    sf->max_exaustive_pct = intrabc_max_mesh_pct[mesh_speed];
   }
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
   if (oxcf->pass == 1) sf->optimize_coefficients = NO_TRELLIS_OPT;
 
-  // No recode for 1 pass.
+  // No recode or trellis for 1 pass.
   if (oxcf->pass == 0) {
     sf->recode_loop = DISALLOW_RECODE;
     sf->optimize_coefficients = NO_TRELLIS_OPT;
   }
+  // FIXME: trellis not very efficient for quantization matrices
+  if (oxcf->using_qm) sf->optimize_coefficients = NO_TRELLIS_OPT;
 
   if (sf->mv.subpel_search_method == SUBPEL_TREE) {
     cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree;
@@ -578,12 +860,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
     cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore;
   }
 
-  cpi->optimize_speed_feature =
-      oxcf->pass != 1 ? sf->optimize_coefficients : NO_TRELLIS_OPT;
-  // FIXME: trellis not very efficient for quantisation matrices
-  if (cm->using_qmatrix) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
-  if (oxcf->disable_trellis_quant) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
-
   x->min_partition_size = sf->default_min_partition_size;
   x->max_partition_size = sf->default_max_partition_size;
 
@@ -592,6 +868,17 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
     cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
   else if (cpi->oxcf.motion_vector_unit_test == 2)
     cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+  cpi->max_comp_type_rd_threshold_mul =
+      comp_type_rd_threshold_mul[sf->prune_comp_type_by_comp_avg];
+  cpi->max_comp_type_rd_threshold_div =
+      comp_type_rd_threshold_div[sf->prune_comp_type_by_comp_avg];
+  const int tx_domain_speed = AOMMIN(speed, MAX_TX_DOMAIN_EVAL_SPEED);
+  cpi->tx_domain_dist_threshold = tx_domain_dist_thresholds[tx_domain_speed];
+
+  // assert ensures that coeff_opt_dist_thresholds is accessed correctly
+  assert(cpi->sf.perform_coeff_opt >= 0 && cpi->sf.perform_coeff_opt < 5);
+  cpi->coeff_opt_dist_threshold =
+      coeff_opt_dist_thresholds[cpi->sf.perform_coeff_opt];
 
 #if CONFIG_DIST_8X8
   if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
@@ -600,6 +887,9 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
 #endif  // CONFIG_DIST_8X8
   if (cpi->oxcf.row_mt == 1 && (cpi->oxcf.max_threads > 1)) {
     sf->adaptive_rd_thresh = 0;
-    sf->inter_mode_rd_model_estimation = 0;
+    if (sf->inter_mode_rd_model_estimation == 1) {
+      sf->inter_mode_rd_model_estimation = 0;
+      sf->inter_mode_rd_model_estimation_adaptive = 0;
+    }
   }
 }
diff --git a/libaom/av1/encoder/speed_features.h b/libaom/av1/encoder/speed_features.h
index f71dcbf..a321192 100644
--- a/libaom/av1/encoder/speed_features.h
+++ b/libaom/av1/encoder/speed_features.h
@@ -73,7 +73,7 @@ enum {
                               (1 << THR_ALTR) | (1 << THR_GOLD)
 };
 
-typedef enum {
+enum {
   TXFM_CODING_SF = 1,
   INTER_PRED_SF = 2,
   INTRA_PRED_SF = 4,
@@ -82,9 +82,9 @@ typedef enum {
   RD_SKIP_SF = 32,
   RESERVE_2_SF = 64,
   RESERVE_3_SF = 128,
-} DEV_SPEED_FEATURES;
+} UENUM1BYTE(DEV_SPEED_FEATURES);
 
-typedef enum {
+enum {
   DIAMOND = 0,
   NSTEP = 1,
   HEX = 2,
@@ -92,9 +92,9 @@ typedef enum {
   SQUARE = 4,
   FAST_HEX = 5,
   FAST_DIAMOND = 6
-} SEARCH_METHODS;
+} UENUM1BYTE(SEARCH_METHODS);
 
-typedef enum {
+enum {
   // No recode.
   DISALLOW_RECODE = 0,
   // Allow recode for KF and exceeding maximum frame bandwidth.
@@ -103,28 +103,23 @@ typedef enum {
   ALLOW_RECODE_KFARFGF = 2,
   // Allow recode for all frames based on bitrate constraints.
   ALLOW_RECODE = 3,
-} RECODE_LOOP_TYPE;
+} UENUM1BYTE(RECODE_LOOP_TYPE);
 
-typedef enum {
+enum {
   SUBPEL_TREE = 0,
   SUBPEL_TREE_PRUNED = 1,           // Prunes 1/2-pel searches
   SUBPEL_TREE_PRUNED_MORE = 2,      // Prunes 1/2-pel searches more aggressively
   SUBPEL_TREE_PRUNED_EVENMORE = 3,  // Prunes 1/2- and 1/4-pel searches
   // Other methods to come
-} SUBPEL_SEARCH_METHODS;
+} UENUM1BYTE(SUBPEL_SEARCH_METHODS);
 
-typedef enum {
+enum {
   USE_FULL_RD = 0,
   USE_FAST_RD,
   USE_LARGESTALL,
-} TX_SIZE_SEARCH_METHOD;
-
-typedef enum {
-  NOT_IN_USE = 0,
-  RELAXED_NEIGHBORING_MIN_MAX = 1
-} AUTO_MIN_MAX_MODE;
+} UENUM1BYTE(TX_SIZE_SEARCH_METHOD);
 
-typedef enum {
+enum {
   // Try the full image with different values.
   LPF_PICK_FROM_FULL_IMAGE,
   // Try a small portion of the image with different values.
@@ -133,9 +128,9 @@ typedef enum {
   LPF_PICK_FROM_Q,
   // Pick 0 to disable LPF if LPF was enabled last frame
   LPF_PICK_MINIMAL_LPF
-} LPF_PICK_METHOD;
+} UENUM1BYTE(LPF_PICK_METHOD);
 
-typedef enum {
+enum {
   // Terminate search early based on distortion so far compared to
   // qp step, distortion in the neighborhood of the frame, etc.
   FLAG_EARLY_TERMINATE = 1 << 0,
@@ -152,9 +147,9 @@ typedef enum {
 
   // Skips intra modes other than DC_PRED if the source variance is small
   FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
-} MODE_SEARCH_SKIP_LOGIC;
+} UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC);
 
-typedef enum {
+enum {
   NO_PRUNE = 0,
   // eliminates one tx type in vertical and horizontal direction
   PRUNE_ONE = 1,
@@ -165,7 +160,7 @@ typedef enum {
   PRUNE_2D_ACCURATE = 3,
   // similar, but applies much more aggressive pruning to get better speed-up
   PRUNE_2D_FAST = 4,
-} TX_TYPE_PRUNE_MODE;
+} UENUM1BYTE(TX_TYPE_PRUNE_MODE);
 
 typedef struct {
   TX_TYPE_PRUNE_MODE prune_mode;
@@ -184,15 +179,31 @@ typedef struct {
   int skip_tx_search;
 } TX_TYPE_SEARCH;
 
-typedef enum {
+enum {
   // Search partitions using RD criterion
   SEARCH_PARTITION,
 
   // Always use a fixed size partition
   FIXED_PARTITION,
 
-  REFERENCE_PARTITION
-} PARTITION_SEARCH_TYPE;
+  REFERENCE_PARTITION,
+
+  VAR_BASED_PARTITION
+} UENUM1BYTE(PARTITION_SEARCH_TYPE);
+
+enum {
+  EIGHTH_PEL,
+  QUARTER_PEL,
+  HALF_PEL,
+  FULL_PEL
+} UENUM1BYTE(SUBPEL_FORCE_STOP);
+
+enum {
+  NOT_IN_USE,
+  DIRECT_PRED,
+  RELAXED_PRED,
+  ADAPT_PRED
+} UENUM1BYTE(MAX_PART_PRED_MODE);
 
 typedef struct MV_SPEED_FEATURES {
   // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
@@ -215,8 +226,8 @@ typedef struct MV_SPEED_FEATURES {
   // Maximum number of steps in logarithmic subpel search before giving up.
   int subpel_iters_per_step;
 
-  // Control when to stop subpel search
-  int subpel_force_stop;
+  // When to stop subpel search.
+  SUBPEL_FORCE_STOP subpel_force_stop;
 } MV_SPEED_FEATURES;
 
 #define MAX_MESH_STEP 4
@@ -226,35 +237,43 @@ typedef struct MESH_PATTERN {
   int interval;
 } MESH_PATTERN;
 
-typedef enum {
+enum {
   GM_FULL_SEARCH,
-  GM_REDUCED_REF_SEARCH,
+  GM_REDUCED_REF_SEARCH_SKIP_L2_L3,
+  GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2,
   GM_DISABLE_SEARCH
-} GM_SEARCH_TYPE;
+} UENUM1BYTE(GM_SEARCH_TYPE);
 
-typedef enum {
+enum {
   GM_ERRORADV_TR_0,
   GM_ERRORADV_TR_1,
   GM_ERRORADV_TR_2,
   GM_ERRORADV_TR_TYPES,
-} GM_ERRORADV_TYPE;
+} UENUM1BYTE(GM_ERRORADV_TYPE);
 
-typedef enum {
-  NO_TRELLIS_OPT,         // No trellis optimization
-  FULL_TRELLIS_OPT,       // Trellis optimization in all stages
-  FINAL_PASS_TRELLIS_OPT  // Trellis optimization in only the final encode pass
-} TRELLIS_OPT_TYPE;
+enum {
+  NO_TRELLIS_OPT,          // No trellis optimization
+  FULL_TRELLIS_OPT,        // Trellis optimization in all stages
+  FINAL_PASS_TRELLIS_OPT,  // Trellis optimization in only the final encode pass
+  NO_ESTIMATE_YRD_TRELLIS_OPT  // Disable trellis in estimate_yrd_for_sb
+} UENUM1BYTE(TRELLIS_OPT_TYPE);
 
-typedef enum {
+enum {
   FULL_TXFM_RD,
   LOW_TXFM_RD,
-} TXFM_RD_MODEL;
+} UENUM1BYTE(TXFM_RD_MODEL);
+
+enum {
+  DIST_WTD_COMP_ENABLED,
+  DIST_WTD_COMP_SKIP_MV_SEARCH,
+  DIST_WTD_COMP_DISABLED,
+} UENUM1BYTE(DIST_WTD_COMP_FLAG);
 
 typedef enum {
-  JNT_COMP_ENABLED,
-  JNT_COMP_SKIP_MV_SEARCH,
-  JNT_COMP_DISABLED,
-} JNT_COMP_FLAG;
+  FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP_REGULAR,
+  FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
+  FLAG_SKIP_EIGHTTAP_SHARP = 1 << MULTITAP_SHARP,
+} INTERP_FILTER_MASK;
 
 typedef struct SPEED_FEATURES {
   MV_SPEED_FEATURES mv;
@@ -335,11 +354,16 @@ typedef struct SPEED_FEATURES {
   // 1: use model based rd breakout
   int model_based_post_interp_filter_breakout;
 
+  // Model based breakout in motion_mode_rd
+  // 0: no breakout
+  // 1: use model based rd breakout
+  int model_based_motion_mode_rd_breakout;
+
   // Used if partition_search_type = FIXED_SIZE_PARTITION
   BLOCK_SIZE always_this_block_size;
 
   // Drop less likely to be picked reference frames in the RD search.
-  // Has four levels for now: 0, 1, 2 and 3, where higher levels prune more
+  // Has five levels for now: 0, 1, 2, 3 and 4, where higher levels prune more
   // aggressively than lower ones. (0 means no pruning).
   int selective_ref_frame;
 
@@ -351,6 +375,10 @@ typedef struct SPEED_FEATURES {
   // Use a ML model to prune horz and vert partitions
   int ml_prune_rect_partition;
 
+  // Disable/Enable interintra motion mode based on stats collected during
+  // first_partition_search_pass
+  int use_first_partition_pass_interintra_stats;
+
   // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
   int ml_prune_ab_partition;
 
@@ -359,12 +387,13 @@ typedef struct SPEED_FEATURES {
 
   int fast_cdef_search;
 
-  // 2-pass coding block partition search
+  // 2-pass coding block partition search, and also use the mode decisions made
+  // in the initial partition search to prune mode candidates, e.g. ref frames.
   int two_pass_partition_search;
 
-  // Use the mode decisions made in the initial partition search to prune mode
-  // candidates, e.g. ref frames.
-  int mode_pruning_based_on_two_pass_partition_search;
+  // Terminate early in firstpass of two_pass partition search for faster
+  // firstpass.
+  int firstpass_simple_motion_search_early_term;
 
   // Skip rectangular partition test when partition type none gives better
   // rd than partition type split. Can take values 0 - 2, 0 referring to no
@@ -375,14 +404,17 @@ typedef struct SPEED_FEATURES {
   BLOCK_SIZE use_square_partition_only_threshold;
 
   // Prune reference frames for rectangular partitions.
+  // 0 implies no pruning
+  // 1 implies prune for extended partition
+  // 2 implies prune horiz, vert and extended partition
   int prune_ref_frame_for_rect_partitions;
 
-  // Prune ref/mode choices for partitions.
-  int prune_ref_mode_for_partitions;
+  // Sets min and max square partition levels for this superblock based on
+  // motion vector and prediction error distribution produced from 16x16
+  // simple motion search
+  MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion;
+  int auto_min_partition_based_on_simple_motion;
 
-  // Sets min and max partition sizes for this superblock based on the
-  // same superblock in last encoded frame, and the left and above neighbor.
-  AUTO_MIN_MAX_MODE auto_min_max_partition_size;
   // Ensures the rd based auto partition search will always
   // go down at least to the specified level.
   BLOCK_SIZE rd_auto_partition_min_limit;
@@ -396,11 +428,6 @@ typedef struct SPEED_FEATURES {
   // frame's partitioning. Only used if use_lastframe_partitioning is set.
   int adjust_partitioning_from_last_frame;
 
-  // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
-  // it always, to allow it for only Last frame and Intra, disable it for all
-  // inter modes or to enable it always.
-  int disable_split_mask;
-
   // TODO(jingning): combine the related motion search speed features
   // This allows us to use motion search at other sizes as a starting
   // point for this motion search and limits the search range around it.
@@ -427,8 +454,6 @@ typedef struct SPEED_FEATURES {
   // Adaptive prediction mode search
   int adaptive_mode_search;
 
-  int cb_partition_search;
-
   int alt_ref_search_fp;
 
   // Implements various heuristics to skip searching modes
@@ -541,18 +566,26 @@ typedef struct SPEED_FEATURES {
   // Calculate RD cost before doing optimize_b, and skip if the cost is large.
   int optimize_b_precheck;
 
-  // Use model rd instead of transform search in jnt_comp
-  int jnt_comp_fast_tx_search;
+  // Use two-loop compound search
+  int two_loop_comp_search;
+
+  // Use model rd instead of transform search in second loop of compound search
+  int second_loop_comp_fast_tx_search;
 
   // Decide when and how to use joint_comp.
-  JNT_COMP_FLAG use_jnt_comp_flag;
+  DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag;
 
   // Decoder side speed feature to add penalty for use of dual-sgr filters.
   // Takes values 0 - 10, 0 indicating no penalty and each additional level
   // adding a penalty of 1%
   int dual_sgr_penalty_level;
 
-  // Dynamically estimate final rd from prediction error and mode cost
+  // 2-pass inter mode model estimation where the preliminary pass skips
+  // transform search and uses a model to estimate rd, while the final pass
+  // computes the full transform search. Two types of models are supported:
+  // 0: not used
+  // 1: used with online dynamic rd model
+  // 2: used with static rd model
   int inter_mode_rd_model_estimation;
 
   // Skip some ref frames in compound motion search by single motion search
@@ -581,24 +614,95 @@ typedef struct SPEED_FEATURES {
   // Prune intra mode candidates based on source block gradient stats.
   int intra_angle_estimation;
 
-  // Performs full pixel motion search before none_partition to decide if we
-  // want to split directly without trying other partition types.
-  int full_pixel_motion_search_based_split;
-
   // Skip obmc or warped motion mode when neighborhood motion field is
   // identical
   int skip_obmc_in_uniform_mv_field;
   int skip_wm_in_uniform_mv_field;
 
+  // Enable/disable ME for interinter wedge search.
+  int disable_interinter_wedge_newmv_search;
+
+  // Enable/disable smooth inter-intra mode
+  int disable_smooth_interintra;
+
   // skip sharp_filter evaluation based on regular and smooth filter rd for
   // dual_filter=0 case
   int skip_sharp_interp_filter_search;
+
+  // prune wedge and compound segment approximate rd evaluation based on
+  // compound average rd/ref_best_rd
+  int prune_comp_type_by_comp_avg;
+
+  // Prune/gate motion mode evaluation based on token based rd
+  // during transform search for inter blocks
+  // Values are 0 (not used) , 1 - 3 with progressively increasing
+  // aggressiveness
+  int prune_motion_mode_level;
+
+  // Gate warp evaluation for motions of type IDENTITY,
+  // TRANSLATION and AFFINE(based on number of warp neighbors)
+  int prune_warp_using_wmtype;
+
+  // Perform simple_motion_search on each possible subblock and use it to prune
+  // PARTITION_HORZ and PARTITION_VERT.
+  int simple_motion_search_prune_rect;
+
+  // Perform simple motion search before none_partition to decide if we
+  // want to split directly without trying other partition types.
+  int simple_motion_search_split_only;
+
+  // Use features from simple_motion_search to terminate prediction block
+  // partition after PARTITION_NONE
+  int simple_motion_search_early_term_none;
+
+  int cb_pred_filter_search;
+
+  // adaptive interp_filter search to allow skip of certain filter types.
+  int adaptive_interp_filter_search;
+
+  // mask for skip evaluation of certain interp_filter type.
+  INTERP_FILTER_MASK interp_filter_search_mask;
+
+  // Flag used to control the ref_best_rd based gating for chroma
+  int perform_best_rd_based_gating_for_chroma;
+
+  // Enable/disable interintra wedge search.
+  int disable_wedge_interintra_search;
+
+  // Disable loop restoration for Chroma plane
+  int disable_loop_restoration_chroma;
+
+  // Flag used to control the extent of coeff R-D optimization
+  int perform_coeff_opt;
+
+  // Flag used to control the speed of the eob selection in trellis.
+  int trellis_eob_fast;
+
+  // This flag controls the use of non-RD mode decision.
+  int use_nonrd_pick_mode;
+
+  // prune wedge and compound segment approximate rd evaluation based on
+  // compound average modeled rd
+  int prune_comp_type_by_model_rd;
+
+  // Enable/disable smooth intra modes.
+  int disable_smooth_intra;
+
+  // use reduced ref set for real-time mode
+  int use_real_time_ref_set;
+
+  // Perform a full TX search on some modes while using the
+  // inter-mode RD model for others. Only enabled when
+  // inter_mode_rd_model_estimation != 0
+  int inter_mode_rd_model_estimation_adaptive;
 } SPEED_FEATURES;
 
 struct AV1_COMP;
 
-void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi);
-void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi);
+void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi,
+                                                  int speed);
+void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi,
+                                                int speed);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/av1/encoder/temporal_filter.c b/libaom/av1/encoder/temporal_filter.c
index ace585e..ba883d7 100644
--- a/libaom/av1/encoder/temporal_filter.c
+++ b/libaom/av1/encoder/temporal_filter.c
@@ -37,13 +37,22 @@
 #define EDGE_THRESHOLD 50
 #define SQRT_PI_BY_2 1.25331413732
 
+static unsigned int index_mult[14] = {
+  0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124
+};
+
+static int64_t highbd_index_mult[14] = { 0U,          0U,          0U,
+                                         0U,          3221225472U, 2576980378U,
+                                         2147483648U, 1840700270U, 1610612736U,
+                                         1431655766U, 1288490189U, 1171354718U,
+                                         0U,          991146300U };
+
 static void temporal_filter_predictors_mb_c(
     MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
     int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
     uint8_t *pred, struct scale_factors *scale, int x, int y,
-    int can_use_previous, int num_planes) {
-  const MV mv = { mv_row, mv_col };
-  enum mv_precision mv_precision_uv;
+    int can_use_previous, int num_planes, MV *blk_mvs, int use_32x32) {
+  mv_precision mv_precision_uv;
   int uv_stride;
   // TODO(angiebird): change plane setting accordingly
   ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
@@ -52,33 +61,146 @@ static void temporal_filter_predictors_mb_c(
   WarpTypesAllowed warp_types;
   memset(&warp_types, 0, sizeof(WarpTypesAllowed));
 
-  if (uv_block_width == 8) {
+  const int ssx = (uv_block_width == (BW >> 1)) ? 1 : 0;
+  if (ssx) {
     uv_stride = (stride + 1) >> 1;
     mv_precision_uv = MV_PRECISION_Q4;
   } else {
     uv_stride = stride;
     mv_precision_uv = MV_PRECISION_Q3;
   }
-  av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
-                            &conv_params, interp_filters, &warp_types, x, y, 0,
-                            0, MV_PRECISION_Q3, x, y, xd, can_use_previous);
 
+  if (use_32x32) {
+    assert(mv_row >= INT16_MIN && mv_row <= INT16_MAX && mv_col >= INT16_MIN &&
+           mv_col <= INT16_MAX);
+    const MV mv = { (int16_t)mv_row, (int16_t)mv_col };
+
+    av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], BW, &mv, scale, BW,
+                              BH, &conv_params, interp_filters, &warp_types, x,
+                              y, 0, 0, MV_PRECISION_Q3, x, y, xd,
+                              can_use_previous);
+    if (num_planes > 1) {
+      av1_build_inter_predictor(
+          u_mb_ptr, uv_stride, &pred[BLK_PELS], uv_block_width, &mv, scale,
+          uv_block_width, uv_block_height, &conv_params, interp_filters,
+          &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous);
+      av1_build_inter_predictor(
+          v_mb_ptr, uv_stride, &pred[(BLK_PELS << 1)], uv_block_width, &mv,
+          scale, uv_block_width, uv_block_height, &conv_params, interp_filters,
+          &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous);
+    }
+
+    return;
+  }
+
+  // While use_32x32 = 0, construct the 32x32 predictor using 4 16x16
+  // predictors.
+  int i, j, k = 0, ys = (BH >> 1), xs = (BW >> 1);
+  // Y predictor
+  for (i = 0; i < BH; i += ys) {
+    for (j = 0; j < BW; j += xs) {
+      const MV mv = blk_mvs[k];
+      const int y_offset = i * stride + j;
+      const int p_offset = i * BW + j;
+
+      av1_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset],
+                                BW, &mv, scale, xs, ys, &conv_params,
+                                interp_filters, &warp_types, x, y, 0, 0,
+                                MV_PRECISION_Q3, x, y, xd, can_use_previous);
+      k++;
+    }
+  }
+
+  // U and V predictors
   if (num_planes > 1) {
-    av1_build_inter_predictor(
-        u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale,
-        uv_block_width, uv_block_height, &conv_params, interp_filters,
-        &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous);
-
-    av1_build_inter_predictor(
-        v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale,
-        uv_block_width, uv_block_height, &conv_params, interp_filters,
-        &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous);
+    ys = (uv_block_height >> 1);
+    xs = (uv_block_width >> 1);
+    k = 0;
+
+    for (i = 0; i < uv_block_height; i += ys) {
+      for (j = 0; j < uv_block_width; j += xs) {
+        const MV mv = blk_mvs[k];
+        const int uv_offset = i * uv_stride + j;
+        const int p_offset = i * uv_block_width + j;
+
+        av1_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride,
+                                  &pred[BLK_PELS + p_offset], uv_block_width,
+                                  &mv, scale, xs, ys, &conv_params,
+                                  interp_filters, &warp_types, x, y, 1, 0,
+                                  mv_precision_uv, x, y, xd, can_use_previous);
+        av1_build_inter_predictor(
+            v_mb_ptr + uv_offset, uv_stride, &pred[(BLK_PELS << 1) + p_offset],
+            uv_block_width, &mv, scale, xs, ys, &conv_params, interp_filters,
+            &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd,
+            can_use_previous);
+        k++;
+      }
+    }
   }
 }
 
-static INLINE int64_t mod_index(int64_t sum_dist, int index, int rounding,
-                                int strength, int filter_weight) {
-  int64_t mod = (sum_dist * 3) / index;
+static void apply_temporal_filter_self(const uint8_t *pred, int buf_stride,
+                                       unsigned int block_width,
+                                       unsigned int block_height,
+                                       int filter_weight, uint32_t *accumulator,
+                                       uint16_t *count) {
+  const int modifier = filter_weight * 16;
+  unsigned int i, j, k = 0;
+  assert(filter_weight == 2);
+
+  for (i = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++) {
+      const int pixel_value = pred[i * buf_stride + j];
+      count[k] += modifier;
+      accumulator[k] += modifier * pixel_value;
+      ++k;
+    }
+  }
+}
+
+static void highbd_apply_temporal_filter_self(
+    const uint8_t *pred8, int buf_stride, unsigned int block_width,
+    unsigned int block_height, int filter_weight, uint32_t *accumulator,
+    uint16_t *count) {
+  const int modifier = filter_weight * 16;
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  unsigned int i, j, k = 0;
+  assert(filter_weight == 2);
+
+  for (i = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++) {
+      const int pixel_value = pred[i * buf_stride + j];
+      count[k] += modifier;
+      accumulator[k] += modifier * pixel_value;
+      ++k;
+    }
+  }
+}
+
+static INLINE int mod_index(int sum_dist, int index, int rounding, int strength,
+                            int filter_weight) {
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = AOMMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+static INLINE int highbd_mod_index(int64_t sum_dist, int index, int rounding,
+                                   int strength, int filter_weight) {
+  assert(index >= 0 && index <= 13);
+  assert(highbd_index_mult[index] != 0);
+
+  int mod =
+      (int)((AOMMIN(sum_dist, INT32_MAX) * highbd_index_mult[index]) >> 32);
   mod += rounding;
   mod >>= strength;
 
@@ -106,12 +228,35 @@ static INLINE void calculate_squared_errors(const uint8_t *s, int s_stride,
   }
 }
 
-static void apply_temporal_filter(
+static INLINE int get_filter_weight(unsigned int i, unsigned int j,
+                                    unsigned int block_height,
+                                    unsigned int block_width, const int *blk_fw,
+                                    int use_32x32) {
+  if (use_32x32)
+    // blk_fw[0] ~ blk_fw[3] are the same.
+    return blk_fw[0];
+
+  int filter_weight = 0;
+  if (i < block_height / 2) {
+    if (j < block_width / 2)
+      filter_weight = blk_fw[0];
+    else
+      filter_weight = blk_fw[1];
+  } else {
+    if (j < block_width / 2)
+      filter_weight = blk_fw[2];
+    else
+      filter_weight = blk_fw[3];
+  }
+  return filter_weight;
+}
+
+void av1_apply_temporal_filter_c(
     const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred,
     int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1,
     int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred,
     int uv_buf_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int filter_weight,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
     uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
     uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) {
   unsigned int i, j, k, m;
@@ -119,20 +264,17 @@ static void apply_temporal_filter(
   const int rounding = (1 << strength) >> 1;
   const unsigned int uv_block_width = block_width >> ss_x;
   const unsigned int uv_block_height = block_height >> ss_y;
-  DECLARE_ALIGNED(16, uint16_t, y_diff_sse[256]);
-  DECLARE_ALIGNED(16, uint16_t, u_diff_sse[256]);
-  DECLARE_ALIGNED(16, uint16_t, v_diff_sse[256]);
+  DECLARE_ALIGNED(16, uint16_t, y_diff_sse[BLK_PELS]);
+  DECLARE_ALIGNED(16, uint16_t, u_diff_sse[BLK_PELS]);
+  DECLARE_ALIGNED(16, uint16_t, v_diff_sse[BLK_PELS]);
 
   int idx = 0, idy;
 
-  assert(filter_weight >= 0);
-  assert(filter_weight <= 2);
-
-  memset(y_diff_sse, 0, 256 * sizeof(uint16_t));
-  memset(u_diff_sse, 0, 256 * sizeof(uint16_t));
-  memset(v_diff_sse, 0, 256 * sizeof(uint16_t));
+  memset(y_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
+  memset(u_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
+  memset(v_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
 
-  // Calculate diff^2 for each pixel of the 16x16 block.
+  // Calculate diff^2 for each pixel of the block.
   // TODO(yunqing): the following code needs to be optimized.
   calculate_squared_errors(y_frame1, y_stride, y_pred, y_buf_stride, y_diff_sse,
                            block_width, block_height);
@@ -144,6 +286,8 @@ static void apply_temporal_filter(
   for (i = 0, k = 0, m = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++) {
       const int pixel_value = y_pred[i * y_buf_stride + j];
+      int filter_weight =
+          get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
 
       // non-local mean approach
       int y_index = 0;
@@ -249,22 +393,22 @@ static INLINE void highbd_calculate_squared_errors(
   }
 }
 
-static void highbd_apply_temporal_filter(
+void av1_highbd_apply_temporal_filter_c(
     const uint8_t *yf, int y_stride, const uint8_t *yp, int y_buf_stride,
     const uint8_t *uf, const uint8_t *vf, int uv_stride, const uint8_t *up,
     const uint8_t *vp, int uv_buf_stride, unsigned int block_width,
     unsigned int block_height, int ss_x, int ss_y, int strength,
-    int filter_weight, uint32_t *y_accumulator, uint16_t *y_count,
-    uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator,
-    uint16_t *v_count) {
+    const int *blk_fw, int use_32x32, uint32_t *y_accumulator,
+    uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count,
+    uint32_t *v_accumulator, uint16_t *v_count) {
   unsigned int i, j, k, m;
   int64_t modifier;
   const int rounding = (1 << strength) >> 1;
   const unsigned int uv_block_width = block_width >> ss_x;
   const unsigned int uv_block_height = block_height >> ss_y;
-  DECLARE_ALIGNED(16, uint32_t, y_diff_sse[256]);
-  DECLARE_ALIGNED(16, uint32_t, u_diff_sse[256]);
-  DECLARE_ALIGNED(16, uint32_t, v_diff_sse[256]);
+  DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]);
+  DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]);
+  DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]);
 
   const uint16_t *y_frame1 = CONVERT_TO_SHORTPTR(yf);
   const uint16_t *u_frame1 = CONVERT_TO_SHORTPTR(uf);
@@ -274,14 +418,11 @@ static void highbd_apply_temporal_filter(
   const uint16_t *v_pred = CONVERT_TO_SHORTPTR(vp);
   int idx = 0, idy;
 
-  assert(filter_weight >= 0);
-  assert(filter_weight <= 2);
-
-  memset(y_diff_sse, 0, 256 * sizeof(uint32_t));
-  memset(u_diff_sse, 0, 256 * sizeof(uint32_t));
-  memset(v_diff_sse, 0, 256 * sizeof(uint32_t));
+  memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+  memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+  memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
 
-  // Calculate diff^2 for each pixel of the 16x16 block.
+  // Calculate diff^2 for each pixel of the block.
   // TODO(yunqing): the following code needs to be optimized.
   highbd_calculate_squared_errors(y_frame1, y_stride, y_pred, y_buf_stride,
                                   y_diff_sse, block_width, block_height);
@@ -293,6 +434,8 @@ static void highbd_apply_temporal_filter(
   for (i = 0, k = 0, m = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++) {
       const int pixel_value = y_pred[i * y_buf_stride + j];
+      int filter_weight =
+          get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
 
       // non-local mean approach
       int y_index = 0;
@@ -321,11 +464,11 @@ static void highbd_apply_temporal_filter(
 
       y_index += 2;
 
-      modifier =
-          mod_index(modifier, y_index, rounding, strength, filter_weight);
+      const int final_y_mod = highbd_mod_index(modifier, y_index, rounding,
+                                               strength, filter_weight);
 
-      y_count[k] += modifier;
-      y_accumulator[k] += modifier * pixel_value;
+      y_count[k] += final_y_mod;
+      y_accumulator[k] += final_y_mod * pixel_value;
 
       ++k;
 
@@ -367,13 +510,15 @@ static void highbd_apply_temporal_filter(
         u_mod += y_diff;
         v_mod += y_diff;
 
-        u_mod = mod_index(u_mod, cr_index, rounding, strength, filter_weight);
-        v_mod = mod_index(v_mod, cr_index, rounding, strength, filter_weight);
+        const int final_u_mod = highbd_mod_index(u_mod, cr_index, rounding,
+                                                 strength, filter_weight);
+        const int final_v_mod = highbd_mod_index(v_mod, cr_index, rounding,
+                                                 strength, filter_weight);
 
-        u_count[m] += u_mod;
-        u_accumulator[m] += u_mod * u_pixel_value;
-        v_count[m] += v_mod;
-        v_accumulator[m] += v_mod * v_pixel_value;
+        u_count[m] += final_u_mod;
+        u_accumulator[m] += final_u_mod * u_pixel_value;
+        v_count[m] += final_v_mod;
+        v_accumulator[m] += final_v_mod * v_pixel_value;
 
         ++m;
       }  // Complete YUV pixel
@@ -385,8 +530,8 @@ static void highbd_apply_temporal_filter(
 void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
                                  uint8_t *frame2, unsigned int block_width,
                                  unsigned int block_height, int strength,
-                                 int filter_weight, unsigned int *accumulator,
-                                 uint16_t *count) {
+                                 const int *blk_fw, int use_32x32,
+                                 unsigned int *accumulator, uint16_t *count) {
   unsigned int i, j, k;
   int modifier;
   int byte = 0;
@@ -395,6 +540,8 @@ void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
       int pixel_value = *frame2;
+      int filter_weight =
+          get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
 
       // non-local mean approach
       int diff_sse[9] = { 0 };
@@ -447,7 +594,7 @@ void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
 void av1_highbd_temporal_filter_apply_c(
     uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8,
     unsigned int block_width, unsigned int block_height, int strength,
-    int filter_weight, unsigned int *accumulator, uint16_t *count) {
+    int *blk_fw, int use_32x32, unsigned int *accumulator, uint16_t *count) {
   uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
   uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
   unsigned int i, j, k;
@@ -458,6 +605,8 @@ void av1_highbd_temporal_filter_apply_c(
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
       int pixel_value = *frame2;
+      int filter_weight =
+          get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
 
       // non-local mean approach
       int diff_sse[9] = { 0 };
@@ -509,8 +658,8 @@ void av1_highbd_temporal_filter_apply_c(
 static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
                                               uint8_t *arf_frame_buf,
                                               uint8_t *frame_ptr_buf,
-                                              int stride, int x_pos,
-                                              int y_pos) {
+                                              int stride, int x_pos, int y_pos,
+                                              MV *blk_mvs, int *blk_bestsme) {
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
@@ -543,9 +692,12 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
 
   av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
 
-  av1_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
-                        NSTEP, 1, sadpb, cond_cost_list(cpi, cost_list),
-                        &best_ref_mv1, 0, 0, x_pos, y_pos, 0);
+  // av1_full_pixel_search() parameters: best_ref_mv1_full is the start mv, and
+  // best_ref_mv1 is for mv rate calculation. The search result is stored in
+  // x->best_mv.
+  av1_full_pixel_search(cpi, x, TF_BLOCK, &best_ref_mv1_full, step_param, NSTEP,
+                        1, sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1,
+                        0, 0, x_pos, y_pos, 0, &cpi->ss_cfg[SS_CFG_LOOKAHEAD]);
   x->mv_limits = tmp_mv_limits;
 
   // Ignore mv costing by sending NULL pointer instead of cost array
@@ -559,19 +711,64 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
     x->best_mv.as_mv.row *= 8;
     x->best_mv.as_mv.col *= 8;
 
-    bestsme = cpi->fn_ptr[BLOCK_16X16].vf(y + offset, y_stride, src_address,
-                                          src_stride, &sse);
-  } else {
-    bestsme = cpi->find_fractional_mv_step(
-        x, &cpi->common, 0, 0, &best_ref_mv1,
-        cpi->common.allow_high_precision_mv, x->errorperbit,
-        &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step,
-        cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
-        NULL, 0, 0, 16, 16, USE_8_TAPS, 1);
+    bestsme = cpi->fn_ptr[TF_BLOCK].vf(y + offset, y_stride, src_address,
+                                       src_stride, &sse);
+
+    x->e_mbd.mi[0]->mv[0] = x->best_mv;
+
+    // Restore input state
+    x->plane[0].src = src;
+    xd->plane[0].pre[0] = pre;
+
+    return bestsme;
   }
 
+  // find_fractional_mv_step parameters: best_ref_mv1 is for mv rate cost
+  // calculation. The start full mv and the search result are stored in
+  // x->best_mv. mi_row and mi_col are only needed for "av1_is_scaled(sf)=1"
+  // case.
+  bestsme = cpi->find_fractional_mv_step(
+      x, &cpi->common, 0, 0, &best_ref_mv1, cpi->common.allow_high_precision_mv,
+      x->errorperbit, &cpi->fn_ptr[TF_BLOCK], 0, mv_sf->subpel_iters_per_step,
+      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL,
+      0, 0, BW, BH, USE_8_TAPS, 1);
+
   x->e_mbd.mi[0]->mv[0] = x->best_mv;
 
+  // DO motion search on 4 16x16 sub_blocks.
+  int i, j, k = 0;
+  best_ref_mv1.row = x->e_mbd.mi[0]->mv[0].as_mv.row;
+  best_ref_mv1.col = x->e_mbd.mi[0]->mv[0].as_mv.col;
+  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+  for (i = 0; i < BH; i += SUB_BH) {
+    for (j = 0; j < BW; j += SUB_BW) {
+      // Setup frame pointers
+      x->plane[0].src.buf = arf_frame_buf + i * stride + j;
+      x->plane[0].src.stride = stride;
+      xd->plane[0].pre[0].buf = frame_ptr_buf + i * stride + j;
+      xd->plane[0].pre[0].stride = stride;
+
+      av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+      av1_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full,
+                            step_param, NSTEP, 1, sadpb,
+                            cond_cost_list(cpi, cost_list), &best_ref_mv1, 0, 0,
+                            x_pos, y_pos, 0, &cpi->ss_cfg[SS_CFG_LOOKAHEAD]);
+      x->mv_limits = tmp_mv_limits;
+
+      blk_bestsme[k] = cpi->find_fractional_mv_step(
+          x, &cpi->common, 0, 0, &best_ref_mv1,
+          cpi->common.allow_high_precision_mv, x->errorperbit,
+          &cpi->fn_ptr[TF_SUB_BLOCK], 0, mv_sf->subpel_iters_per_step,
+          cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
+          NULL, 0, 0, SUB_BW, SUB_BH, USE_8_TAPS, 1);
+
+      blk_mvs[k] = x->best_mv.as_mv;
+      k++;
+    }
+  }
+
   // Restore input state
   x->plane[0].src = src;
   xd->plane[0].pre[0] = pre;
@@ -582,39 +779,42 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
 static void temporal_filter_iterate_c(AV1_COMP *cpi,
                                       YV12_BUFFER_CONFIG **frames,
                                       int frame_count, int alt_ref_index,
-                                      int strength, RefBuffer *ref_buf) {
+                                      int strength,
+                                      struct scale_factors *ref_scale_factors) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   int byte;
   int frame;
   int mb_col, mb_row;
-  unsigned int filter_weight;
-  int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;
-  int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
+  int mb_cols = (frames[alt_ref_index]->y_crop_width + BW - 1) >> BW_LOG2;
+  int mb_rows = (frames[alt_ref_index]->y_crop_height + BH - 1) >> BH_LOG2;
   int mb_y_offset = 0;
+  int mb_y_src_offset = 0;
   int mb_uv_offset = 0;
-  DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
-  DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
+  int mb_uv_src_offset = 0;
+  DECLARE_ALIGNED(16, unsigned int, accumulator[BLK_PELS * 3]);
+  DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]);
   MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
   YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
   uint8_t *dst1, *dst2;
-  DECLARE_ALIGNED(32, uint16_t, predictor16[16 * 16 * 3]);
-  DECLARE_ALIGNED(32, uint8_t, predictor8[16 * 16 * 3]);
+  DECLARE_ALIGNED(32, uint16_t, predictor16[BLK_PELS * 3]);
+  DECLARE_ALIGNED(32, uint8_t, predictor8[BLK_PELS * 3]);
   uint8_t *predictor;
-  const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
-  const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
+  const int mb_uv_height = BH >> mbd->plane[1].subsampling_y;
+  const int mb_uv_width = BW >> mbd->plane[1].subsampling_x;
 
   // Save input state
   uint8_t *input_buffer[MAX_MB_PLANE];
   int i;
-  if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  const int is_hbd = is_cur_buf_hbd(mbd);
+  if (is_hbd) {
     predictor = CONVERT_TO_BYTEPTR(predictor16);
   } else {
     predictor = predictor8;
   }
 
-  mbd->block_refs[0] = ref_buf;
-  mbd->block_refs[1] = ref_buf;
+  mbd->block_ref_scale_factors[0] = ref_scale_factors;
+  mbd->block_ref_scale_factors[1] = ref_scale_factors;
 
   for (i = 0; i < num_planes; i++) input_buffer[i] = mbd->plane[i].pre[0].buf;
 
@@ -631,108 +831,173 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
     // To keep the mv in play for both Y and UV planes the max that it
     //  can be on a border is therefore 16 - (2*AOM_INTERP_EXTEND+1).
     cpi->td.mb.mv_limits.row_min =
-        -((mb_row * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+        -((mb_row * BH) + (17 - 2 * AOM_INTERP_EXTEND));
     cpi->td.mb.mv_limits.row_max =
-        ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+        ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * AOM_INTERP_EXTEND);
 
     for (mb_col = 0; mb_col < mb_cols; mb_col++) {
       int j, k;
       int stride;
 
-      memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
-      memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+      memset(accumulator, 0, BLK_PELS * 3 * sizeof(accumulator[0]));
+      memset(count, 0, BLK_PELS * 3 * sizeof(count[0]));
 
       cpi->td.mb.mv_limits.col_min =
-          -((mb_col * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+          -((mb_col * BW) + (17 - 2 * AOM_INTERP_EXTEND));
       cpi->td.mb.mv_limits.col_max =
-          ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+          ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * AOM_INTERP_EXTEND);
 
       for (frame = 0; frame < frame_count; frame++) {
-        const int thresh_low = 10000;
-        const int thresh_high = 20000;
+        // MVs for 4 16x16 sub blocks.
+        MV blk_mvs[4];
+        // Filter weights for 4 16x16 sub blocks.
+        int blk_fw[4] = { 0, 0, 0, 0 };
+        int use_32x32 = 0;
 
         if (frames[frame] == NULL) continue;
 
         mbd->mi[0]->mv[0].as_mv.row = 0;
         mbd->mi[0]->mv[0].as_mv.col = 0;
         mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+        blk_mvs[0] = kZeroMv;
+        blk_mvs[1] = kZeroMv;
+        blk_mvs[2] = kZeroMv;
+        blk_mvs[3] = kZeroMv;
 
         if (frame == alt_ref_index) {
-          filter_weight = 2;
+          blk_fw[0] = blk_fw[1] = blk_fw[2] = blk_fw[3] = 2;
+          use_32x32 = 1;
         } else {
+          int thresh_low = 10000;
+          int thresh_high = 20000;
+          int blk_bestsme[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+
           // Find best match in this frame by MC
           int err = temporal_filter_find_matching_mb_c(
-              cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
-              frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride,
-              mb_col * 16, mb_row * 16);
-
-          // Assign higher weight to matching MB if it's error
-          // score is lower. If not applying MC default behavior
-          // is to weight all MBs equal.
-          filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0;
+              cpi, frames[alt_ref_index]->y_buffer + mb_y_src_offset,
+              frames[frame]->y_buffer + mb_y_src_offset,
+              frames[frame]->y_stride, mb_col * BW, mb_row * BH, blk_mvs,
+              blk_bestsme);
+
+          int err16 =
+              blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3];
+          int max_err = INT_MIN, min_err = INT_MAX;
+          for (k = 0; k < 4; k++) {
+            if (min_err > blk_bestsme[k]) min_err = blk_bestsme[k];
+            if (max_err < blk_bestsme[k]) max_err = blk_bestsme[k];
+          }
+
+          if (((err * 15 < (err16 << 4)) && max_err - min_err < 12000) ||
+              ((err * 14 < (err16 << 4)) && max_err - min_err < 6000)) {
+            use_32x32 = 1;
+            // Assign higher weight to matching MB if it's error
+            // score is lower. If not applying MC default behavior
+            // is to weight all MBs equal.
+            blk_fw[0] = err < (thresh_low << THR_SHIFT)
+                            ? 2
+                            : err < (thresh_high << THR_SHIFT) ? 1 : 0;
+            blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0];
+          } else {
+            use_32x32 = 0;
+            for (k = 0; k < 4; k++)
+              blk_fw[k] = blk_bestsme[k] < thresh_low
+                              ? 2
+                              : blk_bestsme[k] < thresh_high ? 1 : 0;
+          }
         }
 
-        if (filter_weight != 0) {
+        if (blk_fw[0] || blk_fw[1] || blk_fw[2] || blk_fw[3]) {
           // Construct the predictors
           temporal_filter_predictors_mb_c(
-              mbd, frames[frame]->y_buffer + mb_y_offset,
-              frames[frame]->u_buffer + mb_uv_offset,
-              frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
-              mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row,
-              mbd->mi[0]->mv[0].as_mv.col, predictor, &ref_buf->sf, mb_col * 16,
-              mb_row * 16, cm->allow_warped_motion, num_planes);
+              mbd, frames[frame]->y_buffer + mb_y_src_offset,
+              frames[frame]->u_buffer + mb_uv_src_offset,
+              frames[frame]->v_buffer + mb_uv_src_offset,
+              frames[frame]->y_stride, mb_uv_width, mb_uv_height,
+              mbd->mi[0]->mv[0].as_mv.row, mbd->mi[0]->mv[0].as_mv.col,
+              predictor, ref_scale_factors, mb_col * BW, mb_row * BH,
+              cm->allow_warped_motion, num_planes, blk_mvs, use_32x32);
 
           // Apply the filter (YUV)
-          if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-            int adj_strength = strength + 2 * (mbd->bd - 8);
-
-            if (num_planes <= 1) {
-              // Single plane case
-              av1_highbd_temporal_filter_apply_c(
-                  f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
-                  adj_strength, filter_weight, accumulator, count);
-            } else {
-              // Process 3 planes together.
-              highbd_apply_temporal_filter(
-                  f->y_buffer + mb_y_offset, f->y_stride, predictor, 16,
-                  f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
-                  f->uv_stride, predictor + 256, predictor + 512, mb_uv_width,
-                  16, 16, mbd->plane[1].subsampling_x,
-                  mbd->plane[1].subsampling_y, adj_strength, filter_weight,
-                  accumulator, count, accumulator + 256, count + 256,
-                  accumulator + 512, count + 512);
+          if (frame == alt_ref_index) {
+            uint8_t *pred = predictor;
+            uint32_t *accum = accumulator;
+            uint16_t *cnt = count;
+            int plane;
+
+            // All 4 blk_fws are equal to 2.
+            for (plane = 0; plane < num_planes; ++plane) {
+              const int pred_stride = plane ? mb_uv_width : BW;
+              const unsigned int w = plane ? mb_uv_width : BW;
+              const unsigned int h = plane ? mb_uv_height : BH;
+
+              if (is_hbd) {
+                highbd_apply_temporal_filter_self(pred, pred_stride, w, h,
+                                                  blk_fw[0], accum, cnt);
+              } else {
+                apply_temporal_filter_self(pred, pred_stride, w, h, blk_fw[0],
+                                           accum, cnt);
+              }
+
+              pred += BLK_PELS;
+              accum += BLK_PELS;
+              cnt += BLK_PELS;
             }
           } else {
-            if (num_planes <= 1) {
-              // Single plane case
-              av1_temporal_filter_apply_c(
-                  f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
-                  strength, filter_weight, accumulator, count);
+            if (is_hbd) {
+              const int adj_strength = strength + 2 * (mbd->bd - 8);
+
+              if (num_planes <= 1) {
+                // Single plane case
+                av1_highbd_temporal_filter_apply_c(
+                    f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
+                    BH, adj_strength, blk_fw, use_32x32, accumulator, count);
+              } else {
+                // Process 3 planes together.
+                av1_highbd_apply_temporal_filter(
+                    f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
+                    f->u_buffer + mb_uv_src_offset,
+                    f->v_buffer + mb_uv_src_offset, f->uv_stride,
+                    predictor + BLK_PELS, predictor + (BLK_PELS << 1),
+                    mb_uv_width, BW, BH, mbd->plane[1].subsampling_x,
+                    mbd->plane[1].subsampling_y, adj_strength, blk_fw,
+                    use_32x32, accumulator, count, accumulator + BLK_PELS,
+                    count + BLK_PELS, accumulator + (BLK_PELS << 1),
+                    count + (BLK_PELS << 1));
+              }
             } else {
-              // Process 3 planes together.
-              apply_temporal_filter(
-                  f->y_buffer + mb_y_offset, f->y_stride, predictor, 16,
-                  f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
-                  f->uv_stride, predictor + 256, predictor + 512, mb_uv_width,
-                  16, 16, mbd->plane[1].subsampling_x,
-                  mbd->plane[1].subsampling_y, strength, filter_weight,
-                  accumulator, count, accumulator + 256, count + 256,
-                  accumulator + 512, count + 512);
+              if (num_planes <= 1) {
+                // Single plane case
+                av1_temporal_filter_apply_c(
+                    f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
+                    BH, strength, blk_fw, use_32x32, accumulator, count);
+              } else {
+                // Process 3 planes together.
+                av1_apply_temporal_filter(
+                    f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
+                    f->u_buffer + mb_uv_src_offset,
+                    f->v_buffer + mb_uv_src_offset, f->uv_stride,
+                    predictor + BLK_PELS, predictor + (BLK_PELS << 1),
+                    mb_uv_width, BW, BH, mbd->plane[1].subsampling_x,
+                    mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32,
+                    accumulator, count, accumulator + BLK_PELS,
+                    count + BLK_PELS, accumulator + (BLK_PELS << 1),
+                    count + (BLK_PELS << 1));
+              }
             }
           }
         }
       }
 
       // Normalize filter output to produce AltRef frame
-      if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (is_hbd) {
         uint16_t *dst1_16;
         uint16_t *dst2_16;
         dst1 = cpi->alt_ref_buffer.y_buffer;
         dst1_16 = CONVERT_TO_SHORTPTR(dst1);
         stride = cpi->alt_ref_buffer.y_stride;
         byte = mb_y_offset;
-        for (i = 0, k = 0; i < 16; i++) {
-          for (j = 0; j < 16; j++, k++) {
+        for (i = 0, k = 0; i < BH; i++) {
+          for (j = 0; j < BW; j++, k++) {
             dst1_16[byte] =
                 (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
 
@@ -740,7 +1005,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
             byte++;
           }
 
-          byte += stride - 16;
+          byte += stride - BW;
         }
         if (num_planes > 1) {
           dst1 = cpi->alt_ref_buffer.u_buffer;
@@ -749,9 +1014,9 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
           dst2_16 = CONVERT_TO_SHORTPTR(dst2);
           stride = cpi->alt_ref_buffer.uv_stride;
           byte = mb_uv_offset;
-          for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
             for (j = 0; j < mb_uv_width; j++, k++) {
-              int m = k + 256;
+              int m = k + BLK_PELS;
               // U
               dst1_16[byte] =
                   (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
@@ -768,24 +1033,24 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
         dst1 = cpi->alt_ref_buffer.y_buffer;
         stride = cpi->alt_ref_buffer.y_stride;
         byte = mb_y_offset;
-        for (i = 0, k = 0; i < 16; i++) {
-          for (j = 0; j < 16; j++, k++) {
+        for (i = 0, k = 0; i < BH; i++) {
+          for (j = 0; j < BW; j++, k++) {
             dst1[byte] =
                 (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
 
             // move to next pixel
             byte++;
           }
-          byte += stride - 16;
+          byte += stride - BW;
         }
         if (num_planes > 1) {
           dst1 = cpi->alt_ref_buffer.u_buffer;
           dst2 = cpi->alt_ref_buffer.v_buffer;
           stride = cpi->alt_ref_buffer.uv_stride;
           byte = mb_uv_offset;
-          for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
             for (j = 0; j < mb_uv_width; j++, k++) {
-              int m = k + 256;
+              int m = k + BLK_PELS;
               // U
               dst1[byte] =
                   (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
@@ -799,11 +1064,16 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
           }
         }
       }
-      mb_y_offset += 16;
+      mb_y_offset += BW;
+      mb_y_src_offset += BW;
       mb_uv_offset += mb_uv_width;
+      mb_uv_src_offset += mb_uv_width;
     }
-    mb_y_offset += 16 * (f->y_stride - mb_cols);
-    mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
+    mb_y_offset += BH * cpi->alt_ref_buffer.y_stride - BW * mb_cols;
+    mb_y_src_offset += BH * f->y_stride - BW * mb_cols;
+    mb_uv_src_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
+    mb_uv_offset +=
+        mb_uv_height * cpi->alt_ref_buffer.uv_stride - mb_uv_width * mb_cols;
   }
 
   // Restore input state
@@ -920,7 +1190,7 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
   MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
   struct lookahead_entry *buf = av1_lookahead_peek(cpi->lookahead, distance);
   double noiselevel;
-  if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(mbd)) {
     noiselevel = highbd_estimate_noise(
         buf->img.y_buffer, buf->img.y_crop_width, buf->img.y_crop_height,
         buf->img.y_stride, mbd->bd, EDGE_THRESHOLD);
@@ -974,8 +1244,7 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) {
   int strength;
   int frames_to_blur_backward;
   int frames_to_blur_forward;
-  RefBuffer ref_buf;
-  ref_buf.buf = NULL;
+  struct scale_factors sf;
 
   YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
   const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
@@ -984,9 +1253,8 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) {
   // Apply context specific adjustments to the arnr filter parameters.
   if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
     // TODO(weitinglin): Currently, we enforce the filtering strength on
-    //                   extra ARFs' to be zeros. We should investigate in which
-    //                   case it is more beneficial to use non-zero strength
-    //                   filtering.
+    // internal ARFs to be zeros. We should investigate in which case it is more
+    // beneficial to use non-zero strength filtering.
     strength = 0;
     frames_to_blur = 1;
   } else {
@@ -1020,7 +1288,7 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) {
     // supported.
     // ARF is produced at the native frame size and resized when coded.
     av1_setup_scale_factors_for_frame(
-        &ref_buf.sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+        &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
         frames[0]->y_crop_width, frames[0]->y_crop_height);
   }
 
@@ -1031,5 +1299,5 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) {
   av1_initialize_cost_tables(&cpi->common, &cpi->td.mb);
 
   temporal_filter_iterate_c(cpi, frames, frames_to_blur,
-                            frames_to_blur_backward, strength, &ref_buf);
+                            frames_to_blur_backward, strength, &sf);
 }
diff --git a/libaom/av1/encoder/temporal_filter.h b/libaom/av1/encoder/temporal_filter.h
index 1ff1162..bb26c36 100644
--- a/libaom/av1/encoder/temporal_filter.h
+++ b/libaom/av1/encoder/temporal_filter.h
@@ -18,6 +18,18 @@ extern "C" {
 
 #define ARNR_FILT_QINDEX 128
 
+// Block size used in temporal filtering
+#define TF_BLOCK BLOCK_32X32
+#define BH 32
+#define BH_LOG2 5
+#define BW 32
+#define BW_LOG2 5
+#define BLK_PELS 1024  // Pixels in the block
+#define THR_SHIFT 2
+#define TF_SUB_BLOCK BLOCK_16X16
+#define SUB_BH 16
+#define SUB_BW 16
+
 void av1_temporal_filter(AV1_COMP *cpi, int distance);
 
 #ifdef __cplusplus
diff --git a/libaom/av1/encoder/tokenize.h b/libaom/av1/encoder/tokenize.h
index 63b505f..c80af7b 100644
--- a/libaom/av1/encoder/tokenize.h
+++ b/libaom/av1/encoder/tokenize.h
@@ -38,11 +38,11 @@ struct tokenize_b_args {
   uint8_t allow_update_cdf;
 };
 
-typedef enum {
+enum {
   OUTPUT_ENABLED = 0,
   DRY_RUN_NORMAL,
   DRY_RUN_COSTCOEFFS,
-} RUN_TYPE;
+} UENUM1BYTE(RUN_TYPE);
 
 // Note in all the tokenize functions rate if non NULL is incremented
 // with the coefficient token cost only if dry_run = DRY_RUN_COSTCOEFS,
diff --git a/libaom/av1/encoder/tpl_model.c b/libaom/av1/encoder/tpl_model.c
new file mode 100644
index 0000000..79afb6d
--- /dev/null
+++ b/libaom/av1/encoder/tpl_model.c
@@ -0,0 +1,592 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/reconinter_enc.h"
+
+typedef struct GF_PICTURE {
+  YV12_BUFFER_CONFIG *frame;
+  int ref_frame[7];
+} GF_PICTURE;
+
+static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
+                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                               TX_SIZE tx_size, int64_t *recon_error,
+                               int64_t *sse) {
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
+  uint16_t eob;
+  int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+  const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+  av1_quantize_fp_32x32(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX,
+                        p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, dqcoeff,
+                        p->dequant_QTX, &eob, scan_order->scan,
+                        scan_order->iscan);
+
+  *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+  *recon_error = AOMMAX(*recon_error, 1);
+
+  *sse = (*sse) >> shift;
+  *sse = AOMMAX(*sse, 1);
+}
+
+static void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+                         TX_SIZE tx_size) {
+  switch (tx_size) {
+    case TX_8X8: aom_hadamard_8x8(src_diff, bw, coeff); break;
+    case TX_16X16: aom_hadamard_16x16(src_diff, bw, coeff); break;
+    case TX_32X32: aom_hadamard_32x32(src_diff, bw, coeff); break;
+    default: assert(0);
+  }
+}
+
+static uint32_t motion_compensated_prediction(AV1_COMP *cpi, ThreadData *td,
+                                              uint8_t *cur_frame_buf,
+                                              uint8_t *ref_frame_buf,
+                                              int stride, BLOCK_SIZE bsize,
+                                              int mi_row, int mi_col) {
+  AV1_COMMON *cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  const SEARCH_METHODS search_method = NSTEP;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  uint32_t bestsme = UINT_MAX;
+  int distortion;
+  uint32_t sse;
+  int cost_list[5];
+  const MvLimits tmp_mv_limits = x->mv_limits;
+
+  MV best_ref_mv1 = { 0, 0 };
+  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+  // Setup frame pointers
+  x->plane[0].src.buf = cur_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = ref_frame_buf;
+  xd->plane[0].pre[0].stride = stride;
+
+  step_param = mv_sf->reduce_first_step_size;
+  step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+  av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+  av1_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
+                        search_method, 0, sadpb, cond_cost_list(cpi, cost_list),
+                        &best_ref_mv1, INT_MAX, 0, (MI_SIZE * mi_col),
+                        (MI_SIZE * mi_row), 0, &cpi->ss_cfg[SS_CFG_SRC]);
+
+  /* restore UMV window */
+  x->mv_limits = tmp_mv_limits;
+
+  const int pw = block_size_wide[bsize];
+  const int ph = block_size_high[bsize];
+  bestsme = cpi->find_fractional_mv_step(
+      x, cm, mi_row, mi_col, &best_ref_mv1, cpi->common.allow_high_precision_mv,
+      x->errorperbit, &cpi->fn_ptr[bsize], 0, mv_sf->subpel_iters_per_step,
+      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL,
+      0, 0, pw, ph, 1, 1);
+
+  return bestsme;
+}
+
+static void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                            struct scale_factors *sf, GF_PICTURE *gf_picture,
+                            int frame_idx, int16_t *src_diff, tran_low_t *coeff,
+                            tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
+                            int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
+                            TplDepStats *tpl_stats) {
+  AV1_COMMON *cm = &cpi->common;
+  ThreadData *td = &cpi->td;
+
+  const int bw = 4 << mi_size_wide_log2[bsize];
+  const int bh = 4 << mi_size_high_log2[bsize];
+  const int pix_num = bw * bh;
+  int best_rf_idx = -1;
+  int_mv best_mv;
+  int64_t best_inter_cost = INT64_MAX;
+  int64_t inter_cost;
+  int rf_idx;
+  const InterpFilters kernel =
+      av1_make_interp_filters(EIGHTTAP_REGULAR, EIGHTTAP_REGULAR);
+
+  int64_t best_intra_cost = INT64_MAX;
+  int64_t intra_cost;
+  PREDICTION_MODE mode;
+  int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  MB_MODE_INFO mi_above, mi_left;
+
+  memset(tpl_stats, 0, sizeof(*tpl_stats));
+
+  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
+  xd->above_mbmi = (mi_row > 0) ? &mi_above : NULL;
+  xd->left_mbmi = (mi_col > 0) ? &mi_left : NULL;
+
+  // Intra prediction search
+  for (mode = DC_PRED; mode <= PAETH_PRED; ++mode) {
+    uint8_t *src, *dst;
+    int src_stride, dst_stride;
+
+    src = xd->cur_buf->y_buffer + mb_y_offset;
+    src_stride = xd->cur_buf->y_stride;
+
+    dst = &predictor[0];
+    dst_stride = bw;
+
+    xd->mi[0]->sb_type = bsize;
+    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+    av1_predict_intra_block(
+        cm, xd, block_size_wide[bsize], block_size_high[bsize], tx_size, mode,
+        0, 0, FILTER_INTRA_MODES, src, src_stride, dst, dst_stride, 0, 0, 0);
+
+    if (is_cur_buf_hbd(xd)) {
+      aom_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+                                dst_stride, xd->bd);
+    } else {
+      aom_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+                         dst_stride);
+    }
+
+    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+
+    intra_cost = aom_satd(coeff, pix_num);
+
+    if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
+  }
+
+  // Motion compensated prediction
+  best_mv.as_int = 0;
+
+  (void)mb_y_offset;
+  // Motion estimation column boundary
+  x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
+  x->mv_limits.col_max =
+      ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND);
+
+  for (rf_idx = 0; rf_idx < 7; ++rf_idx) {
+    if (ref_frame[rf_idx] == NULL) continue;
+
+    motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
+                                  ref_frame[rf_idx]->y_buffer + mb_y_offset,
+                                  xd->cur_buf->y_stride, bsize, mi_row, mi_col);
+
+    // TODO(jingning): Not yet support high bit-depth in the next three
+    // steps.
+    ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
+    WarpTypesAllowed warp_types;
+    memset(&warp_types, 0, sizeof(WarpTypesAllowed));
+
+    av1_build_inter_predictor(
+        ref_frame[rf_idx]->y_buffer + mb_y_offset, ref_frame[rf_idx]->y_stride,
+        &predictor[0], bw, &x->best_mv.as_mv, sf, bw, bh, &conv_params, kernel,
+        &warp_types, mi_col * MI_SIZE, mi_row * MI_SIZE, 0, 0, MV_PRECISION_Q3,
+        mi_col * MI_SIZE, mi_row * MI_SIZE, xd, 0);
+    if (is_cur_buf_hbd(xd)) {
+      aom_highbd_subtract_block(
+          bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
+          xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
+    } else {
+      aom_subtract_block(bh, bw, src_diff, bw,
+                         xd->cur_buf->y_buffer + mb_y_offset,
+                         xd->cur_buf->y_stride, &predictor[0], bw);
+    }
+    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+
+    inter_cost = aom_satd(coeff, pix_num);
+    if (inter_cost < best_inter_cost) {
+      int64_t recon_error, sse;
+
+      best_rf_idx = rf_idx;
+      best_inter_cost = inter_cost;
+      best_mv.as_int = x->best_mv.as_int;
+      get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &recon_error,
+                         &sse);
+    }
+  }
+  best_intra_cost = AOMMAX(best_intra_cost, 1);
+  best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
+  tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
+  tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
+  tpl_stats->mc_dep_cost = tpl_stats->intra_cost + tpl_stats->mc_flow;
+
+  tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+  tpl_stats->mv.as_int = best_mv.as_int;
+}
+
+static int round_floor(int ref_pos, int bsize_pix) {
+  int round;
+  if (ref_pos < 0)
+    round = -(1 + (-ref_pos - 1) / bsize_pix);
+  else
+    round = ref_pos / bsize_pix;
+
+  return round;
+}
+
+static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
+                            int ref_pos_col, int block, BLOCK_SIZE bsize) {
+  int width = 0, height = 0;
+  int bw = 4 << mi_size_wide_log2[bsize];
+  int bh = 4 << mi_size_high_log2[bsize];
+
+  switch (block) {
+    case 0:
+      width = grid_pos_col + bw - ref_pos_col;
+      height = grid_pos_row + bh - ref_pos_row;
+      break;
+    case 1:
+      width = ref_pos_col + bw - grid_pos_col;
+      height = grid_pos_row + bh - ref_pos_row;
+      break;
+    case 2:
+      width = grid_pos_col + bw - ref_pos_col;
+      height = ref_pos_row + bh - grid_pos_row;
+      break;
+    case 3:
+      width = ref_pos_col + bw - grid_pos_col;
+      height = ref_pos_row + bh - grid_pos_row;
+      break;
+    default: assert(0);
+  }
+
+  return width * height;
+}
+
+static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+                               int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+  TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
+  TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
+  MV mv = tpl_stats->mv.as_mv;
+  int mv_row = mv.row >> 3;
+  int mv_col = mv.col >> 3;
+
+  int ref_pos_row = mi_row * MI_SIZE + mv_row;
+  int ref_pos_col = mi_col * MI_SIZE + mv_col;
+
+  const int bw = 4 << mi_size_wide_log2[bsize];
+  const int bh = 4 << mi_size_high_log2[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int mi_width = mi_size_wide[bsize];
+  const int pix_num = bw * bh;
+
+  // top-left on grid block location in pixel
+  int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
+  int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
+  int block;
+
+  for (block = 0; block < 4; ++block) {
+    int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
+    int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
+
+    if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
+        grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
+      int overlap_area = get_overlap_area(
+          grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
+      int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
+      int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
+
+      int64_t mc_flow = tpl_stats->mc_dep_cost -
+                        (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
+                            tpl_stats->intra_cost;
+
+      int idx, idy;
+
+      for (idy = 0; idy < mi_height; ++idy) {
+        for (idx = 0; idx < mi_width; ++idx) {
+          TplDepStats *des_stats =
+              &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
+                         (ref_mi_col + idx)];
+
+          des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
+          assert(overlap_area >= 0);
+        }
+      }
+    }
+  }
+}
+
+static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+                             int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+  int idx, idy;
+  const int mi_height = mi_size_high[bsize];
+  const int mi_width = mi_size_wide[bsize];
+
+  for (idy = 0; idy < mi_height; ++idy) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      TplDepStats *tpl_ptr =
+          &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
+      tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
+                         BLOCK_4X4);
+    }
+  }
+}
+
+static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize, int stride,
+                            const TplDepStats *src_stats) {
+  const int mi_height = mi_size_high[bsize];
+  const int mi_width = mi_size_wide[bsize];
+  int idx, idy;
+
+  int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
+  int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
+
+  TplDepStats *tpl_ptr;
+
+  intra_cost = AOMMAX(1, intra_cost);
+  inter_cost = AOMMAX(1, inter_cost);
+
+  for (idy = 0; idy < mi_height; ++idy) {
+    tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col];
+    for (idx = 0; idx < mi_width; ++idx) {
+      tpl_ptr->intra_cost = intra_cost;
+      tpl_ptr->inter_cost = inter_cost;
+      tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
+      tpl_ptr->ref_frame_index = src_stats->ref_frame_index;
+      tpl_ptr->mv.as_int = src_stats->mv.as_int;
+      ++tpl_ptr;
+    }
+  }
+}
+
+static void mc_flow_dispenser(AV1_COMP *cpi, GF_PICTURE *gf_picture,
+                              int frame_idx) {
+  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+  YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
+  YV12_BUFFER_CONFIG *ref_frame[7] = {
+    NULL, NULL, NULL, NULL, NULL, NULL, NULL
+  };
+
+  AV1_COMMON *cm = &cpi->common;
+  struct scale_factors sf;
+  int rdmult, idx;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int mi_row, mi_col;
+
+  DECLARE_ALIGNED(32, uint16_t, predictor16[32 * 32 * 3]);
+  DECLARE_ALIGNED(32, uint8_t, predictor8[32 * 32 * 3]);
+  uint8_t *predictor;
+  DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+
+  const BLOCK_SIZE bsize = BLOCK_32X32;
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int mi_width = mi_size_wide[bsize];
+
+  // Setup scaling factor
+  av1_setup_scale_factors_for_frame(
+      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+      this_frame->y_crop_width, this_frame->y_crop_height);
+
+  if (is_cur_buf_hbd(xd))
+    predictor = CONVERT_TO_BYTEPTR(predictor16);
+  else
+    predictor = predictor8;
+
+  // Prepare reference frame pointers. If any reference frame slot is
+  // unavailable, the pointer will be set to Null.
+  for (idx = 0; idx < 7; ++idx) {
+    int rf_idx = gf_picture[frame_idx].ref_frame[idx];
+    if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
+  }
+
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+  xd->cur_buf = this_frame;
+
+  // Get rd multiplier set up.
+  rdmult = (int)av1_compute_rd_mult(cpi, tpl_frame->base_qindex);
+  if (rdmult < 1) rdmult = 1;
+  set_error_per_bit(x, rdmult);
+  av1_initialize_me_consts(cpi, x, tpl_frame->base_qindex);
+
+  tpl_frame->is_valid = 1;
+
+  cm->base_qindex = tpl_frame->base_qindex;
+  av1_frame_init_quantizer(cpi);
+
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+    // Motion estimation row boundary
+    x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
+    x->mv_limits.row_max =
+        (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * AOM_INTERP_EXTEND);
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+      TplDepStats tpl_stats;
+      mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, src_diff, coeff,
+                      qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size,
+                      ref_frame, predictor, &tpl_stats);
+
+      // Motion flow dependency dispenser.
+      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+                      tpl_frame->stride, &tpl_stats);
+
+      tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
+                       bsize);
+    }
+  }
+}
+
+static void init_gop_frames(AV1_COMP *cpi, GF_PICTURE *gf_picture,
+                            const GF_GROUP *gf_group, int *tpl_group_frames,
+                            const EncodeFrameInput *const frame_input) {
+  AV1_COMMON *cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  int frame_idx = 0;
+  int i;
+  int gld_index = -1;
+  int alt_index = -1;
+  int lst_index = -1;
+  int extend_frame_count = 0;
+  int pframe_qindex = cpi->tpl_stats[2].base_qindex;
+
+  RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
+  int recon_frame_index[INTER_REFS_PER_FRAME + 1] = { -1, -1, -1, -1,
+                                                      -1, -1, -1, -1 };
+
+  // TODO(jingning): To be used later for gf frame type parsing.
+  (void)gf_group;
+
+  for (i = 0; i < FRAME_BUFFERS && frame_idx < INTER_REFS_PER_FRAME + 1; ++i) {
+    if (frame_bufs[i].ref_count == 0) {
+      alloc_frame_mvs(cm, &frame_bufs[i]);
+      if (aom_realloc_frame_buffer(
+              &frame_bufs[i].buf, cm->width, cm->height,
+              seq_params->subsampling_x, seq_params->subsampling_y,
+              seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+              cm->byte_alignment, NULL, NULL, NULL))
+        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate frame buffer");
+
+      recon_frame_index[frame_idx] = i;
+      ++frame_idx;
+    }
+  }
+
+  for (i = 0; i < INTER_REFS_PER_FRAME + 1; ++i) {
+    assert(recon_frame_index[i] >= 0);
+    cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
+  }
+
+  *tpl_group_frames = 0;
+
+  // Initialize Golden reference frame.
+  gf_picture[0].frame = NULL;
+  RefCntBuffer *ref_buf = get_ref_frame_buf(cm, GOLDEN_FRAME);
+  if (ref_buf) gf_picture[0].frame = &ref_buf->buf;
+  for (i = 0; i < 7; ++i) gf_picture[0].ref_frame[i] = -1;
+  gld_index = 0;
+  ++*tpl_group_frames;
+
+  // Initialize ARF frame
+  gf_picture[1].frame = frame_input->source;
+  gf_picture[1].ref_frame[0] = gld_index;
+  gf_picture[1].ref_frame[1] = lst_index;
+  gf_picture[1].ref_frame[2] = alt_index;
+  // TODO(yuec) Need o  figure out full AV1 reference model
+  for (i = 3; i < 7; ++i) gf_picture[1].ref_frame[i] = -1;
+  alt_index = 1;
+  ++*tpl_group_frames;
+
+  // Initialize P frames
+  for (frame_idx = 2; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
+    struct lookahead_entry *buf =
+        av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
+
+    if (buf == NULL) break;
+
+    gf_picture[frame_idx].frame = &buf->img;
+    gf_picture[frame_idx].ref_frame[0] = gld_index;
+    gf_picture[frame_idx].ref_frame[1] = lst_index;
+    gf_picture[frame_idx].ref_frame[2] = alt_index;
+    for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
+
+    ++*tpl_group_frames;
+    lst_index = frame_idx;
+
+    if (frame_idx == cpi->rc.baseline_gf_interval + 1) break;
+  }
+
+  gld_index = frame_idx;
+  lst_index = AOMMAX(0, frame_idx - 1);
+  alt_index = -1;
+  ++frame_idx;
+
+  // Extend two frames outside the current gf group.
+  for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
+    struct lookahead_entry *buf =
+        av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
+
+    if (buf == NULL) break;
+
+    cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
+
+    gf_picture[frame_idx].frame = &buf->img;
+    gf_picture[frame_idx].ref_frame[0] = gld_index;
+    gf_picture[frame_idx].ref_frame[1] = lst_index;
+    gf_picture[frame_idx].ref_frame[2] = alt_index;
+    for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
+    lst_index = frame_idx;
+    ++*tpl_group_frames;
+    ++extend_frame_count;
+  }
+}
+
+static void init_tpl_stats(AV1_COMP *cpi) {
+  int frame_idx;
+  for (frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
+    TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+    memset(tpl_frame->tpl_stats_ptr, 0,
+           tpl_frame->height * tpl_frame->width *
+               sizeof(*tpl_frame->tpl_stats_ptr));
+    tpl_frame->is_valid = 0;
+  }
+}
+
+void av1_tpl_setup_stats(AV1_COMP *cpi,
+                         const EncodeFrameInput *const frame_input) {
+  GF_PICTURE gf_picture[MAX_LAG_BUFFERS];
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  int tpl_group_frames = 0;
+  int frame_idx;
+
+  init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames, frame_input);
+
+  init_tpl_stats(cpi);
+
+  // Backward propagation from tpl_group_frames to 1.
+  for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx)
+    mc_flow_dispenser(cpi, gf_picture, frame_idx);
+}
diff --git a/libaom/av1/encoder/tpl_model.h b/libaom/av1/encoder/tpl_model.h
new file mode 100644
index 0000000..f6b33b0
--- /dev/null
+++ b/libaom/av1/encoder/tpl_model.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TPL_MODEL_H_
+#define AOM_AV1_ENCODER_TPL_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_tpl_setup_stats(AV1_COMP *cpi,
+                         const EncodeFrameInput *const frame_input);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_TPL_MODEL_H_
diff --git a/libaom/av1/encoder/var_based_part.c b/libaom/av1/encoder/var_based_part.c
new file mode 100644
index 0000000..3cead91
--- /dev/null
+++ b/libaom/av1/encoder/var_based_part.c
@@ -0,0 +1,778 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/reconinter_enc.h"
+
+extern const uint8_t AV1_VAR_OFFS[];
+
+typedef struct {
+  // TODO(kyslov): consider changing to 64bit
+
+  // This struct is used for computing variance in choose_partitioning(), where
+  // the max number of samples within a superblock is 32x32 (with 4x4 avg).
+  // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32
+  // * 32 = 2^26). For high bitdepth we need to consider changing this to 64 bit
+  uint32_t sum_square_error;
+  int32_t sum_error;
+  int log2_count;
+  int variance;
+} var;
+
+typedef struct {
+  var none;
+  var horz[2];
+  var vert[2];
+} partition_variance;
+
+typedef struct {
+  partition_variance part_variances;
+  var split[4];
+} v4x4;
+
+typedef struct {
+  partition_variance part_variances;
+  v4x4 split[4];
+} v8x8;
+
+typedef struct {
+  partition_variance part_variances;
+  v8x8 split[4];
+} v16x16;
+
+typedef struct {
+  partition_variance part_variances;
+  v16x16 split[4];
+} v32x32;
+
+typedef struct {
+  partition_variance part_variances;
+  v32x32 split[4];
+} v64x64;
+
+typedef struct {
+  partition_variance part_variances;
+  v64x64 split[4];
+} v128x128;
+
+typedef struct {
+  partition_variance *part_variances;
+  var *split[4];
+} variance_node;
+
+static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
+  int i;
+  node->part_variances = NULL;
+  switch (bsize) {
+    case BLOCK_128X128: {
+      v128x128 *vt = (v128x128 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_64X64: {
+      v64x64 *vt = (v64x64 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_32X32: {
+      v32x32 *vt = (v32x32 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_16X16: {
+      v16x16 *vt = (v16x16 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_8X8: {
+      v8x8 *vt = (v8x8 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    default: {
+      v4x4 *vt = (v4x4 *)data;
+      assert(bsize == BLOCK_4X4);
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
+      break;
+    }
+  }
+}
+
+// Set variance values given sum square error, sum error, count.
+static void fill_variance(uint32_t s2, int32_t s, int c, var *v) {
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->log2_count = c;
+}
+
+static void get_variance(var *v) {
+  v->variance =
+      (int)(256 * (v->sum_square_error -
+                   (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
+                              v->log2_count)) >>
+            v->log2_count);
+}
+
+static void sum_2_variances(const var *a, const var *b, var *r) {
+  assert(a->log2_count == b->log2_count);
+  fill_variance(a->sum_square_error + b->sum_square_error,
+                a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
+static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
+  variance_node node;
+  memset(&node, 0, sizeof(node));
+  tree_to_node(data, bsize, &node);
+  sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
+  sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
+  sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
+  sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
+  sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
+                  &node.part_variances->none);
+}
+
+static void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x,
+                           MACROBLOCKD *const xd, int mi_row, int mi_col,
+                           BLOCK_SIZE bsize) {
+  if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
+    set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+    xd->mi[0]->sb_type = bsize;
+  }
+}
+
+static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
+                               MACROBLOCKD *const xd,
+                               const TileInfo *const tile, void *data,
+                               BLOCK_SIZE bsize, int mi_row, int mi_col,
+                               int64_t threshold, BLOCK_SIZE bsize_min,
+                               int force_split) {
+  AV1_COMMON *const cm = &cpi->common;
+  variance_node vt;
+  const int block_width = mi_size_wide[bsize];
+  const int block_height = mi_size_high[bsize];
+
+  assert(block_height == block_width);
+  tree_to_node(data, bsize, &vt);
+
+  if (force_split == 1) return 0;
+
+  if (mi_col + block_width > tile->mi_col_end ||
+      mi_row + block_height > tile->mi_row_end)
+    return 0;
+
+  // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
+  // variance is below threshold, otherwise split will be selected.
+  // No check for vert/horiz split as too few samples for variance.
+  if (bsize == bsize_min) {
+    // Variance already computed to set the force_split.
+    if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+    return 0;
+  } else if (bsize > bsize_min) {
+    // Variance already computed to set the force_split.
+    if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+    // For key frame: take split for bsize above 32X32 or very high variance.
+    if (frame_is_intra_only(cm) &&
+        (bsize > BLOCK_32X32 ||
+         vt.part_variances->none.variance > (threshold << 4))) {
+      return 0;
+    }
+    // If variance is low, take the bsize (no split).
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+
+    // Check vertical split.
+    if (mi_row + block_height / 2 < cm->mi_rows) {
+      BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
+      get_variance(&vt.part_variances->vert[0]);
+      get_variance(&vt.part_variances->vert[1]);
+      if (vt.part_variances->vert[0].variance < threshold &&
+          vt.part_variances->vert[1].variance < threshold &&
+          get_plane_block_size(subsize, xd->plane[1].subsampling_x,
+                               xd->plane[1].subsampling_y) < BLOCK_INVALID) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
+        return 1;
+      }
+    }
+    // Check horizontal split.
+    if (mi_col + block_width / 2 < cm->mi_cols) {
+      BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+      get_variance(&vt.part_variances->horz[0]);
+      get_variance(&vt.part_variances->horz[1]);
+      if (vt.part_variances->horz[0].variance < threshold &&
+          vt.part_variances->horz[1].variance < threshold &&
+          get_plane_block_size(subsize, xd->plane[1].subsampling_x,
+                               xd->plane[1].subsampling_y) < BLOCK_INVALID) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
+        return 1;
+      }
+    }
+
+    return 0;
+  }
+  return 0;
+}
+
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x16_idx, int y16_idx, v16x16 *vst,
+                                 int pixels_wide, int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    int x8_idx = x16_idx + ((k & 1) << 3);
+    int y8_idx = y16_idx + ((k >> 1) << 3);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+      int s_avg;
+      int d_avg = 128;
+      s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+      if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+
+static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
+                              int dp, int x16_idx, int y16_idx, int pixels_wide,
+                              int pixels_high) {
+  int k;
+  int minmax_max = 0;
+  int minmax_min = 255;
+  // Loop over the 4 8x8 subblocks.
+  for (k = 0; k < 4; k++) {
+    int x8_idx = x16_idx + ((k & 1) << 3);
+    int y8_idx = y16_idx + ((k >> 1) << 3);
+    int min = 0;
+    int max = 0;
+    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+      aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp,
+                     &min, &max);
+      if ((max - min) > minmax_max) minmax_max = (max - min);
+      if ((max - min) < minmax_min) minmax_min = (max - min);
+    }
+  }
+  return (minmax_max - minmax_min);
+}
+
+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x8_idx, int y8_idx, v8x8 *vst,
+                                 int pixels_wide, int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    int x4_idx = x8_idx + ((k & 1) << 2);
+    int y4_idx = y8_idx + ((k >> 1) << 2);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+      int s_avg;
+      int d_avg = 128;
+      s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+      if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+
+static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
+                                         int width, int height,
+                                         int content_state) {
+  if (speed >= 8) {
+    if (width <= 640 && height <= 480)
+      return (5 * threshold_base) >> 2;
+    else if ((content_state == kLowSadLowSumdiff) ||
+             (content_state == kHighSadLowSumdiff) ||
+             (content_state == kLowVarHighSumdiff))
+      return (5 * threshold_base) >> 2;
+  } else if (speed == 7) {
+    if ((content_state == kLowSadLowSumdiff) ||
+        (content_state == kHighSadLowSumdiff) ||
+        (content_state == kLowVarHighSumdiff)) {
+      return (5 * threshold_base) >> 2;
+    }
+  }
+  return threshold_base;
+}
+
+// Set the variance split thresholds for following the block sizes:
+// 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
+// 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition) is
+// currently only used on key frame.
+static void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], int q,
+                               int content_state) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int is_key_frame = frame_is_intra_only(cm);
+  const int threshold_multiplier = is_key_frame ? 40 : 1;
+  int64_t threshold_base =
+      (int64_t)(threshold_multiplier * cpi->dequants.y_dequant_QTX[q][1]);
+
+  if (is_key_frame) {
+    thresholds[0] = threshold_base;
+    thresholds[1] = threshold_base;
+    thresholds[2] = threshold_base >> 2;
+    thresholds[3] = threshold_base >> 2;
+    thresholds[4] = threshold_base << 2;
+  } else {
+    // Increase base variance threshold based on content_state/sum_diff level.
+    threshold_base = scale_part_thresh_sumdiff(
+        threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state);
+
+    thresholds[1] = threshold_base;
+    thresholds[3] = threshold_base << cpi->oxcf.speed;
+    if (cm->width >= 1280 && cm->height >= 720)
+      thresholds[3] = thresholds[3] << 1;
+    if (cm->width <= 352 && cm->height <= 288) {
+      thresholds[1] = threshold_base >> 3;
+      thresholds[2] = threshold_base >> 1;
+      thresholds[3] = threshold_base << 3;
+    } else if (cm->width < 1280 && cm->height < 720) {
+      thresholds[2] = (5 * threshold_base) >> 2;
+    } else if (cm->width < 1920 && cm->height < 1080) {
+      thresholds[2] = threshold_base << 1;
+      thresholds[3] <<= 2;
+    } else {
+      thresholds[2] = (5 * threshold_base) >> 1;
+    }
+  }
+}
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+                                           int content_state) {
+  AV1_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  const int is_key_frame = frame_is_intra_only(cm);
+  if (sf->partition_search_type != VAR_BASED_PARTITION) {
+    return;
+  } else {
+    set_vbp_thresholds(cpi, cpi->vbp_thresholds, q, content_state);
+    // The thresholds below are not changed locally.
+    if (is_key_frame) {
+      cpi->vbp_threshold_sad = 0;
+      cpi->vbp_threshold_copy = 0;
+      cpi->vbp_bsize_min = BLOCK_8X8;
+    } else {
+      if (cm->width <= 352 && cm->height <= 288)
+        cpi->vbp_threshold_sad = 10;
+      else
+        cpi->vbp_threshold_sad = (cpi->dequants.y_dequant_QTX[q][1] << 1) > 1000
+                                     ? (cpi->dequants.y_dequant_QTX[q][1] << 1)
+                                     : 1000;
+      cpi->vbp_bsize_min = BLOCK_16X16;
+      if (cm->width <= 352 && cm->height <= 288)
+        cpi->vbp_threshold_copy = 4000;
+      else if (cm->width <= 640 && cm->height <= 360)
+        cpi->vbp_threshold_copy = 8000;
+      else
+        cpi->vbp_threshold_copy =
+            (cpi->dequants.y_dequant_QTX[q][1] << 3) > 8000
+                ? (cpi->dequants.y_dequant_QTX[q][1] << 3)
+                : 8000;
+    }
+    cpi->vbp_threshold_minmax = 15 + (q >> 3);
+  }
+}
+
+// This function chooses partitioning based on the variance between source and
+// reconstructed last, where variance is computed for down-sampled inputs.
+// TODO(kyslov): lot of things. Bring back noise estimation, brush up partition
+// selection and most of all - retune the thresholds
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                      MACROBLOCK *x, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int i, j, k, m;
+  v128x128 *vt;
+  v16x16 *vt2 = NULL;
+  unsigned char force_split[85];
+  int avg_32x32;
+  int max_var_32x32 = 0;
+  int min_var_32x32 = INT_MAX;
+  int var_32x32;
+  int var_64x64;
+  int min_var_64x64 = INT_MAX;
+  int max_var_64x64 = 0;
+  int avg_16x16[4];
+  int maxvar_16x16[4];
+  int minvar_16x16[4];
+  int64_t threshold_4x4avg;
+  int content_state = 0;
+  uint8_t *s;
+  const uint8_t *d;
+  int sp;
+  int dp;
+  int compute_minmax_variance = 1;
+  int is_key_frame = frame_is_intra_only(cm);
+  int pixels_wide = 128, pixels_high = 128;
+  assert(cm->seq_params.sb_size == BLOCK_64X64 ||
+         cm->seq_params.sb_size == BLOCK_128X128);
+  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  const int num_64x64_blocks = is_small_sb ? 1 : 4;
+
+  CHECK_MEM_ERROR(cm, vt, aom_calloc(1, sizeof(*vt)));
+
+  int64_t thresholds[5] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+                            cpi->vbp_thresholds[2], cpi->vbp_thresholds[3],
+                            cpi->vbp_thresholds[4] };
+
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  int variance4x4downsample[64];
+  int segment_id;
+  const int num_planes = av1_num_planes(cm);
+
+  segment_id = xd->mi[0]->segment_id;
+
+  set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state);
+
+  if (is_small_sb) {
+    pixels_wide = 64;
+    pixels_high = 64;
+  }
+
+  // For non keyframes, disable 4x4 average for low resolution when speed = 8
+  threshold_4x4avg = INT64_MAX;
+
+  if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
+  if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+
+  s = x->plane[0].src.buf;
+  sp = x->plane[0].src.stride;
+
+  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+  // 5-20 for the 16x16 blocks.
+  force_split[0] = 0;
+
+  if (!is_key_frame) {
+    // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
+    // is!!
+    MB_MODE_INFO *mi = xd->mi[0];
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+
+    assert(yv12 != NULL);
+
+    av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                         get_ref_scale_factors(cm, LAST_FRAME), num_planes);
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE_FRAME;
+    mi->sb_type = cm->seq_params.sb_size;
+    mi->mv[0].as_int = 0;
+    mi->interp_filters = av1_make_interp_filters(BILINEAR, BILINEAR);
+    if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
+      const MV dummy_mv = { 0, 0 };
+      av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size, mi_row,
+                                    mi_col, &dummy_mv);
+    }
+
+// TODO(kyslov): bring the small SAD functionality back
+#if 0
+    y_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride,
+                                   xd->plane[0].pre[0].buf,
+                                   xd->plane[0].pre[0].stride);
+#endif
+    x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
+                                  cm->seq_params.sb_size, AOM_PLANE_Y,
+                                  AOM_PLANE_Y);
+
+    d = xd->plane[0].dst.buf;
+    dp = xd->plane[0].dst.stride;
+
+    // If the y_sad is very small, take 64x64 as partition and exit.
+    // Don't check on boosted segment for now, as 64x64 is suppressed there.
+#if 0
+        if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad)
+       { const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64]; const
+       int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64]; if (mi_col +
+       block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows)
+       { set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_128X128);
+            x->variance_low[0] = 1;
+            return 0;
+          }
+        }
+#endif
+  } else {
+    d = AV1_VAR_OFFS;
+    dp = 0;
+  }
+
+  if (low_res && threshold_4x4avg < INT64_MAX)
+    CHECK_MEM_ERROR(cm, vt2, aom_calloc(64, sizeof(*vt2)));
+  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+  // for splits.
+  for (m = 0; m < num_64x64_blocks; m++) {
+    const int x64_idx = ((m & 1) << 6);
+    const int y64_idx = ((m >> 1) << 6);
+    const int m2 = m << 2;
+    force_split[m + 1] = 0;
+    for (i = 0; i < 4; i++) {
+      const int x32_idx = x64_idx + ((i & 1) << 5);
+      const int y32_idx = y64_idx + ((i >> 1) << 5);
+      const int i2 = (m2 + i) << 2;
+      force_split[5 + m2 + i] = 0;
+      avg_16x16[i] = 0;
+      maxvar_16x16[i] = 0;
+      minvar_16x16[i] = INT_MAX;
+      for (j = 0; j < 4; j++) {
+        const int x16_idx = x32_idx + ((j & 1) << 4);
+        const int y16_idx = y32_idx + ((j >> 1) << 4);
+        const int split_index = 21 + i2 + j;
+        v16x16 *vst = &vt->split[m].split[i].split[j];
+        force_split[split_index] = 0;
+        variance4x4downsample[i2 + j] = 0;
+        if (!is_key_frame) {
+          fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst, pixels_wide,
+                               pixels_high, is_key_frame);
+          fill_variance_tree(&vt->split[m].split[i].split[j], BLOCK_16X16);
+          get_variance(&vt->split[m].split[i].split[j].part_variances.none);
+          avg_16x16[i] +=
+              vt->split[m].split[i].split[j].part_variances.none.variance;
+          if (vt->split[m].split[i].split[j].part_variances.none.variance <
+              minvar_16x16[i])
+            minvar_16x16[i] =
+                vt->split[m].split[i].split[j].part_variances.none.variance;
+          if (vt->split[m].split[i].split[j].part_variances.none.variance >
+              maxvar_16x16[i])
+            maxvar_16x16[i] =
+                vt->split[m].split[i].split[j].part_variances.none.variance;
+          if (vt->split[m].split[i].split[j].part_variances.none.variance >
+              thresholds[3]) {
+            // 16X16 variance is above threshold for split, so force split to
+            // 8x8 for this 16x16 block (this also forces splits for upper
+            // levels).
+            force_split[split_index] = 1;
+            force_split[5 + m2 + i] = 1;
+            force_split[m + 1] = 1;
+            force_split[0] = 1;
+          } else if (compute_minmax_variance &&
+                     vt->split[m]
+                             .split[i]
+                             .split[j]
+                             .part_variances.none.variance > thresholds[2] &&
+                     !cyclic_refresh_segment_id_boosted(segment_id)) {
+            // We have some nominal amount of 16x16 variance (based on average),
+            // compute the minmax over the 8x8 sub-blocks, and if above
+            // threshold, force split to 8x8 block for this 16x16 block.
+            int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+                                            pixels_wide, pixels_high);
+            int thresh_minmax = (int)cpi->vbp_threshold_minmax;
+            if (minmax > thresh_minmax) {
+              force_split[split_index] = 1;
+              force_split[5 + m2 + i] = 1;
+              force_split[m + 1] = 1;
+              force_split[0] = 1;
+            }
+          }
+        }
+        if (is_key_frame) {
+          force_split[split_index] = 0;
+          // Go down to 4x4 down-sampling for variance.
+          variance4x4downsample[i2 + j] = 1;
+          for (k = 0; k < 4; k++) {
+            int x8_idx = x16_idx + ((k & 1) << 3);
+            int y8_idx = y16_idx + ((k >> 1) << 3);
+            v8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
+            fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+                                 pixels_wide, pixels_high, is_key_frame);
+          }
+        }
+      }
+    }
+  }
+
+  // Fill the rest of the variance tree by summing split partition values.
+  for (m = 0; m < num_64x64_blocks; ++m) {
+    avg_32x32 = 0;
+    const int m2 = m << 2;
+    for (i = 0; i < 4; i++) {
+      const int i2 = (m2 + i) << 2;
+      for (j = 0; j < 4; j++) {
+        const int split_index = 21 + i2 + j;
+        if (variance4x4downsample[i2 + j] == 1) {
+          v16x16 *vtemp =
+              (!is_key_frame) ? &vt2[i2 + j] : &vt->split[m].split[i].split[j];
+          for (k = 0; k < 4; k++)
+            fill_variance_tree(&vtemp->split[k], BLOCK_8X8);
+          fill_variance_tree(vtemp, BLOCK_16X16);
+          // If variance of this 16x16 block is above the threshold, force block
+          // to split. This also forces a split on the upper levels.
+          get_variance(&vtemp->part_variances.none);
+          if (vtemp->part_variances.none.variance > thresholds[3]) {
+            force_split[split_index] = 1;
+            force_split[5 + m2 + i] = 1;
+            force_split[m + 1] = 1;
+            force_split[0] = 1;
+          }
+        }
+      }
+      fill_variance_tree(&vt->split[m].split[i], BLOCK_32X32);
+      // If variance of this 32x32 block is above the threshold, or if its above
+      // (some threshold of) the average variance over the sub-16x16 blocks,
+      // then force this block to split. This also forces a split on the upper
+      // (64x64) level.
+      if (!force_split[5 + m2 + i]) {
+        get_variance(&vt->split[m].split[i].part_variances.none);
+        var_32x32 = vt->split[m].split[i].part_variances.none.variance;
+        max_var_32x32 = AOMMAX(var_32x32, max_var_32x32);
+        min_var_32x32 = AOMMIN(var_32x32, min_var_32x32);
+        if (vt->split[m].split[i].part_variances.none.variance >
+                thresholds[2] ||
+            (!is_key_frame &&
+             vt->split[m].split[i].part_variances.none.variance >
+                 (thresholds[2] >> 1) &&
+             vt->split[m].split[i].part_variances.none.variance >
+                 (avg_16x16[i] >> 1))) {
+          force_split[5 + m2 + i] = 1;
+          force_split[m + 1] = 1;
+          force_split[0] = 1;
+        } else if (!is_key_frame && cm->height <= 360 &&
+                   (maxvar_16x16[i] - minvar_16x16[i]) > (thresholds[2] >> 1) &&
+                   maxvar_16x16[i] > thresholds[2]) {
+          force_split[5 + m2 + i] = 1;
+          force_split[m + 1] = 1;
+          force_split[0] = 1;
+        }
+        avg_32x32 += var_32x32;
+      }
+    }
+    if (!force_split[1 + m]) {
+      fill_variance_tree(&vt->split[m], BLOCK_64X64);
+      get_variance(&vt->split[m].part_variances.none);
+      var_64x64 = vt->split[m].part_variances.none.variance;
+      max_var_64x64 = AOMMAX(var_64x64, max_var_64x64);
+      min_var_64x64 = AOMMIN(var_64x64, min_var_64x64);
+      // If variance of this 64x64 block is above (some threshold of) the
+      // average variance over the sub-32x32 blocks, then force this block to
+      // split. Only checking this for noise level >= medium for now.
+
+      if (!is_key_frame &&
+          (max_var_32x32 - min_var_32x32) > 3 * (thresholds[1] >> 3) &&
+          max_var_32x32 > thresholds[1] >> 1)
+        force_split[1 + m] = 1;
+    }
+    if (is_small_sb) force_split[0] = 1;
+  }
+
+  if (!force_split[0]) {
+    fill_variance_tree(vt, BLOCK_128X128);
+    get_variance(&vt->part_variances.none);
+    if (!is_key_frame &&
+        (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) &&
+        max_var_64x64 > thresholds[0] >> 1)
+      force_split[0] = 1;
+  }
+
+  if (!set_vt_partitioning(cpi, x, xd, tile, vt, BLOCK_128X128, mi_row, mi_col,
+                           thresholds[0], BLOCK_16X16, force_split[0])) {
+    for (m = 0; m < num_64x64_blocks; ++m) {
+      const int x64_idx = ((m & 1) << 4);
+      const int y64_idx = ((m >> 1) << 4);
+      const int m2 = m << 2;
+
+      // Now go through the entire structure, splitting every block size until
+      // we get to one that's got a variance lower than our threshold.
+      if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m], BLOCK_64X64,
+                               mi_row + y64_idx, mi_col + x64_idx,
+                               thresholds[1], BLOCK_16X16,
+                               force_split[1 + m])) {
+        for (i = 0; i < 4; ++i) {
+          const int x32_idx = ((i & 1) << 3);
+          const int y32_idx = ((i >> 1) << 3);
+          const int i2 = (m2 + i) << 2;
+          if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m].split[i],
+                                   BLOCK_32X32, (mi_row + y64_idx + y32_idx),
+                                   (mi_col + x64_idx + x32_idx), thresholds[2],
+                                   BLOCK_16X16, force_split[5 + m2 + i])) {
+            for (j = 0; j < 4; ++j) {
+              const int x16_idx = ((j & 1) << 2);
+              const int y16_idx = ((j >> 1) << 2);
+              const int split_index = 21 + i2 + j;
+              // For inter frames: if variance4x4downsample[] == 1 for this
+              // 16x16 block, then the variance is based on 4x4 down-sampling,
+              // so use vt2 in set_vt_partioning(), otherwise use vt.
+              v16x16 *vtemp =
+                  (!is_key_frame && variance4x4downsample[i2 + j] == 1)
+                      ? &vt2[i2 + j]
+                      : &vt->split[m].split[i].split[j];
+              if (!set_vt_partitioning(cpi, x, xd, tile, vtemp, BLOCK_16X16,
+                                       mi_row + y64_idx + y32_idx + y16_idx,
+                                       mi_col + x64_idx + x32_idx + x16_idx,
+                                       thresholds[3], BLOCK_8X8,
+                                       force_split[split_index])) {
+                for (k = 0; k < 4; ++k) {
+                  const int x8_idx = (k & 1) << 1;
+                  const int y8_idx = (k >> 1) << 1;
+                  set_block_size(
+                      cpi, x, xd,
+                      (mi_row + y64_idx + y32_idx + y16_idx + y8_idx),
+                      (mi_col + x64_idx + x32_idx + x16_idx + x8_idx),
+                      BLOCK_8X8);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (vt2) aom_free(vt2);
+  if (vt) aom_free(vt);
+  return 0;
+}
diff --git a/libaom/av1/encoder/var_based_part.h b/libaom/av1/encoder/var_based_part.h
new file mode 100644
index 0000000..c355224
--- /dev/null
+++ b/libaom/av1/encoder/var_based_part.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_
+#define AOM_AV1_ENCODER_VAR_BASED_PART_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+                                           int content_state);
+
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                      MACROBLOCK *x, int mi_row, int mi_col);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_VAR_BASED_PART_H_
diff --git a/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
index 13982cc..9483063 100644
--- a/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
+++ b/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -1408,12 +1408,6 @@ static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
   output[15] = x1[0];
 }
 
-static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
-  const __m256i scale__r = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
-  const __m256i b = _mm256_madd_epi16(a, scale__r);
-  return _mm256_srai_epi32(b, NewSqrt2Bits);
-}
-
 static INLINE void fidentity16x16_new_avx2(const __m256i *input,
                                            __m256i *output, int8_t cos_bit) {
   (void)cos_bit;
@@ -1997,6 +1991,794 @@ static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
   }
 }
 
+static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0,
+                               __m256i *in1, __m128i *out0, __m128i *out1,
+                               __m128i *out2, __m128i *out3,
+                               const __m256i *__rounding, int8_t *cos_bit) {
+  __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
+  __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
+  __m256i u0 = _mm256_madd_epi16(t0, *w0);
+  __m256i u1 = _mm256_madd_epi16(t1, *w0);
+  __m256i v0 = _mm256_madd_epi16(t0, *w1);
+  __m256i v1 = _mm256_madd_epi16(t1, *w1);
+
+  __m256i a0 = _mm256_add_epi32(u0, *__rounding);
+  __m256i a1 = _mm256_add_epi32(u1, *__rounding);
+  __m256i b0 = _mm256_add_epi32(v0, *__rounding);
+  __m256i b1 = _mm256_add_epi32(v1, *__rounding);
+
+  __m256i c0 = _mm256_srai_epi32(a0, *cos_bit);
+  __m256i c1 = _mm256_srai_epi32(a1, *cos_bit);
+  __m256i d0 = _mm256_srai_epi32(b0, *cos_bit);
+  __m256i d1 = _mm256_srai_epi32(b1, *cos_bit);
+
+  __m256i temp0 = _mm256_packs_epi32(c0, c1);
+  __m256i temp1 = _mm256_packs_epi32(d0, d1);
+
+  *out0 = _mm256_castsi256_si128(temp0);
+  *out1 = _mm256_castsi256_si128(temp1);
+  *out2 = _mm256_extracti128_si256(temp0, 0x01);
+  *out3 = _mm256_extracti128_si256(temp1, 0x01);
+}
+
+static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
+                                    int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+
+  // stage 1
+  __m256i x1[8];
+  x1[0] = _mm256_adds_epi16(input[0], input[7]);
+  x1[7] = _mm256_subs_epi16(input[0], input[7]);
+  x1[1] = _mm256_adds_epi16(input[1], input[6]);
+  x1[6] = _mm256_subs_epi16(input[1], input[6]);
+  x1[2] = _mm256_adds_epi16(input[2], input[5]);
+  x1[5] = _mm256_subs_epi16(input[2], input[5]);
+  x1[3] = _mm256_adds_epi16(input[3], input[4]);
+  x1[4] = _mm256_subs_epi16(input[3], input[4]);
+
+  // stage 2
+  __m256i x2[8];
+  x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+  x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
+  x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+  x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
+  x2[4] = x1[4];
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
+                  cos_bit);
+  x2[5] = x1[5];
+  x2[6] = x1[6];
+  x2[7] = x1[7];
+
+  // stage 3
+  __m256i x3[8];
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
+                  cos_bit);
+  x3[0] = x2[0];
+  x3[1] = x2[1];
+  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
+                  cos_bit);
+  x3[2] = x2[2];
+  x3[3] = x2[3];
+  x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+  x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+  x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
+  x3[7] = _mm256_adds_epi16(x2[7], x2[6]);
+
+  // stage 4
+  __m256i x4[8];
+  x4[0] = x3[0];
+  x4[1] = x3[1];
+  x4[2] = x3[2];
+  x4[3] = x3[3];
+  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
+                  cos_bit);
+  x4[4] = x3[4];
+  x4[7] = x3[7];
+  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
+                  cos_bit);
+  x4[5] = x3[5];
+  x4[6] = x3[6];
+  // stage 5
+  output[0] = x4[0];
+  output[1] = x4[4];
+  output[2] = x4[2];
+  output[3] = x4[6];
+  output[4] = x4[1];
+  output[5] = x4[5];
+  output[6] = x4[3];
+  output[7] = x4[7];
+}
+
+static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
+                                     int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i __zero = _mm256_setzero_si256();
+  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+
+  // stage 1
+  __m256i x1[8];
+  x1[0] = input[0];
+  x1[1] = _mm256_subs_epi16(__zero, input[7]);
+  x1[2] = _mm256_subs_epi16(__zero, input[3]);
+  x1[3] = input[4];
+  x1[4] = _mm256_subs_epi16(__zero, input[1]);
+  x1[5] = input[6];
+  x1[6] = input[2];
+  x1[7] = _mm256_subs_epi16(__zero, input[5]);
+
+  // stage 2
+  __m256i x2[8];
+  x2[0] = x1[0];
+  x2[1] = x1[1];
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding,
+                  cos_bit);
+  x2[2] = x1[2];
+  x2[3] = x1[3];
+  x2[4] = x1[4];
+  x2[5] = x1[5];
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
+                  cos_bit);
+  x2[6] = x1[6];
+  x2[7] = x1[7];
+
+  // stage 3
+  __m256i x3[8];
+  x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
+  x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
+  x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
+  x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
+  x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
+  x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
+  x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
+  x3[7] = _mm256_subs_epi16(x2[5], x2[7]);
+
+  // stage 4
+  __m256i x4[8];
+  x4[0] = x3[0];
+  x4[1] = x3[1];
+  x4[2] = x3[2];
+  x4[3] = x3[3];
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
+                  cos_bit);
+  x4[4] = x3[4];
+  x4[5] = x3[5];
+  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
+                  cos_bit);
+  x4[6] = x3[6];
+  x4[7] = x3[7];
+
+  // stage 5
+  __m256i x5[8];
+  x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
+  x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
+  x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
+  x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
+  x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
+  x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
+  x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
+  x5[7] = _mm256_subs_epi16(x4[3], x4[7]);
+
+  // stage 6
+  __m256i x6[8];
+  btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
+                  cos_bit);
+  x6[0] = x5[0];
+  x6[1] = x5[1];
+  btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
+                  cos_bit);
+  x6[2] = x5[2];
+  x6[3] = x5[3];
+  btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
+                  cos_bit);
+  x6[4] = x5[4];
+  x6[5] = x5[5];
+  btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
+                  cos_bit);
+  x6[6] = x5[6];
+  x6[7] = x5[7];
+
+  // stage 7
+  output[0] = x6[1];
+  output[1] = x6[6];
+  output[2] = x6[3];
+  output[3] = x6[4];
+  output[4] = x6[5];
+  output[5] = x6[2];
+  output[6] = x6[7];
+  output[7] = x6[0];
+}
+
+static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
+                                         int8_t cos_bit) {
+  (void)cos_bit;
+
+  output[0] = _mm256_adds_epi16(input[0], input[0]);
+  output[1] = _mm256_adds_epi16(input[1], input[1]);
+  output[2] = _mm256_adds_epi16(input[2], input[2]);
+  output[3] = _mm256_adds_epi16(input[3], input[3]);
+  output[4] = _mm256_adds_epi16(input[4], input[4]);
+  output[5] = _mm256_adds_epi16(input[5], input[5]);
+  output[6] = _mm256_adds_epi16(input[6], input[6]);
+  output[7] = _mm256_adds_epi16(input[7], input[7]);
+}
+
+static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
+                                     int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+  __m128i temp0, temp1, temp2, temp3;
+  __m256i in0, in1;
+  __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+  __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+  __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+  __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+  __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+  __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+  __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+  __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+  __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+  __m256i cospi_arr[12];
+
+  cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
+                                         cospi_m32_p32, 0x1);
+  cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+                                         cospi_p32_p32, 0x1);
+  cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+                                         cospi_p48_p16, 0x1);
+  cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+                                         cospi_m16_p48, 0x1);
+  cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
+                                         cospi_m48_m16, 0x1);
+  cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
+                                         cospi_m16_p48, 0x1);
+  cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
+                                         cospi_p24_p40, 0x1);
+  cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
+                                         cospi_m40_p24, 0x1);
+  cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
+                                         cospi_p28_p36, 0x1);
+  cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
+                                         cospi_m36_p28, 0x1);
+  cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
+                                          cospi_p12_p52, 0x1);
+  cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
+                                          cospi_m52_p12, 0x1);
+
+  __m256i x[8];
+  x[0] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);
+  x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
+                                 0x1);
+  x[2] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);
+  x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
+                                 0x1);
+  x[4] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);
+  x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
+                                 0x1);
+  x[6] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);
+  x[7] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);
+
+  // stage 1
+  __m256i x1[8];
+  x1[0] = _mm256_adds_epi16(x[0], x[1]);
+  x1[7] = _mm256_subs_epi16(x[0], x[1]);
+  x1[1] = _mm256_adds_epi16(x[2], x[3]);
+  x1[6] = _mm256_subs_epi16(x[2], x[3]);
+  x1[2] = _mm256_adds_epi16(x[4], x[5]);
+  x1[5] = _mm256_subs_epi16(x[4], x[5]);
+  x1[3] = _mm256_adds_epi16(x[6], x[7]);
+  x1[4] = _mm256_subs_epi16(x[6], x[7]);
+
+  // stage 2
+  __m256i x2[8];
+  x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+  x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
+  x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+  x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
+  x2[2] = x1[4];
+  x2[3] = x1[7];
+  btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1,
+              &temp2, &temp3, &__rounding_256, &cos_bit);
+  x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);
+  x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
+
+  // stage 3
+  __m256i x3[8];
+  x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);
+  x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+  x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+  x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
+              _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);
+  x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);
+  x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
+  x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
+  x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
+  x3[6] = _mm256_subs_epi16(x2[3], x2[5]);
+
+  // stage 4
+  __m256i x4[8];
+  x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0);
+  x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);
+  btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0],
+              &output[8], &output[4], &output[12], &__rounding_256, &cos_bit);
+  x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
+  x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
+  x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);
+  x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);
+  in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);
+  in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);
+  btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
+              &temp3, &__rounding_256, &cos_bit);
+
+  x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);
+  x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
+
+  // stage 5
+  __m256i x5[4];
+  in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);
+  in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);
+  btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14],
+              &output[10], &output[6], &__rounding_256, &cos_bit);
+  x5[0] = _mm256_adds_epi16(x4[4], x4[6]);
+  x5[1] = _mm256_subs_epi16(x4[4], x4[6]);
+  x5[2] = _mm256_adds_epi16(x4[5], x4[7]);
+  x5[3] = _mm256_subs_epi16(x4[5], x4[7]);
+
+  // stage 6
+  in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20);
+  in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31);
+  btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15],
+              &output[9], &output[7], &__rounding_256, &cos_bit);
+  in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31);
+  in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20);
+  btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5],
+              &output[11], &output[13], &output[3], &__rounding_256, &cos_bit);
+}
+
+static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
+                                      int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i __zero = _mm256_setzero_si256();
+  const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
+  __m256i in0, in1;
+  __m128i temp0, temp1, temp2, temp3;
+
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+  __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+  __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+  __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+  __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+  __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+  __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+  __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+  __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+  __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+  __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+  __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+  __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+  __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+  __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+  __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+  __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+  __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+  __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+  __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+  __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+  __m256i cospi_arr[20];
+
+  cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+                                         cospi_p32_p32, 0x1);
+  cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+                                         cospi_p32_m32, 0x1);
+  cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+                                         cospi_p32_p32, 0x1);
+  cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+                                         cospi_p32_m32, 0x1);
+  cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+                                         cospi_m48_p16, 0x1);
+  cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+                                         cospi_p16_p48, 0x1);
+  cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+                                         cospi_m48_p16, 0x1);
+  cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+                                         cospi_p16_p48, 0x1);
+  cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+                                         cospi_p40_p24, 0x1);
+  cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
+                                         cospi_p24_m40, 0x1);
+  cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
+                                          cospi_m24_p40, 0x1);
+  cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+                                          cospi_p40_p24, 0x1);
+  cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
+                                          cospi_p10_p54, 0x1);
+  cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
+                                          cospi_p54_m10, 0x1);
+  cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
+                                          cospi_p26_p38, 0x1);
+  cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
+                                          cospi_p38_m26, 0x1);
+  cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
+                                          cospi_p42_p22, 0x1);
+  cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
+                                          cospi_p22_m42, 0x1);
+  cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
+                                          cospi_p58_p06, 0x1);
+  cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
+                                          cospi_p06_m58, 0x1);
+
+  __m256i x[8];
+  x[0] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);
+  x[1] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);
+  x[2] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);
+  x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
+                                 0x1);
+  x[4] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);
+  x[5] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);
+  x[6] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);
+  x[7] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);
+
+  // stage 1
+  __m256i x1[8];
+  x1[0] = x[0];
+  x1[1] = _mm256_subs_epi16(__zero, x[7]);
+  x1[2] = x[2];
+  x1[3] = _mm256_subs_epi16(__zero, x[5]);
+  x1[4] = _mm256_subs_epi16(__zero, x[4]);
+  x1[5] = x[3];
+  x1[6] = _mm256_subs_epi16(__zero, x[6]);
+  x1[7] = x[1];
+
+  // stage 2
+  __m256i x2[8];
+  x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);
+  x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
+  x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
+  x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
+  in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);
+  in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);
+  btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2,
+              &temp3, &__rounding_256, &cos_bit);
+  x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+  in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);
+  in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);
+  btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2,
+              &temp3, &__rounding_256, &cos_bit);
+  x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+  // stage 3
+  __m256i x3[8];
+  x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+  x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+  x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
+  x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
+  x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+  x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+  x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
+  x3[7] = _mm256_subs_epi16(x2[7], x2[6]);
+
+  // stage 4
+  __m256i x4[8];
+  x4[0] = x3[0];
+  x4[1] = x3[1];
+  x4[4] = x3[4];
+  x4[5] = x3[5];
+  in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);
+  in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);
+  btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
+              &temp3, &__rounding_256, &cos_bit);
+  x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+  in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
+  in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
+  btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2,
+              &temp3, &__rounding_256, &cos_bit);
+  x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+  // stage 5
+  __m256i x5[8];
+  x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
+  x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
+  x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
+  x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
+  x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
+  x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
+  x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
+  x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
+
+  // stage 6
+  __m256i x6[8];
+  x6[0] = x5[0];
+  x6[1] = x5[2];
+  x6[2] = x5[1];
+  x6[3] = x5[3];
+  in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);
+  in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);
+  btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2,
+              &temp3, &__rounding_256, &cos_bit);
+  x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+  in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
+  in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
+  btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1,
+              &temp2, &temp3, &__rounding_256, &cos_bit);
+  x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+  // stage 7
+  __m256i x7[8];
+  x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
+  x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
+  x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
+  x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
+  x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
+  x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
+  x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
+  x7[7] = _mm256_subs_epi16(x6[3], x6[7]);
+
+  // stage 8
+  in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);
+  in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);
+  btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15],
+              &output[0], &output[13], &output[2], &__rounding_256, &cos_bit);
+  in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
+  in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
+  btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11],
+              &output[4], &output[9], &output[6], &__rounding_256, &cos_bit);
+  in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
+  in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
+  btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7],
+              &output[8], &output[5], &output[10], &__rounding_256, &cos_bit);
+  in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
+  in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
+  btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3],
+              &output[12], &output[1], &output[14], &__rounding_256, &cos_bit);
+}
+
+static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
+                                          int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i one = _mm256_set1_epi16(1);
+  __m256i temp;
+  for (int i = 0; i < 16; i += 2) {
+    temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]),
+                                   input[i + 1], 0x1);
+    const __m256i a_lo = _mm256_unpacklo_epi16(temp, one);
+    const __m256i a_hi = _mm256_unpackhi_epi16(temp, one);
+    const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
+    const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
+    temp = _mm256_packs_epi32(b_lo, b_hi);
+    output[i] = _mm256_castsi256_si128(temp);
+    output[i + 1] = _mm256_extractf128_si256(temp, 0x1);
+  }
+}
+
+static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {
+  fdct8x8_new_avx2,       // DCT_DCT
+  fdct8x8_new_avx2,       // ADST_DCT
+  fadst8x8_new_avx2,      // DCT_ADST
+  fadst8x8_new_avx2,      // ADST_ADST
+  fdct8x8_new_avx2,       // FLIPADST_DCT
+  fadst8x8_new_avx2,      // DCT_FLIPADST
+  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
+  fadst8x8_new_avx2,      // ADST_FLIPADST
+  fadst8x8_new_avx2,      // FLIPADST_ADST
+  fidentity8x8_new_avx2,  // IDTX
+  fidentity8x8_new_avx2,  // V_DCT
+  fdct8x8_new_avx2,       // H_DCT
+  fidentity8x8_new_avx2,  // V_ADST
+  fadst8x8_new_avx2,      // H_ADST
+  fidentity8x8_new_avx2,  // V_FLIPADST
+  fadst8x8_new_avx2       // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_new_avx2,       // DCT_DCT
+  fadst8x16_new_avx2,      // ADST_DCT
+  fdct8x16_new_avx2,       // DCT_ADST
+  fadst8x16_new_avx2,      // ADST_ADST
+  fadst8x16_new_avx2,      // FLIPADST_DCT
+  fdct8x16_new_avx2,       // DCT_FLIPADST
+  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
+  fadst8x16_new_avx2,      // ADST_FLIPADST
+  fadst8x16_new_avx2,      // FLIPADST_ADST
+  fidentity8x16_new_avx2,  // IDTX
+  fdct8x16_new_avx2,       // V_DCT
+  fidentity8x16_new_avx2,  // H_DCT
+  fadst8x16_new_avx2,      // V_ADST
+  fidentity8x16_new_avx2,  // H_ADST
+  fadst8x16_new_avx2,      // V_FLIPADST
+  fidentity8x16_new_avx2   // H_FLIPADST
+};
+
+static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {
+  fdct8x8_new_avx2,       // DCT_DCT
+  fadst8x8_new_avx2,      // ADST_DCT
+  fdct8x8_new_avx2,       // DCT_ADST
+  fadst8x8_new_avx2,      // ADST_ADST
+  fadst8x8_new_avx2,      // FLIPADST_DCT
+  fdct8x8_new_avx2,       // DCT_FLIPADST
+  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
+  fadst8x8_new_avx2,      // ADST_FLIPADST
+  fadst8x8_new_avx2,      // FLIPADST_ADST
+  fidentity8x8_new_avx2,  // IDTX
+  fdct8x8_new_avx2,       // V_DCT
+  fidentity8x8_new_avx2,  // H_DCT
+  fadst8x8_new_avx2,      // V_ADST
+  fidentity8x8_new_avx2,  // H_ADST
+  fadst8x8_new_avx2,      // V_FLIPADST
+  fidentity8x8_new_avx2,  // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {
+  fdct8x16_new_avx2,       // DCT_DCT
+  fdct8x16_new_avx2,       // ADST_DCT
+  fadst8x16_new_avx2,      // DCT_ADST
+  fadst8x16_new_avx2,      // ADST_ADST
+  fdct8x16_new_avx2,       // FLIPADST_DCT
+  fadst8x16_new_avx2,      // DCT_FLIPADST
+  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
+  fadst8x16_new_avx2,      // ADST_FLIPADST
+  fadst8x16_new_avx2,      // FLIPADST_ADST
+  fidentity8x16_new_avx2,  // IDTX
+  fidentity8x16_new_avx2,  // V_DCT
+  fdct8x16_new_avx2,       // H_DCT
+  fidentity8x16_new_avx2,  // V_ADST
+  fadst8x16_new_avx2,      // H_ADST
+  fidentity8x16_new_avx2,  // V_FLIPADST
+  fadst8x16_new_avx2       // H_FLIPADST
+};
+
+static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[16], buf1[16];
+  __m256i buf2[8];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+  const int txw_idx = get_txw_idx(TX_8X16);
+  const int txh_idx = get_txh_idx(TX_8X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 16;
+  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  }
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_8x8(buf0, buf1);
+  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+  __m128i *bufl, *bufu;
+  if (lr_flip) {
+    bufl = buf0;
+    bufu = buf0 + 8;
+    flip_buf_sse2(buf1 + width * 0, bufl, width);
+    flip_buf_sse2(buf1 + width * 1, bufu, width);
+  } else {
+    bufl = buf1 + width * 0;
+    bufu = buf1 + width * 1;
+  }
+  pack_reg(bufl, bufu, buf2);
+  row_txfm(buf2, buf2, cos_bit_row);
+  round_shift_16bit_w16_avx2(buf2, width, shift[2]);
+  transpose_16bit_16x8_avx2(buf2, buf2);
+  store_rect_buffer_16bit_to_32bit_w8_avx2(buf2, output, width, 8);
+}
+
+static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[16], buf1[16];
+  __m256i buf2[8];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 8;
+  const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];
+  __m128i *buf;
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);
+    load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);
+  } else {
+    load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);
+    load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);
+  }
+  pack_reg(buf0, &buf0[8], buf2);
+  round_shift_16bit_w16_avx2(buf2, height, shift[0]);
+  col_txfm(buf2, buf2, cos_bit_col);
+  round_shift_16bit_w16_avx2(buf2, height, shift[1]);
+  transpose_16bit_16x8_avx2(buf2, buf2);
+  extract_reg(buf2, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+  transpose_16bit_8x8(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+  transpose_16bit_8x8(buf + 8, buf + 8);
+  store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
   av1_lowbd_fwd_txfm2d_4x4_sse2,   // 4x4 transform
   av1_lowbd_fwd_txfm2d_8x8_sse2,   // 8x8 transform
@@ -2005,8 +2787,8 @@ static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
   lowbd_fwd_txfm2d_64x64_avx2,     // 64x64 transform
   av1_lowbd_fwd_txfm2d_4x8_sse2,   // 4x8 transform
   av1_lowbd_fwd_txfm2d_8x4_sse2,   // 8x4 transform
-  av1_lowbd_fwd_txfm2d_8x16_sse2,  // 8x16 transform
-  av1_lowbd_fwd_txfm2d_16x8_sse2,  // 16x8 transform
+  lowbd_fwd_txfm2d_8x16_avx2,      // 8x16 transform
+  lowbd_fwd_txfm2d_16x8_avx2,      // 16x8 transform
   lowbd_fwd_txfm2d_16x32_avx2,     // 16x32 transform
   lowbd_fwd_txfm2d_32x16_avx2,     // 32x16 transform
   lowbd_fwd_txfm2d_32x64_avx2,     // 32x64 transform
diff --git a/libaom/av1/encoder/x86/corner_match_avx2.c b/libaom/av1/encoder/x86/corner_match_avx2.c
new file mode 100644
index 0000000..7a3b999
--- /dev/null
+++ b/libaom/av1/encoder/x86/corner_match_avx2.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <immintrin.h>
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/encoder/corner_match.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, byte_mask[16]) = {
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0
+};
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the
+correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
+double compute_cross_correlation_avx2(unsigned char *im1, int stride1, int x1,
+                                      int y1, unsigned char *im2, int stride2,
+                                      int x2, int y2) {
+  int i, stride1_i = 0, stride2_i = 0;
+  __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1;
+  const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
+  const __m256i zero = _mm256_setzero_si256();
+  __m128i v1, v2;
+
+  sum_vec = zero;
+  sumsq2_vec = zero;
+  cross_vec = zero;
+
+  im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+  im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+  for (i = 0; i < MATCH_SZ; ++i) {
+    v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[stride1_i]), mask);
+    v1_1 = _mm256_cvtepu8_epi16(v1);
+    v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[stride2_i]), mask);
+    v2_1 = _mm256_cvtepu8_epi16(v2);
+
+    v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1);
+    sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1));
+
+    sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero));
+    cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1));
+    stride1_i += stride1;
+    stride2_i += stride2;
+  }
+  __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8);
+  sum_vec = _mm256_add_epi32(sum_vec, sum_vec1);
+  int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec));
+  int sum2_acc = _mm256_extract_epi32(sum_vec, 4);
+
+  __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec);
+  __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec);
+  temp1 = _mm256_add_epi32(unp_low, unp_hig);
+
+  __m128i low_sumsq = _mm256_castsi256_si128(temp1);
+  low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1));
+  low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32));
+  int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq);
+  int cross_acc = _mm_extract_epi32(low_sumsq, 2);
+
+  int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc;
+  int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc;
+  return cov / sqrt((double)var2);
+}
diff --git a/libaom/av1/encoder/x86/encodetxb_avx2.c b/libaom/av1/encoder/x86/encodetxb_avx2.c
index 7642f57..2621301 100644
--- a/libaom/av1/encoder/x86/encodetxb_avx2.c
+++ b/libaom/av1/encoder/x86/encodetxb_avx2.c
@@ -26,14 +26,6 @@ void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
   const int stride = width + TX_PAD_HOR;
   const __m256i y_zeros = _mm256_setzero_si256();
 
-  const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
-  uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
-  uint8_t *pre_buf_end = pre_buf + pre_len;
-  do {
-    yy_storeu_256(pre_buf, y_zeros);
-    pre_buf += 32;
-  } while (pre_buf < pre_buf_end);
-
   const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
   uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
   uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
diff --git a/libaom/av1/encoder/x86/encodetxb_sse4.c b/libaom/av1/encoder/x86/encodetxb_sse4.c
index 5e0687c..34c9e4f 100644
--- a/libaom/av1/encoder/x86/encodetxb_sse4.c
+++ b/libaom/av1/encoder/x86/encodetxb_sse4.c
@@ -23,14 +23,6 @@ void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
   const int stride = width + TX_PAD_HOR;
   const __m128i zeros = _mm_setzero_si128();
 
-  const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
-  uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
-  uint8_t *pre_buf_end = pre_buf + pre_len;
-  do {
-    _mm_storeu_si128((__m128i *)(pre_buf), zeros);
-    pre_buf += 16;
-  } while (pre_buf < pre_buf_end);
-
   const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
   uint8_t *bottom_buf = levels + stride * height;
   uint8_t *bottom_buf_end = bottom_buf + bottom_len;
diff --git a/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
new file mode 100644
index 0000000..719734c
--- /dev/null
+++ b/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <stdio.h>
+#include "aom/aom_integer.h"
+#include "av1/common/common.h"
+
+int64_t av1_highbd_block_error_avx2(tran_low_t *coeff, tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  int i;
+  int64_t temp1[8];
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i += 16) {
+    __m256i mm256_coeff = _mm256_loadu_si256((__m256i *)(coeff + i));
+    __m256i mm256_coeff2 = _mm256_loadu_si256((__m256i *)(coeff + i + 8));
+    __m256i mm256_dqcoeff = _mm256_loadu_si256((__m256i *)(dqcoeff + i));
+    __m256i mm256_dqcoeff2 = _mm256_loadu_si256((__m256i *)(dqcoeff + i + 8));
+
+    __m256i diff1 = _mm256_sub_epi32(mm256_coeff, mm256_dqcoeff);
+    __m256i diff2 = _mm256_sub_epi32(mm256_coeff2, mm256_dqcoeff2);
+    __m256i diff1h = _mm256_srli_epi64(diff1, 32);
+    __m256i diff2h = _mm256_srli_epi64(diff2, 32);
+    __m256i res = _mm256_mul_epi32(diff1, diff1);
+    __m256i res1 = _mm256_mul_epi32(diff1h, diff1h);
+    __m256i res2 = _mm256_mul_epi32(diff2, diff2);
+    __m256i res3 = _mm256_mul_epi32(diff2h, diff2h);
+    __m256i res_diff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+                                        _mm256_add_epi64(res2, res3));
+    __m256i mm256_coeffh = _mm256_srli_epi64(mm256_coeff, 32);
+    __m256i mm256_coeffh2 = _mm256_srli_epi64(mm256_coeff2, 32);
+    res = _mm256_mul_epi32(mm256_coeff, mm256_coeff);
+    res1 = _mm256_mul_epi32(mm256_coeffh, mm256_coeffh);
+    res2 = _mm256_mul_epi32(mm256_coeff2, mm256_coeff2);
+    res3 = _mm256_mul_epi32(mm256_coeffh2, mm256_coeffh2);
+    __m256i res_sqcoeff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+                                           _mm256_add_epi64(res2, res3));
+    _mm256_storeu_si256((__m256i *)temp1, res_diff);
+    _mm256_storeu_si256((__m256i *)temp1 + 1, res_sqcoeff);
+
+    error += temp1[0] + temp1[1] + temp1[2] + temp1[3];
+    sqcoeff += temp1[4] + temp1[5] + temp1[6] + temp1[7];
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;
+  return error;
+}
diff --git a/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
new file mode 100644
index 0000000..24c513f
--- /dev/null
+++ b/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
@@ -0,0 +1,3170 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h> /*AVX2*/
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void av1_load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
+                                            int stride, int flipud, int fliplr,
+                                            int shift) {
+  __m128i out1[8];
+  if (!flipud) {
+    out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+  } else {
+    out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    out1[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  }
+  if (!fliplr) {
+    out[0] = _mm256_cvtepi16_epi32(out1[0]);
+    out[1] = _mm256_cvtepi16_epi32(out1[1]);
+    out[2] = _mm256_cvtepi16_epi32(out1[2]);
+    out[3] = _mm256_cvtepi16_epi32(out1[3]);
+    out[4] = _mm256_cvtepi16_epi32(out1[4]);
+    out[5] = _mm256_cvtepi16_epi32(out1[5]);
+    out[6] = _mm256_cvtepi16_epi32(out1[6]);
+    out[7] = _mm256_cvtepi16_epi32(out1[7]);
+
+  } else {
+    out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0]));
+    out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1]));
+    out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2]));
+    out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3]));
+    out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4]));
+    out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5]));
+    out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6]));
+    out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7]));
+  }
+  out[0] = _mm256_slli_epi32(out[0], shift);
+  out[1] = _mm256_slli_epi32(out[1], shift);
+  out[2] = _mm256_slli_epi32(out[2], shift);
+  out[3] = _mm256_slli_epi32(out[3], shift);
+  out[4] = _mm256_slli_epi32(out[4], shift);
+  out[5] = _mm256_slli_epi32(out[5], shift);
+  out[6] = _mm256_slli_epi32(out[6], shift);
+  out[7] = _mm256_slli_epi32(out[7], shift);
+}
+static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) {
+  const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+
+  in[0] = _mm256_add_epi32(in[0], rounding);
+  in[1] = _mm256_add_epi32(in[1], rounding);
+  in[2] = _mm256_add_epi32(in[2], rounding);
+  in[3] = _mm256_add_epi32(in[3], rounding);
+  in[4] = _mm256_add_epi32(in[4], rounding);
+  in[5] = _mm256_add_epi32(in[5], rounding);
+  in[6] = _mm256_add_epi32(in[6], rounding);
+  in[7] = _mm256_add_epi32(in[7], rounding);
+
+  in[0] = _mm256_srai_epi32(in[0], shift);
+  in[1] = _mm256_srai_epi32(in[1], shift);
+  in[2] = _mm256_srai_epi32(in[2], shift);
+  in[3] = _mm256_srai_epi32(in[3], shift);
+  in[4] = _mm256_srai_epi32(in[4], shift);
+  in[5] = _mm256_srai_epi32(in[5], shift);
+  in[6] = _mm256_srai_epi32(in[6], shift);
+  in[7] = _mm256_srai_epi32(in[7], shift);
+}
+static INLINE void av1_load_buffer_8x16_avx2(const int16_t *input, __m256i *out,
+                                             int stride, int flipud, int fliplr,
+                                             int shift) {
+  const int16_t *topL = input;
+  const int16_t *botL = input + 8 * stride;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    tmp = topL;
+    topL = botL;
+    botL = tmp;
+  }
+  av1_load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift);
+  av1_load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift);
+}
+static INLINE void av1_load_buffer_16xn_avx2(const int16_t *input, __m256i *out,
+                                             int stride, int height,
+                                             int outstride, int flipud,
+                                             int fliplr) {
+  __m256i out1[64];
+  if (!flipud) {
+    for (int i = 0; i < height; i++) {
+      out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride));
+    }
+  } else {
+    for (int i = 0; i < height; i++) {
+      out1[(height - 1) - i] =
+          _mm256_loadu_si256((const __m256i *)(input + i * stride));
+    }
+  }
+  if (!fliplr) {
+    for (int i = 0; i < height; i++) {
+      out[i * outstride] =
+          _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i]));
+      out[i * outstride + 1] =
+          _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1));
+    }
+  } else {
+    for (int i = 0; i < height; i++) {
+      out[i * outstride + 1] = _mm256_cvtepi16_epi32(
+          mm_reverse_epi16(_mm256_castsi256_si128(out1[i])));
+      out[i * outstride + 0] = _mm256_cvtepi16_epi32(
+          mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1)));
+    }
+  }
+}
+
+static void av1_fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out,
+                                            const int instride,
+                                            const int outstride) {
+  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m256i x0, x1;
+
+  u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]);
+  u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]);
+
+  u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]);
+  u3 = _mm256_unpackhi_epi32(in[2 * instride], in[3 * instride]);
+
+  u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]);
+  u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]);
+
+  u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]);
+  u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]);
+
+  x0 = _mm256_unpacklo_epi64(u0, u2);
+  x1 = _mm256_unpacklo_epi64(u4, u6);
+  out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+  x0 = _mm256_unpackhi_epi64(u0, u2);
+  x1 = _mm256_unpackhi_epi64(u4, u6);
+  out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+  x0 = _mm256_unpacklo_epi64(u1, u3);
+  x1 = _mm256_unpacklo_epi64(u5, u7);
+  out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+  x0 = _mm256_unpackhi_epi64(u1, u3);
+  x1 = _mm256_unpackhi_epi64(u5, u7);
+  out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
+static INLINE void av1_round_shift_32_8xn_avx2(__m256i *in, int size, int bit,
+                                               int stride) {
+  if (bit < 0) {
+    bit = -bit;
+    __m256i round = _mm256_set1_epi32(1 << (bit - 1));
+    for (int i = 0; i < size; ++i) {
+      in[stride * i] = _mm256_add_epi32(in[stride * i], round);
+      in[stride * i] = _mm256_srai_epi32(in[stride * i], bit);
+    }
+  } else if (bit > 0) {
+    for (int i = 0; i < size; ++i) {
+      in[stride * i] = _mm256_slli_epi32(in[stride * i], bit);
+    }
+  }
+}
+static INLINE void av1_store_buffer_avx2(const __m256i *const in, int32_t *out,
+                                         const int stride, const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    _mm256_store_si256((__m256i *)(out), in[i]);
+    out += stride;
+  }
+}
+static INLINE void av1_fwd_txfm_transpose_16x16_avx2(const __m256i *in,
+                                                     __m256i *out) {
+  av1_fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2);
+  av1_fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2);
+  av1_fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2);
+  av1_fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2);
+}
+
+static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0,
+                                        const __m256i *w1, const __m256i *n1,
+                                        const __m256i *rounding, int bit) {
+  __m256i x, y;
+
+  x = _mm256_mullo_epi32(*w0, *n0);
+  y = _mm256_mullo_epi32(*w1, *n1);
+  x = _mm256_add_epi32(x, y);
+  x = _mm256_add_epi32(x, *rounding);
+  x = _mm256_srai_epi32(x, bit);
+  return x;
+}
+#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \
+  do {                                                       \
+    const __m256i ww0 = _mm256_set1_epi32(w0);               \
+    const __m256i ww1 = _mm256_set1_epi32(w1);               \
+    const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0);     \
+    const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1);     \
+    out0 = _mm256_add_epi32(in0_w0, in1_w1);                 \
+    av1_round_shift_32_8xn_avx2(&out0, 1, -bit, 1);          \
+    const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1);     \
+    const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0);     \
+    out1 = _mm256_sub_epi32(in0_w1, in1_w0);                 \
+    av1_round_shift_32_8xn_avx2(&out1, 1, -bit, 1);          \
+  } while (0)
+
+#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+  do {                                                                \
+    const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0);              \
+    const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1);              \
+    out0 = _mm256_add_epi32(in0_w0, in1_w1);                          \
+    out0 = _mm256_add_epi32(out0, r);                                 \
+    out0 = _mm256_srai_epi32(out0, bit);                              \
+    const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1);              \
+    const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0);              \
+    out1 = _mm256_sub_epi32(in0_w1, in1_w0);                          \
+    out1 = _mm256_add_epi32(out1, r);                                 \
+    out1 = _mm256_srai_epi32(out1, bit);                              \
+  } while (0)
+
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out,
+                                  const int8_t cos_bit, int instride,
+                                  int outstride);
+static void av1_fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                           const int col_num, const int outstride) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  __m256i u[8], v[8];
+  for (int col = 0; col < col_num; ++col) {
+    u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]);
+    v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]);
+    u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]);
+    u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]);
+    u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]);
+    u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]);
+    u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]);
+    v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]);
+    v[0] = _mm256_add_epi32(u[0], u[3]);
+    v[3] = _mm256_sub_epi32(u[0], u[3]);
+    v[1] = _mm256_add_epi32(u[1], u[2]);
+    v[2] = _mm256_sub_epi32(u[1], u[2]);
+
+    v[5] = _mm256_mullo_epi32(u[5], cospim32);
+    v[6] = _mm256_mullo_epi32(u[6], cospi32);
+    v[5] = _mm256_add_epi32(v[5], v[6]);
+    v[5] = _mm256_add_epi32(v[5], rnding);
+    v[5] = _mm256_srai_epi32(v[5], bit);
+
+    u[0] = _mm256_mullo_epi32(u[5], cospi32);
+    v[6] = _mm256_mullo_epi32(u[6], cospim32);
+    v[6] = _mm256_sub_epi32(u[0], v[6]);
+    v[6] = _mm256_add_epi32(v[6], rnding);
+    v[6] = _mm256_srai_epi32(v[6], bit);
+
+    // stage 3
+    // type 0
+    v[0] = _mm256_mullo_epi32(v[0], cospi32);
+    v[1] = _mm256_mullo_epi32(v[1], cospi32);
+    u[0] = _mm256_add_epi32(v[0], v[1]);
+    u[0] = _mm256_add_epi32(u[0], rnding);
+    u[0] = _mm256_srai_epi32(u[0], bit);
+
+    u[1] = _mm256_sub_epi32(v[0], v[1]);
+    u[1] = _mm256_add_epi32(u[1], rnding);
+    u[1] = _mm256_srai_epi32(u[1], bit);
+
+    // type 1
+    v[0] = _mm256_mullo_epi32(v[2], cospi48);
+    v[1] = _mm256_mullo_epi32(v[3], cospi16);
+    u[2] = _mm256_add_epi32(v[0], v[1]);
+    u[2] = _mm256_add_epi32(u[2], rnding);
+    u[2] = _mm256_srai_epi32(u[2], bit);
+
+    v[0] = _mm256_mullo_epi32(v[2], cospi16);
+    v[1] = _mm256_mullo_epi32(v[3], cospi48);
+    u[3] = _mm256_sub_epi32(v[1], v[0]);
+    u[3] = _mm256_add_epi32(u[3], rnding);
+    u[3] = _mm256_srai_epi32(u[3], bit);
+
+    u[4] = _mm256_add_epi32(v[4], v[5]);
+    u[5] = _mm256_sub_epi32(v[4], v[5]);
+    u[6] = _mm256_sub_epi32(v[7], v[6]);
+    u[7] = _mm256_add_epi32(v[7], v[6]);
+
+    // stage 4
+    // stage 5
+    v[0] = _mm256_mullo_epi32(u[4], cospi56);
+    v[1] = _mm256_mullo_epi32(u[7], cospi8);
+    v[0] = _mm256_add_epi32(v[0], v[1]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[4]
+
+    v[0] = _mm256_mullo_epi32(u[4], cospi8);
+    v[1] = _mm256_mullo_epi32(u[7], cospi56);
+    v[0] = _mm256_sub_epi32(v[1], v[0]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[7]
+
+    v[0] = _mm256_mullo_epi32(u[5], cospi24);
+    v[1] = _mm256_mullo_epi32(u[6], cospi40);
+    v[0] = _mm256_add_epi32(v[0], v[1]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[5]
+
+    v[0] = _mm256_mullo_epi32(u[5], cospi40);
+    v[1] = _mm256_mullo_epi32(u[6], cospi24);
+    v[0] = _mm256_sub_epi32(v[1], v[0]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[6]
+
+    out[0 * outstride + col] = u[0];  // buf0[0]
+    out[4 * outstride + col] = u[1];  // buf0[1]
+    out[2 * outstride + col] = u[2];  // buf0[2]
+    out[6 * outstride + col] = u[3];  // buf0[3]
+  }
+}
+static void av1_fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                            const int col_num, const int outstirde) {
+  (void)col_num;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m256i x, y;
+  for (int col = 0; col < col_num; ++col) {
+    u0 = in[0 * col_num + col];
+    u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]);
+    u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]);
+    u3 = in[4 * col_num + col];
+    u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]);
+    u5 = in[6 * col_num + col];
+    u6 = in[2 * col_num + col];
+    u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]);
+
+    // stage 2
+    v0 = u0;
+    v1 = u1;
+
+    x = _mm256_mullo_epi32(u2, cospi32);
+    y = _mm256_mullo_epi32(u3, cospi32);
+    v2 = _mm256_add_epi32(x, y);
+    v2 = _mm256_add_epi32(v2, rnding);
+    v2 = _mm256_srai_epi32(v2, bit);
+
+    v3 = _mm256_sub_epi32(x, y);
+    v3 = _mm256_add_epi32(v3, rnding);
+    v3 = _mm256_srai_epi32(v3, bit);
+
+    v4 = u4;
+    v5 = u5;
+
+    x = _mm256_mullo_epi32(u6, cospi32);
+    y = _mm256_mullo_epi32(u7, cospi32);
+    v6 = _mm256_add_epi32(x, y);
+    v6 = _mm256_add_epi32(v6, rnding);
+    v6 = _mm256_srai_epi32(v6, bit);
+
+    v7 = _mm256_sub_epi32(x, y);
+    v7 = _mm256_add_epi32(v7, rnding);
+    v7 = _mm256_srai_epi32(v7, bit);
+
+    // stage 3
+    u0 = _mm256_add_epi32(v0, v2);
+    u1 = _mm256_add_epi32(v1, v3);
+    u2 = _mm256_sub_epi32(v0, v2);
+    u3 = _mm256_sub_epi32(v1, v3);
+    u4 = _mm256_add_epi32(v4, v6);
+    u5 = _mm256_add_epi32(v5, v7);
+    u6 = _mm256_sub_epi32(v4, v6);
+    u7 = _mm256_sub_epi32(v5, v7);
+
+    // stage 4
+    v0 = u0;
+    v1 = u1;
+    v2 = u2;
+    v3 = u3;
+
+    x = _mm256_mullo_epi32(u4, cospi16);
+    y = _mm256_mullo_epi32(u5, cospi48);
+    v4 = _mm256_add_epi32(x, y);
+    v4 = _mm256_add_epi32(v4, rnding);
+    v4 = _mm256_srai_epi32(v4, bit);
+
+    x = _mm256_mullo_epi32(u4, cospi48);
+    y = _mm256_mullo_epi32(u5, cospim16);
+    v5 = _mm256_add_epi32(x, y);
+    v5 = _mm256_add_epi32(v5, rnding);
+    v5 = _mm256_srai_epi32(v5, bit);
+
+    x = _mm256_mullo_epi32(u6, cospim48);
+    y = _mm256_mullo_epi32(u7, cospi16);
+    v6 = _mm256_add_epi32(x, y);
+    v6 = _mm256_add_epi32(v6, rnding);
+    v6 = _mm256_srai_epi32(v6, bit);
+
+    x = _mm256_mullo_epi32(u6, cospi16);
+    y = _mm256_mullo_epi32(u7, cospi48);
+    v7 = _mm256_add_epi32(x, y);
+    v7 = _mm256_add_epi32(v7, rnding);
+    v7 = _mm256_srai_epi32(v7, bit);
+
+    // stage 5
+    u0 = _mm256_add_epi32(v0, v4);
+    u1 = _mm256_add_epi32(v1, v5);
+    u2 = _mm256_add_epi32(v2, v6);
+    u3 = _mm256_add_epi32(v3, v7);
+    u4 = _mm256_sub_epi32(v0, v4);
+    u5 = _mm256_sub_epi32(v1, v5);
+    u6 = _mm256_sub_epi32(v2, v6);
+    u7 = _mm256_sub_epi32(v3, v7);
+
+    // stage 6
+    x = _mm256_mullo_epi32(u0, cospi4);
+    y = _mm256_mullo_epi32(u1, cospi60);
+    v0 = _mm256_add_epi32(x, y);
+    v0 = _mm256_add_epi32(v0, rnding);
+    v0 = _mm256_srai_epi32(v0, bit);
+
+    x = _mm256_mullo_epi32(u0, cospi60);
+    y = _mm256_mullo_epi32(u1, cospim4);
+    v1 = _mm256_add_epi32(x, y);
+    v1 = _mm256_add_epi32(v1, rnding);
+    v1 = _mm256_srai_epi32(v1, bit);
+
+    x = _mm256_mullo_epi32(u2, cospi20);
+    y = _mm256_mullo_epi32(u3, cospi44);
+    v2 = _mm256_add_epi32(x, y);
+    v2 = _mm256_add_epi32(v2, rnding);
+    v2 = _mm256_srai_epi32(v2, bit);
+
+    x = _mm256_mullo_epi32(u2, cospi44);
+    y = _mm256_mullo_epi32(u3, cospim20);
+    v3 = _mm256_add_epi32(x, y);
+    v3 = _mm256_add_epi32(v3, rnding);
+    v3 = _mm256_srai_epi32(v3, bit);
+
+    x = _mm256_mullo_epi32(u4, cospi36);
+    y = _mm256_mullo_epi32(u5, cospi28);
+    v4 = _mm256_add_epi32(x, y);
+    v4 = _mm256_add_epi32(v4, rnding);
+    v4 = _mm256_srai_epi32(v4, bit);
+
+    x = _mm256_mullo_epi32(u4, cospi28);
+    y = _mm256_mullo_epi32(u5, cospim36);
+    v5 = _mm256_add_epi32(x, y);
+    v5 = _mm256_add_epi32(v5, rnding);
+    v5 = _mm256_srai_epi32(v5, bit);
+
+    x = _mm256_mullo_epi32(u6, cospi52);
+    y = _mm256_mullo_epi32(u7, cospi12);
+    v6 = _mm256_add_epi32(x, y);
+    v6 = _mm256_add_epi32(v6, rnding);
+    v6 = _mm256_srai_epi32(v6, bit);
+
+    x = _mm256_mullo_epi32(u6, cospi12);
+    y = _mm256_mullo_epi32(u7, cospim52);
+    v7 = _mm256_add_epi32(x, y);
+    v7 = _mm256_add_epi32(v7, rnding);
+    v7 = _mm256_srai_epi32(v7, bit);
+
+    // stage 7
+    out[0 * outstirde + col] = v1;
+    out[1 * outstirde + col] = v6;
+    out[2 * outstirde + col] = v3;
+    out[3 * outstirde + col] = v4;
+    out[4 * outstirde + col] = v5;
+    out[5 * outstirde + col] = v2;
+    out[6 * outstirde + col] = v7;
+    out[7 * outstirde + col] = v0;
+  }
+}
+static void av1_idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                           int col_num, int outstride) {
+  (void)bit;
+  (void)outstride;
+  int num_iters = 8 * col_num;
+  for (int i = 0; i < num_iters; i += 8) {
+    out[i] = _mm256_add_epi32(in[i], in[i]);
+    out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]);
+    out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]);
+    out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]);
+    out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]);
+    out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]);
+    out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]);
+    out[i + 7] = _mm256_add_epi32(in[i + 7], in[i + 7]);
+  }
+}
+void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  __m256i in[8], out[8];
+  const TX_SIZE tx_size = TX_8X8;
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int width = tx_size_wide[tx_size];
+  const int width_div8 = (width >> 3);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case ADST_DCT:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case DCT_ADST:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case ADST_ADST:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case FLIPADST_DCT:
+      av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case DCT_FLIPADST:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+      av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case FLIPADST_FLIPADST:
+      av1_load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case ADST_FLIPADST:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case FLIPADST_ADST:
+      av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case IDTX:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case V_DCT:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case H_DCT:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case V_ADST:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case H_ADST:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case V_FLIPADST:
+      av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case H_FLIPADST:
+      av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+      av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                     width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    default: assert(0);
+  }
+  (void)bd;
+}
+
+static void av1_fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                            const int col_num, const int outstride) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  __m256i u[16], v[16], x;
+  int col;
+
+  // Calculate the column 0, 1, 2, 3
+  for (col = 0; col < col_num; ++col) {
+    // stage 0
+    // stage 1
+    u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+    u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+    u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+    u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+    u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+    u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+    u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+    u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+    u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+    u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+    u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+    u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+    u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+    u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+    u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+    u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+    // stage 2
+    v[0] = _mm256_add_epi32(u[0], u[7]);
+    v[7] = _mm256_sub_epi32(u[0], u[7]);
+    v[1] = _mm256_add_epi32(u[1], u[6]);
+    v[6] = _mm256_sub_epi32(u[1], u[6]);
+    v[2] = _mm256_add_epi32(u[2], u[5]);
+    v[5] = _mm256_sub_epi32(u[2], u[5]);
+    v[3] = _mm256_add_epi32(u[3], u[4]);
+    v[4] = _mm256_sub_epi32(u[3], u[4]);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    v[10] = _mm256_mullo_epi32(u[10], cospim32);
+    x = _mm256_mullo_epi32(u[13], cospi32);
+    v[10] = _mm256_add_epi32(v[10], x);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[13] = _mm256_mullo_epi32(u[10], cospi32);
+    x = _mm256_mullo_epi32(u[13], cospim32);
+    v[13] = _mm256_sub_epi32(v[13], x);
+    v[13] = _mm256_add_epi32(v[13], rnding);
+    v[13] = _mm256_srai_epi32(v[13], bit);
+
+    v[11] = _mm256_mullo_epi32(u[11], cospim32);
+    x = _mm256_mullo_epi32(u[12], cospi32);
+    v[11] = _mm256_add_epi32(v[11], x);
+    v[11] = _mm256_add_epi32(v[11], rnding);
+    v[11] = _mm256_srai_epi32(v[11], bit);
+
+    v[12] = _mm256_mullo_epi32(u[11], cospi32);
+    x = _mm256_mullo_epi32(u[12], cospim32);
+    v[12] = _mm256_sub_epi32(v[12], x);
+    v[12] = _mm256_add_epi32(v[12], rnding);
+    v[12] = _mm256_srai_epi32(v[12], bit);
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 3
+    u[0] = _mm256_add_epi32(v[0], v[3]);
+    u[3] = _mm256_sub_epi32(v[0], v[3]);
+    u[1] = _mm256_add_epi32(v[1], v[2]);
+    u[2] = _mm256_sub_epi32(v[1], v[2]);
+    u[4] = v[4];
+
+    u[5] = _mm256_mullo_epi32(v[5], cospim32);
+    x = _mm256_mullo_epi32(v[6], cospi32);
+    u[5] = _mm256_add_epi32(u[5], x);
+    u[5] = _mm256_add_epi32(u[5], rnding);
+    u[5] = _mm256_srai_epi32(u[5], bit);
+
+    u[6] = _mm256_mullo_epi32(v[5], cospi32);
+    x = _mm256_mullo_epi32(v[6], cospim32);
+    u[6] = _mm256_sub_epi32(u[6], x);
+    u[6] = _mm256_add_epi32(u[6], rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    u[7] = v[7];
+    u[8] = _mm256_add_epi32(v[8], v[11]);
+    u[11] = _mm256_sub_epi32(v[8], v[11]);
+    u[9] = _mm256_add_epi32(v[9], v[10]);
+    u[10] = _mm256_sub_epi32(v[9], v[10]);
+    u[12] = _mm256_sub_epi32(v[15], v[12]);
+    u[15] = _mm256_add_epi32(v[15], v[12]);
+    u[13] = _mm256_sub_epi32(v[14], v[13]);
+    u[14] = _mm256_add_epi32(v[14], v[13]);
+
+    // stage 4
+    u[0] = _mm256_mullo_epi32(u[0], cospi32);
+    u[1] = _mm256_mullo_epi32(u[1], cospi32);
+    v[0] = _mm256_add_epi32(u[0], u[1]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    v[0] = _mm256_srai_epi32(v[0], bit);
+
+    v[1] = _mm256_sub_epi32(u[0], u[1]);
+    v[1] = _mm256_add_epi32(v[1], rnding);
+    v[1] = _mm256_srai_epi32(v[1], bit);
+
+    v[2] = _mm256_mullo_epi32(u[2], cospi48);
+    x = _mm256_mullo_epi32(u[3], cospi16);
+    v[2] = _mm256_add_epi32(v[2], x);
+    v[2] = _mm256_add_epi32(v[2], rnding);
+    v[2] = _mm256_srai_epi32(v[2], bit);
+
+    v[3] = _mm256_mullo_epi32(u[2], cospi16);
+    x = _mm256_mullo_epi32(u[3], cospi48);
+    v[3] = _mm256_sub_epi32(x, v[3]);
+    v[3] = _mm256_add_epi32(v[3], rnding);
+    v[3] = _mm256_srai_epi32(v[3], bit);
+
+    v[4] = _mm256_add_epi32(u[4], u[5]);
+    v[5] = _mm256_sub_epi32(u[4], u[5]);
+    v[6] = _mm256_sub_epi32(u[7], u[6]);
+    v[7] = _mm256_add_epi32(u[7], u[6]);
+    v[8] = u[8];
+
+    v[9] = _mm256_mullo_epi32(u[9], cospim16);
+    x = _mm256_mullo_epi32(u[14], cospi48);
+    v[9] = _mm256_add_epi32(v[9], x);
+    v[9] = _mm256_add_epi32(v[9], rnding);
+    v[9] = _mm256_srai_epi32(v[9], bit);
+
+    v[14] = _mm256_mullo_epi32(u[9], cospi48);
+    x = _mm256_mullo_epi32(u[14], cospim16);
+    v[14] = _mm256_sub_epi32(v[14], x);
+    v[14] = _mm256_add_epi32(v[14], rnding);
+    v[14] = _mm256_srai_epi32(v[14], bit);
+
+    v[10] = _mm256_mullo_epi32(u[10], cospim48);
+    x = _mm256_mullo_epi32(u[13], cospim16);
+    v[10] = _mm256_add_epi32(v[10], x);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[13] = _mm256_mullo_epi32(u[10], cospim16);
+    x = _mm256_mullo_epi32(u[13], cospim48);
+    v[13] = _mm256_sub_epi32(v[13], x);
+    v[13] = _mm256_add_epi32(v[13], rnding);
+    v[13] = _mm256_srai_epi32(v[13], bit);
+
+    v[11] = u[11];
+    v[12] = u[12];
+    v[15] = u[15];
+
+    // stage 5
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+
+    u[4] = _mm256_mullo_epi32(v[4], cospi56);
+    x = _mm256_mullo_epi32(v[7], cospi8);
+    u[4] = _mm256_add_epi32(u[4], x);
+    u[4] = _mm256_add_epi32(u[4], rnding);
+    u[4] = _mm256_srai_epi32(u[4], bit);
+
+    u[7] = _mm256_mullo_epi32(v[4], cospi8);
+    x = _mm256_mullo_epi32(v[7], cospi56);
+    u[7] = _mm256_sub_epi32(x, u[7]);
+    u[7] = _mm256_add_epi32(u[7], rnding);
+    u[7] = _mm256_srai_epi32(u[7], bit);
+
+    u[5] = _mm256_mullo_epi32(v[5], cospi24);
+    x = _mm256_mullo_epi32(v[6], cospi40);
+    u[5] = _mm256_add_epi32(u[5], x);
+    u[5] = _mm256_add_epi32(u[5], rnding);
+    u[5] = _mm256_srai_epi32(u[5], bit);
+
+    u[6] = _mm256_mullo_epi32(v[5], cospi40);
+    x = _mm256_mullo_epi32(v[6], cospi24);
+    u[6] = _mm256_sub_epi32(x, u[6]);
+    u[6] = _mm256_add_epi32(u[6], rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    u[8] = _mm256_add_epi32(v[8], v[9]);
+    u[9] = _mm256_sub_epi32(v[8], v[9]);
+    u[10] = _mm256_sub_epi32(v[11], v[10]);
+    u[11] = _mm256_add_epi32(v[11], v[10]);
+    u[12] = _mm256_add_epi32(v[12], v[13]);
+    u[13] = _mm256_sub_epi32(v[12], v[13]);
+    u[14] = _mm256_sub_epi32(v[15], v[14]);
+    u[15] = _mm256_add_epi32(v[15], v[14]);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = _mm256_mullo_epi32(u[8], cospi60);
+    x = _mm256_mullo_epi32(u[15], cospi4);
+    v[8] = _mm256_add_epi32(v[8], x);
+    v[8] = _mm256_add_epi32(v[8], rnding);
+    v[8] = _mm256_srai_epi32(v[8], bit);
+
+    v[15] = _mm256_mullo_epi32(u[8], cospi4);
+    x = _mm256_mullo_epi32(u[15], cospi60);
+    v[15] = _mm256_sub_epi32(x, v[15]);
+    v[15] = _mm256_add_epi32(v[15], rnding);
+    v[15] = _mm256_srai_epi32(v[15], bit);
+
+    v[9] = _mm256_mullo_epi32(u[9], cospi28);
+    x = _mm256_mullo_epi32(u[14], cospi36);
+    v[9] = _mm256_add_epi32(v[9], x);
+    v[9] = _mm256_add_epi32(v[9], rnding);
+    v[9] = _mm256_srai_epi32(v[9], bit);
+
+    v[14] = _mm256_mullo_epi32(u[9], cospi36);
+    x = _mm256_mullo_epi32(u[14], cospi28);
+    v[14] = _mm256_sub_epi32(x, v[14]);
+    v[14] = _mm256_add_epi32(v[14], rnding);
+    v[14] = _mm256_srai_epi32(v[14], bit);
+
+    v[10] = _mm256_mullo_epi32(u[10], cospi44);
+    x = _mm256_mullo_epi32(u[13], cospi20);
+    v[10] = _mm256_add_epi32(v[10], x);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[13] = _mm256_mullo_epi32(u[10], cospi20);
+    x = _mm256_mullo_epi32(u[13], cospi44);
+    v[13] = _mm256_sub_epi32(x, v[13]);
+    v[13] = _mm256_add_epi32(v[13], rnding);
+    v[13] = _mm256_srai_epi32(v[13], bit);
+
+    v[11] = _mm256_mullo_epi32(u[11], cospi12);
+    x = _mm256_mullo_epi32(u[12], cospi52);
+    v[11] = _mm256_add_epi32(v[11], x);
+    v[11] = _mm256_add_epi32(v[11], rnding);
+    v[11] = _mm256_srai_epi32(v[11], bit);
+
+    v[12] = _mm256_mullo_epi32(u[11], cospi52);
+    x = _mm256_mullo_epi32(u[12], cospi12);
+    v[12] = _mm256_sub_epi32(x, v[12]);
+    v[12] = _mm256_add_epi32(v[12], rnding);
+    v[12] = _mm256_srai_epi32(v[12], bit);
+
+    out[0 * outstride + col] = v[0];
+    out[1 * outstride + col] = v[8];
+    out[2 * outstride + col] = v[4];
+    out[3 * outstride + col] = v[12];
+    out[4 * outstride + col] = v[2];
+    out[5 * outstride + col] = v[10];
+    out[6 * outstride + col] = v[6];
+    out[7 * outstride + col] = v[14];
+    out[8 * outstride + col] = v[1];
+    out[9 * outstride + col] = v[9];
+    out[10 * outstride + col] = v[5];
+    out[11 * outstride + col] = v[13];
+    out[12 * outstride + col] = v[3];
+    out[13 * outstride + col] = v[11];
+    out[14 * outstride + col] = v[7];
+    out[15 * outstride + col] = v[15];
+  }
+}
+static void av1_fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                             const int num_cols, const int outstride) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+  const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
+  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+  const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
+  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+  const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
+  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+  const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
+  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const __m256i zero = _mm256_setzero_si256();
+
+  __m256i u[16], v[16], x, y;
+  int col;
+
+  for (col = 0; col < num_cols; ++col) {
+    // stage 0
+    // stage 1
+    u[0] = in[0 * num_cols + col];
+    u[1] = _mm256_sub_epi32(zero, in[15 * num_cols + col]);
+    u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]);
+    u[3] = in[8 * num_cols + col];
+    u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]);
+    u[5] = in[12 * num_cols + col];
+    u[6] = in[4 * num_cols + col];
+    u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]);
+    u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]);
+    u[9] = in[14 * num_cols + col];
+    u[10] = in[6 * num_cols + col];
+    u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]);
+    u[12] = in[2 * num_cols + col];
+    u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]);
+    u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]);
+    u[15] = in[10 * num_cols + col];
+
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
+
+    x = _mm256_mullo_epi32(u[2], cospi32);
+    y = _mm256_mullo_epi32(u[3], cospi32);
+    v[2] = _mm256_add_epi32(x, y);
+    v[2] = _mm256_add_epi32(v[2], rnding);
+    v[2] = _mm256_srai_epi32(v[2], bit);
+
+    v[3] = _mm256_sub_epi32(x, y);
+    v[3] = _mm256_add_epi32(v[3], rnding);
+    v[3] = _mm256_srai_epi32(v[3], bit);
+
+    v[4] = u[4];
+    v[5] = u[5];
+
+    x = _mm256_mullo_epi32(u[6], cospi32);
+    y = _mm256_mullo_epi32(u[7], cospi32);
+    v[6] = _mm256_add_epi32(x, y);
+    v[6] = _mm256_add_epi32(v[6], rnding);
+    v[6] = _mm256_srai_epi32(v[6], bit);
+
+    v[7] = _mm256_sub_epi32(x, y);
+    v[7] = _mm256_add_epi32(v[7], rnding);
+    v[7] = _mm256_srai_epi32(v[7], bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = _mm256_mullo_epi32(u[10], cospi32);
+    y = _mm256_mullo_epi32(u[11], cospi32);
+    v[10] = _mm256_add_epi32(x, y);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[11] = _mm256_sub_epi32(x, y);
+    v[11] = _mm256_add_epi32(v[11], rnding);
+    v[11] = _mm256_srai_epi32(v[11], bit);
+
+    v[12] = u[12];
+    v[13] = u[13];
+
+    x = _mm256_mullo_epi32(u[14], cospi32);
+    y = _mm256_mullo_epi32(u[15], cospi32);
+    v[14] = _mm256_add_epi32(x, y);
+    v[14] = _mm256_add_epi32(v[14], rnding);
+    v[14] = _mm256_srai_epi32(v[14], bit);
+
+    v[15] = _mm256_sub_epi32(x, y);
+    v[15] = _mm256_add_epi32(v[15], rnding);
+    v[15] = _mm256_srai_epi32(v[15], bit);
+
+    // stage 3
+    u[0] = _mm256_add_epi32(v[0], v[2]);
+    u[1] = _mm256_add_epi32(v[1], v[3]);
+    u[2] = _mm256_sub_epi32(v[0], v[2]);
+    u[3] = _mm256_sub_epi32(v[1], v[3]);
+    u[4] = _mm256_add_epi32(v[4], v[6]);
+    u[5] = _mm256_add_epi32(v[5], v[7]);
+    u[6] = _mm256_sub_epi32(v[4], v[6]);
+    u[7] = _mm256_sub_epi32(v[5], v[7]);
+    u[8] = _mm256_add_epi32(v[8], v[10]);
+    u[9] = _mm256_add_epi32(v[9], v[11]);
+    u[10] = _mm256_sub_epi32(v[8], v[10]);
+    u[11] = _mm256_sub_epi32(v[9], v[11]);
+    u[12] = _mm256_add_epi32(v[12], v[14]);
+    u[13] = _mm256_add_epi32(v[13], v[15]);
+    u[14] = _mm256_sub_epi32(v[12], v[14]);
+    u[15] = _mm256_sub_epi32(v[13], v[15]);
+
+    // stage 4
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+    v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+    v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+    v[7] = av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+    v[8] = u[8];
+    v[9] = u[9];
+    v[10] = u[10];
+    v[11] = u[11];
+    v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+    v[13] =
+        av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+    v[14] =
+        av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+    v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+
+    // stage 5
+    u[0] = _mm256_add_epi32(v[0], v[4]);
+    u[1] = _mm256_add_epi32(v[1], v[5]);
+    u[2] = _mm256_add_epi32(v[2], v[6]);
+    u[3] = _mm256_add_epi32(v[3], v[7]);
+    u[4] = _mm256_sub_epi32(v[0], v[4]);
+    u[5] = _mm256_sub_epi32(v[1], v[5]);
+    u[6] = _mm256_sub_epi32(v[2], v[6]);
+    u[7] = _mm256_sub_epi32(v[3], v[7]);
+    u[8] = _mm256_add_epi32(v[8], v[12]);
+    u[9] = _mm256_add_epi32(v[9], v[13]);
+    u[10] = _mm256_add_epi32(v[10], v[14]);
+    u[11] = _mm256_add_epi32(v[11], v[15]);
+    u[12] = _mm256_sub_epi32(v[8], v[12]);
+    u[13] = _mm256_sub_epi32(v[9], v[13]);
+    u[14] = _mm256_sub_epi32(v[10], v[14]);
+    u[15] = _mm256_sub_epi32(v[11], v[15]);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+    v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+    v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+    v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+    v[11] =
+        av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+    v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+    v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+    v[14] =
+        av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+    v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+    // stage 7
+    u[0] = _mm256_add_epi32(v[0], v[8]);
+    u[1] = _mm256_add_epi32(v[1], v[9]);
+    u[2] = _mm256_add_epi32(v[2], v[10]);
+    u[3] = _mm256_add_epi32(v[3], v[11]);
+    u[4] = _mm256_add_epi32(v[4], v[12]);
+    u[5] = _mm256_add_epi32(v[5], v[13]);
+    u[6] = _mm256_add_epi32(v[6], v[14]);
+    u[7] = _mm256_add_epi32(v[7], v[15]);
+    u[8] = _mm256_sub_epi32(v[0], v[8]);
+    u[9] = _mm256_sub_epi32(v[1], v[9]);
+    u[10] = _mm256_sub_epi32(v[2], v[10]);
+    u[11] = _mm256_sub_epi32(v[3], v[11]);
+    u[12] = _mm256_sub_epi32(v[4], v[12]);
+    u[13] = _mm256_sub_epi32(v[5], v[13]);
+    u[14] = _mm256_sub_epi32(v[6], v[14]);
+    u[15] = _mm256_sub_epi32(v[7], v[15]);
+
+    // stage 8
+    v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+    v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+    v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+    v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+    v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+    v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+    v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+    v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+    v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+    v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+    v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+    v[11] =
+        av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+    v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+    v[13] =
+        av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+    v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+    v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+
+    // stage 9
+    out[0 * outstride + col] = v[1];
+    out[1 * outstride + col] = v[14];
+    out[2 * outstride + col] = v[3];
+    out[3 * outstride + col] = v[12];
+    out[4 * outstride + col] = v[5];
+    out[5 * outstride + col] = v[10];
+    out[6 * outstride + col] = v[7];
+    out[7 * outstride + col] = v[8];
+    out[8 * outstride + col] = v[9];
+    out[9 * outstride + col] = v[6];
+    out[10 * outstride + col] = v[11];
+    out[11 * outstride + col] = v[4];
+    out[12 * outstride + col] = v[13];
+    out[13 * outstride + col] = v[2];
+    out[14 * outstride + col] = v[15];
+    out[15 * outstride + col] = v[0];
+  }
+}
+static void av1_idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                            int col_num, const int outstride) {
+  (void)bit;
+  (void)outstride;
+  __m256i fact = _mm256_set1_epi32(2 * NewSqrt2);
+  __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
+  __m256i a_low;
+
+  int num_iters = 16 * col_num;
+  for (int i = 0; i < num_iters; i++) {
+    a_low = _mm256_mullo_epi32(in[i], fact);
+    a_low = _mm256_add_epi32(a_low, offset);
+    out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits);
+  }
+}
+static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = {
+  av1_fdct16_avx2,   // DCT_DCT
+  av1_fadst16_avx2,  // ADST_DCT
+  av1_fdct16_avx2,   // DCT_ADST
+  av1_fadst16_avx2,  // ADST_ADST
+  av1_fadst16_avx2,  // FLIPADST_DCT
+  av1_fdct16_avx2,   // DCT_FLIPADST
+  av1_fadst16_avx2,  // FLIPADST_FLIPADST
+  av1_fadst16_avx2,  // ADST_FLIPADST
+  av1_fadst16_avx2,  // FLIPADST_ADST
+  av1_idtx16_avx2,   // IDTX
+  av1_fdct16_avx2,   // V_DCT
+  av1_idtx16_avx2,   // H_DCT
+  av1_fadst16_avx2,  // V_ADST
+  av1_idtx16_avx2,   // H_ADST
+  av1_fadst16_avx2,  // V_FLIPADST
+  av1_idtx16_avx2    // H_FLIPADST
+};
+static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = {
+  av1_fdct8_avx2,   // DCT_DCT
+  av1_fdct8_avx2,   // ADST_DCT
+  av1_fadst8_avx2,  // DCT_ADST
+  av1_fadst8_avx2,  // ADST_ADST
+  av1_fdct8_avx2,   // FLIPADST_DCT
+  av1_fadst8_avx2,  // DCT_FLIPADST
+  av1_fadst8_avx2,  // FLIPADST_FLIPADST
+  av1_fadst8_avx2,  // ADST_FLIPADST
+  av1_fadst8_avx2,  // FLIPADST_ADST
+  av1_idtx8_avx2,   // IDTX
+  av1_idtx8_avx2,   // V_DCT
+  av1_fdct8_avx2,   // H_DCT
+  av1_idtx8_avx2,   // V_ADST
+  av1_fadst8_avx2,  // H_ADST
+  av1_idtx8_avx2,   // V_FLIPADST
+  av1_fadst8_avx2   // H_FLIPADST
+};
+void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  __m256i in[16], out[16];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+  const int txw_idx = get_txw_idx(TX_8X16);
+  const int txh_idx = get_txh_idx(TX_8X16);
+  const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_highbd_txfm8x8_arr[tx_type];
+  const int8_t bit = fwd_cos_bit_col[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  av1_load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]);
+  col_txfm(in, out, bit, 1, 1);
+  col_txfm_8x8_rounding(out, -shift[1]);
+  col_txfm_8x8_rounding(&out[8], -shift[1]);
+  av1_fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
+  av1_fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
+  row_txfm(in, out, bit, 2, 2);
+  av1_fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
+  av1_fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
+  av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
+  av1_store_buffer_avx2(in, coeff, 8, 16);
+  (void)bd;
+}
+static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = {
+  av1_fdct8_avx2,   // DCT_DCT
+  av1_fadst8_avx2,  // ADST_DCT
+  av1_fdct8_avx2,   // DCT_ADST
+  av1_fadst8_avx2,  // ADST_ADST
+  av1_fadst8_avx2,  // FLIPADST_DCT
+  av1_fdct8_avx2,   // DCT_FLIPADST
+  av1_fadst8_avx2,  // FLIPADST_FLIPADST
+  av1_fadst8_avx2,  // ADST_FLIPADST
+  av1_fadst8_avx2,  // FLIPADST_ADST
+  av1_idtx8_avx2,   // IDTX
+  av1_fdct8_avx2,   // V_DCT
+  av1_idtx8_avx2,   // H_DCT
+  av1_fadst8_avx2,  // V_ADST
+  av1_idtx8_avx2,   // H_ADST
+  av1_fadst8_avx2,  // V_FLIPADST
+  av1_idtx8_avx2    // H_FLIPADST
+};
+static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = {
+  av1_fdct16_avx2,   // DCT_DCT
+  av1_fdct16_avx2,   // ADST_DCT
+  av1_fadst16_avx2,  // DCT_ADST
+  av1_fadst16_avx2,  // ADST_ADST
+  av1_fdct16_avx2,   // FLIPADST_DCT
+  av1_fadst16_avx2,  // DCT_FLIPADST
+  av1_fadst16_avx2,  // FLIPADST_FLIPADST
+  av1_fadst16_avx2,  // ADST_FLIPADST
+  av1_fadst16_avx2,  // FLIPADST_ADST
+  av1_idtx16_avx2,   // IDTX
+  av1_idtx16_avx2,   // V_DCT
+  av1_fdct16_avx2,   // H_DCT
+  av1_idtx16_avx2,   // V_ADST
+  av1_fadst16_avx2,  // H_ADST
+  av1_idtx16_avx2,   // V_FLIPADST
+  av1_fadst16_avx2   // H_FLIPADST
+};
+void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  __m256i in[16], out[16];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+  const int8_t bit = fwd_cos_bit_col[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  av1_load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip);
+  av1_round_shift_32_8xn_avx2(in, 16, shift[0], 1);
+  col_txfm(in, out, bit, 2, 2);
+  av1_round_shift_32_8xn_avx2(out, 16, shift[1], 1);
+  av1_fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
+  av1_fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
+  row_txfm(in, out, bit, 1, 1);
+  av1_fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
+  av1_fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
+  av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
+  av1_store_buffer_avx2(in, coeff, 8, 16);
+  (void)bd;
+}
+void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  __m256i in[32], out[32];
+  const TX_SIZE tx_size = TX_16X16;
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int width_div8 = (width >> 3);
+  const int width_div16 = (width >> 4);
+  const int size = (height << 1);
+  switch (tx_type) {
+    case DCT_DCT:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case ADST_DCT:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case DCT_ADST:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case ADST_ADST:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case FLIPADST_DCT:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case DCT_FLIPADST:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case FLIPADST_FLIPADST:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case ADST_FLIPADST:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case FLIPADST_ADST:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case IDTX:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case V_DCT:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case H_DCT:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case V_ADST:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case H_ADST:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case V_FLIPADST:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    case H_FLIPADST:
+      av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+      av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+      av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                      width_div8);
+      av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                       width_div8);
+      av1_fwd_txfm_transpose_16x16_avx2(out, in);
+      av1_store_buffer_avx2(in, coeff, 8, 32);
+      break;
+    default: assert(0);
+  }
+  (void)bd;
+}
+static INLINE void av1_fdct32_avx2(__m256i *input, __m256i *output,
+                                   const int8_t cos_bit, const int instride,
+                                   const int outstride) {
+  __m256i buf0[32];
+  __m256i buf1[32];
+  const int32_t *cospi;
+  int startidx = 0 * instride;
+  int endidx = 31 * instride;
+  // stage 0
+  // stage 1
+  buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
+  buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]);
+
+  // stage 2
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]);
+  buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
+  buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]);
+  buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
+  buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]);
+  buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
+  buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]);
+  buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
+  buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]);
+  buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
+  buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]);
+  buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
+  buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]);
+  buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]);
+  buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]);
+  buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+  buf0[18] = buf1[18];
+  buf0[19] = buf1[19];
+  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+                    buf0[27], cos_bit);
+  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+                    buf0[26], cos_bit);
+  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+                    buf0[25], cos_bit);
+  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+                    buf0[24], cos_bit);
+  buf0[28] = buf1[28];
+  buf0[29] = buf1[29];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 3
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
+  buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
+  buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
+  buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
+  buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
+  buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
+  buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
+  buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
+  buf1[8] = buf0[8];
+  buf1[9] = buf0[9];
+  btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+                    buf1[13], cos_bit);
+  btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+                    buf1[12], cos_bit);
+  buf1[14] = buf0[14];
+  buf1[15] = buf0[15];
+  buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
+  buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
+  buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
+  buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
+  buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
+  buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
+  buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
+  buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
+  buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
+  buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
+  buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
+  buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
+  buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
+  buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
+  buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
+  buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);
+
+  // stage 4
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
+  buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
+  buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
+  buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
+  buf0[4] = buf1[4];
+  btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+                    cos_bit);
+  buf0[7] = buf1[7];
+  buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]);
+  buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
+  buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]);
+  buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
+  buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
+  buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
+  buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
+  buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+  btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+                    buf0[29], cos_bit);
+  btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+                    buf0[28], cos_bit);
+  btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+                    buf0[27], cos_bit);
+  btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+                    buf0[26], cos_bit);
+  buf0[22] = buf1[22];
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[25] = buf1[25];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 5
+  cospi = cospi_arr(cos_bit);
+  btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+                    cos_bit);
+  btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3],
+                    cos_bit);
+  buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
+  buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]);
+  buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]);
+  buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
+  buf1[8] = buf0[8];
+  btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
+                    cos_bit);
+  btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+                    buf1[13], cos_bit);
+  buf1[11] = buf0[11];
+  buf1[12] = buf0[12];
+  buf1[15] = buf0[15];
+  buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
+  buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
+  buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
+  buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
+  buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
+  buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
+  buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
+  buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
+  buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
+  buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
+  buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
+  buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
+  buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
+  buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
+  buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
+  buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);
+
+  // stage 6
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+  btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7],
+                    cos_bit);
+  btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6],
+                    cos_bit);
+  buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]);
+  buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]);
+  buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]);
+  buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
+  buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
+  buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]);
+  buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]);
+  buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
+  buf0[16] = buf1[16];
+  btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+                    buf0[30], cos_bit);
+  btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+                    buf0[29], cos_bit);
+  buf0[19] = buf1[19];
+  buf0[20] = buf1[20];
+  btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+                    buf0[26], cos_bit);
+  btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+                    buf0[25], cos_bit);
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[27] = buf1[27];
+  buf0[28] = buf1[28];
+  buf0[31] = buf1[31];
+
+  // stage 7
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = buf0[0];
+  buf1[1] = buf0[1];
+  buf1[2] = buf0[2];
+  buf1[3] = buf0[3];
+  buf1[4] = buf0[4];
+  buf1[5] = buf0[5];
+  buf1[6] = buf0[6];
+  buf1[7] = buf0[7];
+  btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15],
+                    cos_bit);
+  btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14],
+                    cos_bit);
+  btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10],
+                    buf1[13], cos_bit);
+  btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11],
+                    buf1[12], cos_bit);
+  buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
+  buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]);
+  buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]);
+  buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
+  buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
+  buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]);
+  buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]);
+  buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
+  buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
+  buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]);
+  buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]);
+  buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
+  buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
+  buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]);
+  buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]);
+  buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);
+
+  // stage 8
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+  buf0[4] = buf1[4];
+  buf0[5] = buf1[5];
+  buf0[6] = buf1[6];
+  buf0[7] = buf1[7];
+  buf0[8] = buf1[8];
+  buf0[9] = buf1[9];
+  buf0[10] = buf1[10];
+  buf0[11] = buf1[11];
+  buf0[12] = buf1[12];
+  buf0[13] = buf1[13];
+  buf0[14] = buf1[14];
+  buf0[15] = buf1[15];
+  btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31],
+                    cos_bit);
+  btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17],
+                    buf0[30], cos_bit);
+  btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18],
+                    buf0[29], cos_bit);
+  btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19],
+                    buf0[28], cos_bit);
+  btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20],
+                    buf0[27], cos_bit);
+  btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21],
+                    buf0[26], cos_bit);
+  btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22],
+                    buf0[25], cos_bit);
+  btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24],
+                    cos_bit);
+
+  startidx = 0 * outstride;
+  endidx = 31 * outstride;
+  // stage 9
+  output[startidx] = buf0[0];
+  output[endidx] = buf0[31];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[16];
+  output[endidx] = buf0[15];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[8];
+  output[endidx] = buf0[23];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[24];
+  output[endidx] = buf0[7];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[4];
+  output[endidx] = buf0[27];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[20];
+  output[endidx] = buf0[11];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[12];
+  output[endidx] = buf0[19];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[28];
+  output[endidx] = buf0[3];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[2];
+  output[endidx] = buf0[29];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[18];
+  output[endidx] = buf0[13];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[10];
+  output[endidx] = buf0[21];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[26];
+  output[endidx] = buf0[5];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[6];
+  output[endidx] = buf0[25];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[22];
+  output[endidx] = buf0[9];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[14];
+  output[endidx] = buf0[17];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = buf0[30];
+  output[endidx] = buf0[1];
+}
+static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output,
+                                  const int8_t cos_bit, int instride,
+                                  int outstride) {
+  (void)cos_bit;
+  for (int i = 0; i < 32; i += 8) {
+    output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2);
+    output[(i + 1) * outstride] =
+        _mm256_slli_epi32(input[(i + 1) * instride], 2);
+    output[(i + 2) * outstride] =
+        _mm256_slli_epi32(input[(i + 2) * instride], 2);
+    output[(i + 3) * outstride] =
+        _mm256_slli_epi32(input[(i + 3) * instride], 2);
+    output[(i + 4) * outstride] =
+        _mm256_slli_epi32(input[(i + 4) * instride], 2);
+    output[(i + 5) * outstride] =
+        _mm256_slli_epi32(input[(i + 5) * instride], 2);
+    output[(i + 6) * outstride] =
+        _mm256_slli_epi32(input[(i + 6) * instride], 2);
+    output[(i + 7) * outstride] =
+        _mm256_slli_epi32(input[(i + 7) * instride], 2);
+  }
+}
+static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = {
+  av1_fdct32_avx2,  // DCT_DCT
+  NULL,             // ADST_DCT
+  NULL,             // DCT_ADST
+  NULL,             // ADST_ADST
+  NULL,             // FLIPADST_DCT
+  NULL,             // DCT_FLIPADST
+  NULL,             // FLIPADST_FLIPADST
+  NULL,             // ADST_FLIPADST
+  NULL,             // FLIPADST_ADST
+  idtx32x32_avx2,   // IDTX
+  NULL,             // V_DCT
+  NULL,             // H_DCT
+  NULL,             // V_ADST
+  NULL,             // H_ADST
+  NULL,             // V_FLIPADST
+  NULL              // H_FLIPADST
+};
+static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = {
+  av1_fdct32_avx2,  // DCT_DCT
+  NULL,             // ADST_DCT
+  NULL,             // DCT_ADST
+  NULL,             // ADST_ADST
+  NULL,             // FLIPADST_DCT
+  NULL,             // DCT_FLIPADST
+  NULL,             // FLIPADST_FLIPADST
+  NULL,             // ADST_FLIPADST
+  NULL,             // FLIPADST_ADST
+  idtx32x32_avx2,   // IDTX
+  NULL,             // V_DCT
+  NULL,             // H_DCT
+  NULL,             // V_ADST
+  NULL,             // H_ADST
+  NULL,             // V_FLIPADST
+  NULL              // H_FLIPADST
+};
+void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m256i buf0[128], buf1[128];
+  const int tx_size = TX_32X32;
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type];
+  int r, c;
+  const int width_div16 = (width >> 4);
+  const int width_div8 = (width >> 3);
+
+  for (int i = 0; i < width_div16; i++) {
+    av1_load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height,
+                              width_div8, 0, 0);
+    av1_round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8);
+    av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0],
+                                width_div8);
+    col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8,
+             width_div8);
+    col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
+             width_div8);
+    av1_round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8);
+    av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1],
+                                width_div8);
+  }
+
+  for (r = 0; r < height; r += 8) {
+    for (c = 0; c < width_div8; c++) {
+      av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+                                      &buf1[c * 8 * width_div8 + (r >> 3)],
+                                      width_div8, width_div8);
+    }
+  }
+
+  for (int i = 0; i < width_div16; i++) {
+    row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8,
+             width_div8);
+    row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row, width_div8,
+             width_div8);
+    av1_round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8);
+    av1_round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2],
+                                width_div8);
+  }
+
+  for (r = 0; r < height; r += 8) {
+    for (c = 0; c < width_div8; c++) {
+      av1_fwd_txfm_transpose_8x8_avx2(&buf1[r * width_div8 + c],
+                                      &buf0[c * 8 * width_div8 + (r >> 3)],
+                                      width_div8, width_div8);
+    }
+  }
+
+  av1_store_buffer_avx2(buf0, output, 8, 128);
+}
+static INLINE void av1_fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
+                                          __m256i *cospi_m32,
+                                          __m256i *cospi_p32,
+                                          const __m256i *__rounding,
+                                          int8_t cos_bit) {
+  x2[0] = _mm256_add_epi32(x1[0], x1[31]);
+  x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
+  x2[1] = _mm256_add_epi32(x1[1], x1[30]);
+  x2[30] = _mm256_sub_epi32(x1[1], x1[30]);
+  x2[2] = _mm256_add_epi32(x1[2], x1[29]);
+  x2[29] = _mm256_sub_epi32(x1[2], x1[29]);
+  x2[3] = _mm256_add_epi32(x1[3], x1[28]);
+  x2[28] = _mm256_sub_epi32(x1[3], x1[28]);
+  x2[4] = _mm256_add_epi32(x1[4], x1[27]);
+  x2[27] = _mm256_sub_epi32(x1[4], x1[27]);
+  x2[5] = _mm256_add_epi32(x1[5], x1[26]);
+  x2[26] = _mm256_sub_epi32(x1[5], x1[26]);
+  x2[6] = _mm256_add_epi32(x1[6], x1[25]);
+  x2[25] = _mm256_sub_epi32(x1[6], x1[25]);
+  x2[7] = _mm256_add_epi32(x1[7], x1[24]);
+  x2[24] = _mm256_sub_epi32(x1[7], x1[24]);
+  x2[8] = _mm256_add_epi32(x1[8], x1[23]);
+  x2[23] = _mm256_sub_epi32(x1[8], x1[23]);
+  x2[9] = _mm256_add_epi32(x1[9], x1[22]);
+  x2[22] = _mm256_sub_epi32(x1[9], x1[22]);
+  x2[10] = _mm256_add_epi32(x1[10], x1[21]);
+  x2[21] = _mm256_sub_epi32(x1[10], x1[21]);
+  x2[11] = _mm256_add_epi32(x1[11], x1[20]);
+  x2[20] = _mm256_sub_epi32(x1[11], x1[20]);
+  x2[12] = _mm256_add_epi32(x1[12], x1[19]);
+  x2[19] = _mm256_sub_epi32(x1[12], x1[19]);
+  x2[13] = _mm256_add_epi32(x1[13], x1[18]);
+  x2[18] = _mm256_sub_epi32(x1[13], x1[18]);
+  x2[14] = _mm256_add_epi32(x1[14], x1[17]);
+  x2[17] = _mm256_sub_epi32(x1[14], x1[17]);
+  x2[15] = _mm256_add_epi32(x1[15], x1[16]);
+  x2[16] = _mm256_sub_epi32(x1[15], x1[16]);
+  x2[32] = x1[32];
+  x2[33] = x1[33];
+  x2[34] = x1[34];
+  x2[35] = x1[35];
+  x2[36] = x1[36];
+  x2[37] = x1[37];
+  x2[38] = x1[38];
+  x2[39] = x1[39];
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48],
+                        *__rounding, cos_bit);
+  x2[56] = x1[56];
+  x2[57] = x1[57];
+  x2[58] = x1[58];
+  x2[59] = x1[59];
+  x2[60] = x1[60];
+  x2[61] = x1[61];
+  x2[62] = x1[62];
+  x2[63] = x1[63];
+}
+static INLINE void av1_fdct64_stage3_avx2(__m256i *x2, __m256i *x3,
+                                          __m256i *cospi_m32,
+                                          __m256i *cospi_p32,
+                                          const __m256i *__rounding,
+                                          int8_t cos_bit) {
+  x3[0] = _mm256_add_epi32(x2[0], x2[15]);
+  x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
+  x3[1] = _mm256_add_epi32(x2[1], x2[14]);
+  x3[14] = _mm256_sub_epi32(x2[1], x2[14]);
+  x3[2] = _mm256_add_epi32(x2[2], x2[13]);
+  x3[13] = _mm256_sub_epi32(x2[2], x2[13]);
+  x3[3] = _mm256_add_epi32(x2[3], x2[12]);
+  x3[12] = _mm256_sub_epi32(x2[3], x2[12]);
+  x3[4] = _mm256_add_epi32(x2[4], x2[11]);
+  x3[11] = _mm256_sub_epi32(x2[4], x2[11]);
+  x3[5] = _mm256_add_epi32(x2[5], x2[10]);
+  x3[10] = _mm256_sub_epi32(x2[5], x2[10]);
+  x3[6] = _mm256_add_epi32(x2[6], x2[9]);
+  x3[9] = _mm256_sub_epi32(x2[6], x2[9]);
+  x3[7] = _mm256_add_epi32(x2[7], x2[8]);
+  x3[8] = _mm256_sub_epi32(x2[7], x2[8]);
+  x3[16] = x2[16];
+  x3[17] = x2[17];
+  x3[18] = x2[18];
+  x3[19] = x2[19];
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24],
+                        *__rounding, cos_bit);
+  x3[28] = x2[28];
+  x3[29] = x2[29];
+  x3[30] = x2[30];
+  x3[31] = x2[31];
+  x3[32] = _mm256_add_epi32(x2[32], x2[47]);
+  x3[47] = _mm256_sub_epi32(x2[32], x2[47]);
+  x3[33] = _mm256_add_epi32(x2[33], x2[46]);
+  x3[46] = _mm256_sub_epi32(x2[33], x2[46]);
+  x3[34] = _mm256_add_epi32(x2[34], x2[45]);
+  x3[45] = _mm256_sub_epi32(x2[34], x2[45]);
+  x3[35] = _mm256_add_epi32(x2[35], x2[44]);
+  x3[44] = _mm256_sub_epi32(x2[35], x2[44]);
+  x3[36] = _mm256_add_epi32(x2[36], x2[43]);
+  x3[43] = _mm256_sub_epi32(x2[36], x2[43]);
+  x3[37] = _mm256_add_epi32(x2[37], x2[42]);
+  x3[42] = _mm256_sub_epi32(x2[37], x2[42]);
+  x3[38] = _mm256_add_epi32(x2[38], x2[41]);
+  x3[41] = _mm256_sub_epi32(x2[38], x2[41]);
+  x3[39] = _mm256_add_epi32(x2[39], x2[40]);
+  x3[40] = _mm256_sub_epi32(x2[39], x2[40]);
+  x3[48] = _mm256_sub_epi32(x2[63], x2[48]);
+  x3[63] = _mm256_add_epi32(x2[63], x2[48]);
+  x3[49] = _mm256_sub_epi32(x2[62], x2[49]);
+  x3[62] = _mm256_add_epi32(x2[62], x2[49]);
+  x3[50] = _mm256_sub_epi32(x2[61], x2[50]);
+  x3[61] = _mm256_add_epi32(x2[61], x2[50]);
+  x3[51] = _mm256_sub_epi32(x2[60], x2[51]);
+  x3[60] = _mm256_add_epi32(x2[60], x2[51]);
+  x3[52] = _mm256_sub_epi32(x2[59], x2[52]);
+  x3[59] = _mm256_add_epi32(x2[59], x2[52]);
+  x3[53] = _mm256_sub_epi32(x2[58], x2[53]);
+  x3[58] = _mm256_add_epi32(x2[58], x2[53]);
+  x3[54] = _mm256_sub_epi32(x2[57], x2[54]);
+  x3[57] = _mm256_add_epi32(x2[57], x2[54]);
+  x3[55] = _mm256_sub_epi32(x2[56], x2[55]);
+  x3[56] = _mm256_add_epi32(x2[56], x2[55]);
+}
+static INLINE void av1_fdct64_stage4_avx2(
+    __m256i *x3, __m256i *x4, __m256i *cospi_m32, __m256i *cospi_p32,
+    __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
+    const __m256i *__rounding, int8_t cos_bit) {
+  x4[0] = _mm256_add_epi32(x3[0], x3[7]);
+  x4[7] = _mm256_sub_epi32(x3[0], x3[7]);
+  x4[1] = _mm256_add_epi32(x3[1], x3[6]);
+  x4[6] = _mm256_sub_epi32(x3[1], x3[6]);
+  x4[2] = _mm256_add_epi32(x3[2], x3[5]);
+  x4[5] = _mm256_sub_epi32(x3[2], x3[5]);
+  x4[3] = _mm256_add_epi32(x3[3], x3[4]);
+  x4[4] = _mm256_sub_epi32(x3[3], x3[4]);
+  x4[8] = x3[8];
+  x4[9] = x3[9];
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12],
+                        *__rounding, cos_bit);
+  x4[14] = x3[14];
+  x4[15] = x3[15];
+  x4[16] = _mm256_add_epi32(x3[16], x3[23]);
+  x4[23] = _mm256_sub_epi32(x3[16], x3[23]);
+  x4[17] = _mm256_add_epi32(x3[17], x3[22]);
+  x4[22] = _mm256_sub_epi32(x3[17], x3[22]);
+  x4[18] = _mm256_add_epi32(x3[18], x3[21]);
+  x4[21] = _mm256_sub_epi32(x3[18], x3[21]);
+  x4[19] = _mm256_add_epi32(x3[19], x3[20]);
+  x4[20] = _mm256_sub_epi32(x3[19], x3[20]);
+  x4[24] = _mm256_sub_epi32(x3[31], x3[24]);
+  x4[31] = _mm256_add_epi32(x3[31], x3[24]);
+  x4[25] = _mm256_sub_epi32(x3[30], x3[25]);
+  x4[30] = _mm256_add_epi32(x3[30], x3[25]);
+  x4[26] = _mm256_sub_epi32(x3[29], x3[26]);
+  x4[29] = _mm256_add_epi32(x3[29], x3[26]);
+  x4[27] = _mm256_sub_epi32(x3[28], x3[27]);
+  x4[28] = _mm256_add_epi32(x3[28], x3[27]);
+  x4[32] = x3[32];
+  x4[33] = x3[33];
+  x4[34] = x3[34];
+  x4[35] = x3[35];
+  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52],
+                        *__rounding, cos_bit);
+  x4[44] = x3[44];
+  x4[45] = x3[45];
+  x4[46] = x3[46];
+  x4[47] = x3[47];
+  x4[48] = x3[48];
+  x4[49] = x3[49];
+  x4[50] = x3[50];
+  x4[51] = x3[51];
+  x4[60] = x3[60];
+  x4[61] = x3[61];
+  x4[62] = x3[62];
+  x4[63] = x3[63];
+}
+static INLINE void av1_fdct64_stage5_avx2(
+    __m256i *x4, __m256i *x5, __m256i *cospi_m32, __m256i *cospi_p32,
+    __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
+    const __m256i *__rounding, int8_t cos_bit) {
+  x5[0] = _mm256_add_epi32(x4[0], x4[3]);
+  x5[3] = _mm256_sub_epi32(x4[0], x4[3]);
+  x5[1] = _mm256_add_epi32(x4[1], x4[2]);
+  x5[2] = _mm256_sub_epi32(x4[1], x4[2]);
+  x5[4] = x4[4];
+  btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6],
+                        *__rounding, cos_bit);
+  x5[7] = x4[7];
+  x5[8] = _mm256_add_epi32(x4[8], x4[11]);
+  x5[11] = _mm256_sub_epi32(x4[8], x4[11]);
+  x5[9] = _mm256_add_epi32(x4[9], x4[10]);
+  x5[10] = _mm256_sub_epi32(x4[9], x4[10]);
+  x5[12] = _mm256_sub_epi32(x4[15], x4[12]);
+  x5[15] = _mm256_add_epi32(x4[15], x4[12]);
+  x5[13] = _mm256_sub_epi32(x4[14], x4[13]);
+  x5[14] = _mm256_add_epi32(x4[14], x4[13]);
+  x5[16] = x4[16];
+  x5[17] = x4[17];
+  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26],
+                        *__rounding, cos_bit);
+  x5[22] = x4[22];
+  x5[23] = x4[23];
+  x5[24] = x4[24];
+  x5[25] = x4[25];
+  x5[30] = x4[30];
+  x5[31] = x4[31];
+  x5[32] = _mm256_add_epi32(x4[32], x4[39]);
+  x5[39] = _mm256_sub_epi32(x4[32], x4[39]);
+  x5[33] = _mm256_add_epi32(x4[33], x4[38]);
+  x5[38] = _mm256_sub_epi32(x4[33], x4[38]);
+  x5[34] = _mm256_add_epi32(x4[34], x4[37]);
+  x5[37] = _mm256_sub_epi32(x4[34], x4[37]);
+  x5[35] = _mm256_add_epi32(x4[35], x4[36]);
+  x5[36] = _mm256_sub_epi32(x4[35], x4[36]);
+  x5[40] = _mm256_sub_epi32(x4[47], x4[40]);
+  x5[47] = _mm256_add_epi32(x4[47], x4[40]);
+  x5[41] = _mm256_sub_epi32(x4[46], x4[41]);
+  x5[46] = _mm256_add_epi32(x4[46], x4[41]);
+  x5[42] = _mm256_sub_epi32(x4[45], x4[42]);
+  x5[45] = _mm256_add_epi32(x4[45], x4[42]);
+  x5[43] = _mm256_sub_epi32(x4[44], x4[43]);
+  x5[44] = _mm256_add_epi32(x4[44], x4[43]);
+  x5[48] = _mm256_add_epi32(x4[48], x4[55]);
+  x5[55] = _mm256_sub_epi32(x4[48], x4[55]);
+  x5[49] = _mm256_add_epi32(x4[49], x4[54]);
+  x5[54] = _mm256_sub_epi32(x4[49], x4[54]);
+  x5[50] = _mm256_add_epi32(x4[50], x4[53]);
+  x5[53] = _mm256_sub_epi32(x4[50], x4[53]);
+  x5[51] = _mm256_add_epi32(x4[51], x4[52]);
+  x5[52] = _mm256_sub_epi32(x4[51], x4[52]);
+  x5[56] = _mm256_sub_epi32(x4[63], x4[56]);
+  x5[63] = _mm256_add_epi32(x4[63], x4[56]);
+  x5[57] = _mm256_sub_epi32(x4[62], x4[57]);
+  x5[62] = _mm256_add_epi32(x4[62], x4[57]);
+  x5[58] = _mm256_sub_epi32(x4[61], x4[58]);
+  x5[61] = _mm256_add_epi32(x4[61], x4[58]);
+  x5[59] = _mm256_sub_epi32(x4[60], x4[59]);
+  x5[60] = _mm256_add_epi32(x4[60], x4[59]);
+}
+static INLINE void av1_fdct64_stage6_avx2(
+    __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32,
+    __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
+    __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56,
+    __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24,
+    const __m256i *__rounding, int8_t cos_bit) {
+  btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3],
+                        *__rounding, cos_bit);
+  x6[4] = _mm256_add_epi32(x5[4], x5[5]);
+  x6[5] = _mm256_sub_epi32(x5[4], x5[5]);
+  x6[6] = _mm256_sub_epi32(x5[7], x5[6]);
+  x6[7] = _mm256_add_epi32(x5[7], x5[6]);
+  x6[8] = x5[8];
+  btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13],
+                        *__rounding, cos_bit);
+  x6[11] = x5[11];
+  x6[12] = x5[12];
+  x6[15] = x5[15];
+  x6[16] = _mm256_add_epi32(x5[16], x5[19]);
+  x6[19] = _mm256_sub_epi32(x5[16], x5[19]);
+  x6[17] = _mm256_add_epi32(x5[17], x5[18]);
+  x6[18] = _mm256_sub_epi32(x5[17], x5[18]);
+  x6[20] = _mm256_sub_epi32(x5[23], x5[20]);
+  x6[23] = _mm256_add_epi32(x5[23], x5[20]);
+  x6[21] = _mm256_sub_epi32(x5[22], x5[21]);
+  x6[22] = _mm256_add_epi32(x5[22], x5[21]);
+  x6[24] = _mm256_add_epi32(x5[24], x5[27]);
+  x6[27] = _mm256_sub_epi32(x5[24], x5[27]);
+  x6[25] = _mm256_add_epi32(x5[25], x5[26]);
+  x6[26] = _mm256_sub_epi32(x5[25], x5[26]);
+  x6[28] = _mm256_sub_epi32(x5[31], x5[28]);
+  x6[31] = _mm256_add_epi32(x5[31], x5[28]);
+  x6[29] = _mm256_sub_epi32(x5[30], x5[29]);
+  x6[30] = _mm256_add_epi32(x5[30], x5[29]);
+  x6[32] = x5[32];
+  x6[33] = x5[33];
+  btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58],
+                        *__rounding, cos_bit);
+  x6[38] = x5[38];
+  x6[39] = x5[39];
+  x6[40] = x5[40];
+  x6[41] = x5[41];
+  btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50],
+                        *__rounding, cos_bit);
+  x6[46] = x5[46];
+  x6[47] = x5[47];
+  x6[48] = x5[48];
+  x6[49] = x5[49];
+  x6[54] = x5[54];
+  x6[55] = x5[55];
+  x6[56] = x5[56];
+  x6[57] = x5[57];
+  x6[62] = x5[62];
+  x6[63] = x5[63];
+}
+static INLINE void av1_fdct64_stage7_avx2(
+    __m256i *x6, __m256i *x7, __m256i *cospi_p08, __m256i *cospi_p56,
+    __m256i *cospi_p40, __m256i *cospi_p24, __m256i *cospi_m08,
+    __m256i *cospi_m56, __m256i *cospi_m40, __m256i *cospi_m24,
+    const __m256i *__rounding, int8_t cos_bit) {
+  x7[0] = x6[0];
+  x7[1] = x6[1];
+  x7[2] = x6[2];
+  x7[3] = x6[3];
+  btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6],
+                        *__rounding, cos_bit);
+  x7[8] = _mm256_add_epi32(x6[8], x6[9]);
+  x7[9] = _mm256_sub_epi32(x6[8], x6[9]);
+  x7[10] = _mm256_sub_epi32(x6[11], x6[10]);
+  x7[11] = _mm256_add_epi32(x6[11], x6[10]);
+  x7[12] = _mm256_add_epi32(x6[12], x6[13]);
+  x7[13] = _mm256_sub_epi32(x6[12], x6[13]);
+  x7[14] = _mm256_sub_epi32(x6[15], x6[14]);
+  x7[15] = _mm256_add_epi32(x6[15], x6[14]);
+  x7[16] = x6[16];
+  btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29],
+                        *__rounding, cos_bit);
+  x7[19] = x6[19];
+  x7[20] = x6[20];
+  btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25],
+                        *__rounding, cos_bit);
+  x7[23] = x6[23];
+  x7[24] = x6[24];
+  x7[27] = x6[27];
+  x7[28] = x6[28];
+  x7[31] = x6[31];
+  x7[32] = _mm256_add_epi32(x6[32], x6[35]);
+  x7[35] = _mm256_sub_epi32(x6[32], x6[35]);
+  x7[33] = _mm256_add_epi32(x6[33], x6[34]);
+  x7[34] = _mm256_sub_epi32(x6[33], x6[34]);
+  x7[36] = _mm256_sub_epi32(x6[39], x6[36]);
+  x7[39] = _mm256_add_epi32(x6[39], x6[36]);
+  x7[37] = _mm256_sub_epi32(x6[38], x6[37]);
+  x7[38] = _mm256_add_epi32(x6[38], x6[37]);
+  x7[40] = _mm256_add_epi32(x6[40], x6[43]);
+  x7[43] = _mm256_sub_epi32(x6[40], x6[43]);
+  x7[41] = _mm256_add_epi32(x6[41], x6[42]);
+  x7[42] = _mm256_sub_epi32(x6[41], x6[42]);
+  x7[44] = _mm256_sub_epi32(x6[47], x6[44]);
+  x7[47] = _mm256_add_epi32(x6[47], x6[44]);
+  x7[45] = _mm256_sub_epi32(x6[46], x6[45]);
+  x7[46] = _mm256_add_epi32(x6[46], x6[45]);
+  x7[48] = _mm256_add_epi32(x6[48], x6[51]);
+  x7[51] = _mm256_sub_epi32(x6[48], x6[51]);
+  x7[49] = _mm256_add_epi32(x6[49], x6[50]);
+  x7[50] = _mm256_sub_epi32(x6[49], x6[50]);
+  x7[52] = _mm256_sub_epi32(x6[55], x6[52]);
+  x7[55] = _mm256_add_epi32(x6[55], x6[52]);
+  x7[53] = _mm256_sub_epi32(x6[54], x6[53]);
+  x7[54] = _mm256_add_epi32(x6[54], x6[53]);
+  x7[56] = _mm256_add_epi32(x6[56], x6[59]);
+  x7[59] = _mm256_sub_epi32(x6[56], x6[59]);
+  x7[57] = _mm256_add_epi32(x6[57], x6[58]);
+  x7[58] = _mm256_sub_epi32(x6[57], x6[58]);
+  x7[60] = _mm256_sub_epi32(x6[63], x6[60]);
+  x7[63] = _mm256_add_epi32(x6[63], x6[60]);
+  x7[61] = _mm256_sub_epi32(x6[62], x6[61]);
+  x7[62] = _mm256_add_epi32(x6[62], x6[61]);
+}
+static INLINE void av1_fdct64_stage8_avx2(__m256i *x7, __m256i *x8,
+                                          const int32_t *cospi,
+                                          const __m256i *__rounding,
+                                          int8_t cos_bit) {
+  __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+  __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+  __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+  __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+  __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+  __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+  __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+  __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+  __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+  __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+  __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+  __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+  __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+  __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+  __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+  __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+
+  x8[0] = x7[0];
+  x8[1] = x7[1];
+  x8[2] = x7[2];
+  x8[3] = x7[3];
+  x8[4] = x7[4];
+  x8[5] = x7[5];
+  x8[6] = x7[6];
+  x8[7] = x7[7];
+
+  btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12],
+                        *__rounding, cos_bit);
+  x8[16] = _mm256_add_epi32(x7[16], x7[17]);
+  x8[17] = _mm256_sub_epi32(x7[16], x7[17]);
+  x8[18] = _mm256_sub_epi32(x7[19], x7[18]);
+  x8[19] = _mm256_add_epi32(x7[19], x7[18]);
+  x8[20] = _mm256_add_epi32(x7[20], x7[21]);
+  x8[21] = _mm256_sub_epi32(x7[20], x7[21]);
+  x8[22] = _mm256_sub_epi32(x7[23], x7[22]);
+  x8[23] = _mm256_add_epi32(x7[23], x7[22]);
+  x8[24] = _mm256_add_epi32(x7[24], x7[25]);
+  x8[25] = _mm256_sub_epi32(x7[24], x7[25]);
+  x8[26] = _mm256_sub_epi32(x7[27], x7[26]);
+  x8[27] = _mm256_add_epi32(x7[27], x7[26]);
+  x8[28] = _mm256_add_epi32(x7[28], x7[29]);
+  x8[29] = _mm256_sub_epi32(x7[28], x7[29]);
+  x8[30] = _mm256_sub_epi32(x7[31], x7[30]);
+  x8[31] = _mm256_add_epi32(x7[31], x7[30]);
+  x8[32] = x7[32];
+  btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+                        *__rounding, cos_bit);
+  x8[35] = x7[35];
+  x8[36] = x7[36];
+  btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+                        *__rounding, cos_bit);
+  x8[39] = x7[39];
+  x8[40] = x7[40];
+  btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+                        *__rounding, cos_bit);
+  x8[43] = x7[43];
+  x8[44] = x7[44];
+  btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+                        *__rounding, cos_bit);
+  x8[47] = x7[47];
+  x8[48] = x7[48];
+  x8[51] = x7[51];
+  x8[52] = x7[52];
+  x8[55] = x7[55];
+  x8[56] = x7[56];
+  x8[59] = x7[59];
+  x8[60] = x7[60];
+  x8[63] = x7[63];
+}
+static INLINE void av1_fdct64_stage9_avx2(__m256i *x8, __m256i *x9,
+                                          const int32_t *cospi,
+                                          const __m256i *__rounding,
+                                          int8_t cos_bit) {
+  __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+  __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+  __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+  __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+  __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+  __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+  __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+  __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+  __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+  __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+  __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+  __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+  __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+  __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+  __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+  __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+
+  x9[0] = x8[0];
+  x9[1] = x8[1];
+  x9[2] = x8[2];
+  x9[3] = x8[3];
+  x9[4] = x8[4];
+  x9[5] = x8[5];
+  x9[6] = x8[6];
+  x9[7] = x8[7];
+  x9[8] = x8[8];
+  x9[9] = x8[9];
+  x9[10] = x8[10];
+  x9[11] = x8[11];
+  x9[12] = x8[12];
+  x9[13] = x8[13];
+  x9[14] = x8[14];
+  x9[15] = x8[15];
+  btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24],
+                        *__rounding, cos_bit);
+  x9[32] = _mm256_add_epi32(x8[32], x8[33]);
+  x9[33] = _mm256_sub_epi32(x8[32], x8[33]);
+  x9[34] = _mm256_sub_epi32(x8[35], x8[34]);
+  x9[35] = _mm256_add_epi32(x8[35], x8[34]);
+  x9[36] = _mm256_add_epi32(x8[36], x8[37]);
+  x9[37] = _mm256_sub_epi32(x8[36], x8[37]);
+  x9[38] = _mm256_sub_epi32(x8[39], x8[38]);
+  x9[39] = _mm256_add_epi32(x8[39], x8[38]);
+  x9[40] = _mm256_add_epi32(x8[40], x8[41]);
+  x9[41] = _mm256_sub_epi32(x8[40], x8[41]);
+  x9[42] = _mm256_sub_epi32(x8[43], x8[42]);
+  x9[43] = _mm256_add_epi32(x8[43], x8[42]);
+  x9[44] = _mm256_add_epi32(x8[44], x8[45]);
+  x9[45] = _mm256_sub_epi32(x8[44], x8[45]);
+  x9[46] = _mm256_sub_epi32(x8[47], x8[46]);
+  x9[47] = _mm256_add_epi32(x8[47], x8[46]);
+  x9[48] = _mm256_add_epi32(x8[48], x8[49]);
+  x9[49] = _mm256_sub_epi32(x8[48], x8[49]);
+  x9[50] = _mm256_sub_epi32(x8[51], x8[50]);
+  x9[51] = _mm256_add_epi32(x8[51], x8[50]);
+  x9[52] = _mm256_add_epi32(x8[52], x8[53]);
+  x9[53] = _mm256_sub_epi32(x8[52], x8[53]);
+  x9[54] = _mm256_sub_epi32(x8[55], x8[54]);
+  x9[55] = _mm256_add_epi32(x8[55], x8[54]);
+  x9[56] = _mm256_add_epi32(x8[56], x8[57]);
+  x9[57] = _mm256_sub_epi32(x8[56], x8[57]);
+  x9[58] = _mm256_sub_epi32(x8[59], x8[58]);
+  x9[59] = _mm256_add_epi32(x8[59], x8[58]);
+  x9[60] = _mm256_add_epi32(x8[60], x8[61]);
+  x9[61] = _mm256_sub_epi32(x8[60], x8[61]);
+  x9[62] = _mm256_sub_epi32(x8[63], x8[62]);
+  x9[63] = _mm256_add_epi32(x8[63], x8[62]);
+}
+static INLINE void av1_fdct64_stage10_avx2(__m256i *x9, __m256i *x10,
+                                           const int32_t *cospi,
+                                           const __m256i *__rounding,
+                                           int8_t cos_bit) {
+  __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
+  __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+  __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+  __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+  __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+  __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+  __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+  __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+  __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+  __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+  __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+  __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+  __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+  __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+  __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+  __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+  __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+  __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+  __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+  __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+  __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+  __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+  __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+  __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+  __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+  __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+  __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+  __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+  __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+  __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+  __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+  __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+
+  x10[0] = x9[0];
+  x10[1] = x9[1];
+  x10[2] = x9[2];
+  x10[3] = x9[3];
+  x10[4] = x9[4];
+  x10[5] = x9[5];
+  x10[6] = x9[6];
+  x10[7] = x9[7];
+  x10[8] = x9[8];
+  x10[9] = x9[9];
+  x10[10] = x9[10];
+  x10[11] = x9[11];
+  x10[12] = x9[12];
+  x10[13] = x9[13];
+  x10[14] = x9[14];
+  x10[15] = x9[15];
+  x10[16] = x9[16];
+  x10[17] = x9[17];
+  x10[18] = x9[18];
+  x10[19] = x9[19];
+  x10[20] = x9[20];
+  x10[21] = x9[21];
+  x10[22] = x9[22];
+  x10[23] = x9[23];
+  x10[24] = x9[24];
+  x10[25] = x9[25];
+  x10[26] = x9[26];
+  x10[27] = x9[27];
+  x10[28] = x9[28];
+  x10[29] = x9[29];
+  x10[30] = x9[30];
+  x10[31] = x9[31];
+  btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48],
+                        *__rounding, cos_bit);
+}
+static void av1_fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit,
+                            const int instride, const int outstride) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+  __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+  __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+  __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+  __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+  __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+  __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+  __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+  __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+  __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+  __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+  __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+  __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+  __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+  __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+
+  int startidx = 0 * instride;
+  int endidx = 63 * instride;
+  // stage 1
+  __m256i x1[64];
+  x1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[63] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[51] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[49] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[16] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[17] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[18] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[19] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[20] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[21] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[42] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[22] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[23] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[24] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[25] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[26] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[27] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[28] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[29] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[30] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[31] = _mm256_add_epi32(input[startidx], input[endidx]);
+  x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]);
+
+  // stage 2
+  __m256i x2[64];
+  av1_fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+  // stage 3
+  av1_fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+  // stage 4
+  av1_fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+                         &cospi_m48, &__rounding, cos_bit);
+  // stage 5
+  av1_fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+                         &cospi_m48, &__rounding, cos_bit);
+  // stage 6
+  av1_fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48,
+                         &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56,
+                         &cospi_m40, &cospi_p24, &cospi_m24, &__rounding,
+                         cos_bit);
+  // stage 7
+  av1_fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24,
+                         &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24,
+                         &__rounding, cos_bit);
+  // stage 8
+  av1_fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit);
+  // stage 9
+  av1_fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit);
+  // stage 10
+  av1_fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit);
+
+  startidx = 0 * outstride;
+  endidx = 63 * outstride;
+
+  // stage 11
+  output[startidx] = x2[0];
+  output[endidx] = x2[63];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[32];
+  output[endidx] = x2[31];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[16];
+  output[endidx] = x2[47];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[48];
+  output[endidx] = x2[15];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[8];
+  output[endidx] = x2[55];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[40];
+  output[endidx] = x2[23];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[24];
+  output[endidx] = x2[39];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[56];
+  output[endidx] = x2[7];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[4];
+  output[endidx] = x2[59];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[36];
+  output[endidx] = x2[27];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[20];
+  output[endidx] = x2[43];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[52];
+  output[endidx] = x2[11];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[12];
+  output[endidx] = x2[51];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[44];
+  output[endidx] = x2[19];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[28];
+  output[endidx] = x2[35];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[60];
+  output[endidx] = x2[3];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[2];
+  output[endidx] = x2[61];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[34];
+  output[endidx] = x2[29];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[18];
+  output[endidx] = x2[45];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[50];
+  output[endidx] = x2[13];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[10];
+  output[endidx] = x2[53];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[42];
+  output[endidx] = x2[21];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[26];
+  output[endidx] = x2[37];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[58];
+  output[endidx] = x2[5];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[6];
+  output[endidx] = x2[57];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[38];
+  output[endidx] = x2[25];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[22];
+  output[endidx] = x2[41];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[54];
+  output[endidx] = x2[9];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[14];
+  output[endidx] = x2[49];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[46];
+  output[endidx] = x2[17];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[30];
+  output[endidx] = x2[33];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x2[62];
+  output[endidx] = x2[1];
+}
+void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X64;
+  __m256i buf0[512], buf1[512];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = av1_fdct64_avx2;
+  const transform_1d_avx2 row_txfm = av1_fdct64_avx2;
+  const int width_div16 = (width >> 4);
+  const int width_div8 = (width >> 3);
+  int r, c;
+  for (int i = 0; i < width_div16; i++) {
+    av1_load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height,
+                              width_div8, 0, 0);
+    av1_round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8);
+    av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0],
+                                width_div8);
+    col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8);
+    col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
+             width_div8);
+    av1_round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8);
+    av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1],
+                                width_div8);
+  }
+
+  for (r = 0; r < height; r += 8) {
+    for (c = 0; c < width_div8; c++) {
+      av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+                                      &buf1[c * 8 * width_div8 + (r >> 3)],
+                                      width_div8, width_div8);
+    }
+  }
+
+  for (int i = 0; i < 2; i++) {
+    row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8,
+             width_div16);
+    row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8,
+             width_div16);
+    av1_round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2],
+                                width_div16);
+    av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2],
+                                width_div16);
+  }
+
+  for (r = 0; r < (height >> 1); r += 8) {
+    for (c = 0; c < width_div16; c++) {
+      av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div16 + c],
+                                      &buf1[c * 8 * width_div16 + (r >> 3)],
+                                      width_div16, width_div16);
+    }
+  }
+  av1_store_buffer_avx2(buf1, output, 8, 128);
+}
diff --git a/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c b/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c
new file mode 100644
index 0000000..f199b0f
--- /dev/null
+++ b/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c
@@ -0,0 +1,954 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/x86/temporal_filter_constants.h"
+
+// Compute (a-b)**2 for 8 pixels with size 16-bit
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+                                       uint32_t *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
+  const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
+  const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
+  const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
+
+  __m128i dist_first, dist_second;
+
+  dist_first = _mm_sub_epi32(a_first, b_first);
+  dist_second = _mm_sub_epi32(a_second, b_second);
+  dist_first = _mm_mullo_epi32(dist_first, dist_first);
+  dist_second = _mm_mullo_epi32(dist_second, dist_second);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+  _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
+  __m128i dist_reg, dist_left, dist_right;
+
+  dist_reg = _mm_loadu_si128((const __m128i *)dist);
+  dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
+  dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
+
+  *sum = _mm_add_epi32(dist_reg, dist_left);
+  *sum = _mm_add_epi32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
+                                    __m128i *sum_second) {
+  highbd_get_sum_4(dist, sum_first);
+  highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values, plus
+// however many values from y/uv plane are).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
+                                    const __m128i *mul_constants,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+  const __m128i rounding_u32 = _mm_set1_epi32(rounding);
+  const __m128i weight_u32 = _mm_set1_epi32(weight);
+  const __m128i sixteen = _mm_set1_epi32(16);
+  const __m128i zero = _mm_setzero_si128();
+
+  // modifier * 3 / index;
+  const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero);
+  const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero);
+  const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero);
+  const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero);
+
+  const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo);
+  const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32);
+  const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi);
+  const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32);
+
+  // Now we have
+  //   mul_lo: 00 a1 00 a0
+  //   mul_hi: 00 a3 00 a2
+  // Unpack as 64 bit words to get even and odd elements
+  //   unpack_lo: 00 a2 00 a0
+  //   unpack_hi: 00 a3 00 a1
+  // Then we can shift and OR the results to get everything in 32-bits
+  const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div);
+  const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div);
+  const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4);
+  const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift);
+
+  // Round
+  *output = _mm_add_epi32(mul, rounding_u32);
+  *output = _mm_srl_epi32(*output, strength_u128);
+
+  // Multiply with the weight
+  *output = _mm_min_epu32(*output, sixteen);
+  *output = _mm_sub_epi32(sixteen, *output);
+  *output = _mm_mullo_epi32(*output, weight_u32);
+}
+
+static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1,
+                                    const __m128i *sum_0_u32,
+                                    const __m128i *sum_1_u32,
+                                    const __m128i *mul_constants_0,
+                                    const __m128i *mul_constants_1,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
+                   weight);
+  highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
+                   weight);
+}
+
+// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
+                                                 const __m128i sum_second_u32,
+                                                 const uint16_t *pred,
+                                                 uint16_t *count,
+                                                 uint32_t *accumulator) {
+  // Cast down to 16-bit ints
+  const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32);
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred);
+  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+
+  __m128i pred_0_u32, pred_1_u32;
+  __m128i accum_0_u32, accum_1_u32;
+
+  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+  _mm_storeu_si128((__m128i *)count, count_u16);
+
+  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
+
+  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) {
+  *dist_reg = _mm_loadu_si128((const __m128i *)dist);
+}
+
+static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first,
+                                      __m128i *reg_second) {
+  highbd_read_dist_4(dist, reg_first);
+  highbd_read_dist_4(dist + 4, reg_second);
+}
+
+static INLINE void highbd_read_chroma_dist_row_8(
+    int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first,
+    __m128i *u_second, __m128i *v_first, __m128i *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 8 entries from chroma.
+    highbd_read_dist_8(u_dist, u_first, u_second);
+    highbd_read_dist_8(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 8 entries
+    __m128i u_reg, v_reg;
+
+    highbd_read_dist_4(u_dist, &u_reg);
+
+    *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
+    *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
+
+    highbd_read_dist_4(v_dist, &v_reg);
+
+    *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
+    *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
+  }
+}
+
+static void av1_highbd_apply_temporal_filter_luma_8(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
+    uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
+    const uint32_t *v_dist, const uint32_t *const *neighbors_first,
+    const uint32_t *const *neighbors_second, int top_weight,
+    int bottom_weight) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  __m128i mul_first, mul_second;
+
+  __m128i sum_row_1_first, sum_row_1_second;
+  __m128i sum_row_2_first, sum_row_2_second;
+  __m128i sum_row_3_first, sum_row_3_second;
+
+  __m128i u_first, u_second;
+  __m128i v_first, v_second;
+
+  __m128i sum_row_first;
+  __m128i sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 4 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(block_width == 8);
+
+  (void)block_width;
+
+  // First row
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+  // Add luma values
+  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
+  // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
+  sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
+  sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                &v_first, &v_second);
+
+  // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
+  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+
+  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+
+  y_src += y_src_stride;
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_src += uv_src_stride;
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_src += uv_src_stride;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      weight = bottom_weight;
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+    sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+    highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+    sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second);
+
+    // Add chroma values to the modifier
+    if (ss_y == 0 || h % 2 == 0) {
+      // Only calculate the new chroma distortion if we are at a pixel that
+      // corresponds to a new chroma row
+      highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                    &v_first, &v_second);
+
+      u_src += uv_src_stride;
+      u_pre += uv_pre_stride;
+      u_dist += DIST_STRIDE;
+      v_src += uv_src_stride;
+      v_pre += uv_pre_stride;
+      v_dist += DIST_STRIDE;
+    }
+
+    sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+    sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+    // Get modifier and store result
+    highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                     &sum_row_second, &mul_first, &mul_second, strength,
+                     rounding, weight);
+    highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                  y_accum);
+
+    y_src += y_src_stride;
+    y_pre += y_pre_stride;
+    y_count += y_pre_stride;
+    y_accum += y_pre_stride;
+    y_dist += DIST_STRIDE;
+  }
+
+  // The last row
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+  // Shift the rows up
+  sum_row_1_first = sum_row_2_first;
+  sum_row_1_second = sum_row_2_second;
+  sum_row_2_first = sum_row_3_first;
+  sum_row_2_second = sum_row_3_second;
+
+  // Add luma values to the modifier
+  sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+  sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+  // Add chroma values to the modifier
+  if (ss_y == 0) {
+    // Only calculate the new chroma distortion if we are at a pixel that
+    // corresponds to a new chroma row
+    highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                  &v_first, &v_second);
+  }
+
+  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void av1_highbd_apply_temporal_filter_luma(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+    uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist,
+    const uint32_t *u_dist, const uint32_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_first;
+  const uint32_t *const *neighbors_second;
+
+  // Left
+  neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  av1_highbd_apply_temporal_filter_luma_8(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    av1_highbd_apply_temporal_filter_luma_8(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
+        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+        bottom_weight);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    av1_highbd_apply_temporal_filter_luma_8(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
+        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+        bottom_weight);
+  }
+
+  // Right
+  neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
+  av1_highbd_apply_temporal_filter_luma_8(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+}
+
+// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
+// subsampling in x direction, then we have 16 lumas, else we have 8.
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
+    const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst,
+    __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) {
+  __m128i y_reg_fst, y_reg_snd;
+  if (!ss_x) {
+    highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+      y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst);
+      y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd);
+    }
+  } else {
+    // Temporary
+    __m128i y_fst, y_snd;
+
+    // First 8
+    highbd_read_dist_8(y_dist, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+    }
+
+    y_reg_fst = _mm_hadd_epi32(y_fst, y_snd);
+
+    // Second 8
+    highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+    }
+
+    y_reg_snd = _mm_hadd_epi32(y_fst, y_snd);
+  }
+
+  *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst);
+  *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd);
+  *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst);
+  *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void av1_highbd_apply_temporal_filter_chroma_8(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int uv_block_width,
+    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
+    int top_weight, int bottom_weight, const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  __m128i mul_fst, mul_snd;
+
+  __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
+  __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
+  __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
+  __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
+
+  __m128i u_sum_row_fst, v_sum_row_fst;
+  __m128i u_sum_row_snd, v_sum_row_snd;
+
+  // Loop variable
+  unsigned int h;
+
+  (void)uv_block_width;
+
+  // First row
+  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
+  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
+
+  // Add chroma values
+  highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
+  highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+
+  u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst);
+  u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd);
+
+  highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
+  highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+
+  v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst);
+  v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+
+  u_src += uv_src_stride;
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_src += uv_src_stride;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+  u_count += uv_pre_stride;
+  u_accum += uv_pre_stride;
+  v_count += uv_pre_stride;
+  v_accum += uv_pre_stride;
+
+  y_src += y_src_stride * (1 + ss_y);
+  y_pre += y_pre_stride * (1 + ss_y);
+  y_dist += DIST_STRIDE * (1 + ss_y);
+
+  // Then all the rows except the last one
+  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[1]);
+  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[1]);
+
+  for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight pointer to the bottom half of the blocks
+    if (h == uv_block_height / 2) {
+      if (blk_fw) {
+        blk_fw += 2;
+      } else {
+        weight = bottom_weight;
+      }
+    }
+
+    // Shift the rows up
+    u_sum_row_1_fst = u_sum_row_2_fst;
+    u_sum_row_2_fst = u_sum_row_3_fst;
+    u_sum_row_1_snd = u_sum_row_2_snd;
+    u_sum_row_2_snd = u_sum_row_3_snd;
+
+    v_sum_row_1_fst = v_sum_row_2_fst;
+    v_sum_row_2_fst = v_sum_row_3_fst;
+    v_sum_row_1_snd = v_sum_row_2_snd;
+    v_sum_row_2_snd = v_sum_row_3_snd;
+
+    // Add chroma values
+    u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+    u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+    highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+    u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst);
+    u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd);
+
+    v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+    v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+    highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+    v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst);
+    v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd);
+
+    // Add luma values
+    highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                         &u_sum_row_snd, &v_sum_row_fst,
+                                         &v_sum_row_snd);
+
+    // Get modifier and store result
+    if (blk_fw) {
+      highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+      highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+    } else {
+      highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                       &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+      highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                       &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+    }
+
+    highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                  u_accum);
+    highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                  v_accum);
+
+    u_src += uv_src_stride;
+    u_pre += uv_pre_stride;
+    u_dist += DIST_STRIDE;
+    v_src += uv_src_stride;
+    v_pre += uv_pre_stride;
+    v_dist += DIST_STRIDE;
+    u_count += uv_pre_stride;
+    u_accum += uv_pre_stride;
+    v_count += uv_pre_stride;
+    v_accum += uv_pre_stride;
+
+    y_src += y_src_stride * (1 + ss_y);
+    y_pre += y_pre_stride * (1 + ss_y);
+    y_dist += DIST_STRIDE * (1 + ss_y);
+  }
+
+  // The last row
+  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
+  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
+
+  // Shift the rows up
+  u_sum_row_1_fst = u_sum_row_2_fst;
+  u_sum_row_2_fst = u_sum_row_3_fst;
+  u_sum_row_1_snd = u_sum_row_2_snd;
+  u_sum_row_2_snd = u_sum_row_3_snd;
+
+  v_sum_row_1_fst = v_sum_row_2_fst;
+  v_sum_row_2_fst = v_sum_row_3_fst;
+  v_sum_row_1_snd = v_sum_row_2_snd;
+  v_sum_row_2_snd = v_sum_row_3_snd;
+
+  // Add chroma values
+  u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+  v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+  u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+  v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void av1_highbd_apply_temporal_filter_chroma(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+  const unsigned int uv_width = block_width >> ss_x,
+                     uv_height = block_height >> ss_y;
+
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+  const unsigned int uv_mid_width = uv_width >> 1,
+                     uv_last_width = uv_width - uv_blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_fst;
+  const uint32_t *const *neighbors_snd;
+
+  if (uv_width == 8) {
+    // Special Case: We are subsampling in x direction on a 16x16 block. Since
+    // we are operating on a row of 8 chroma pixels, we can't use the usual
+    // left-middle-right pattern.
+    assert(ss_x);
+
+    if (ss_y) {
+      neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    } else {
+      neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    }
+
+    if (use_whole_blk) {
+      av1_highbd_apply_temporal_filter_chroma_8(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+    } else {
+      av1_highbd_apply_temporal_filter_chroma_8(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  av1_highbd_apply_temporal_filter_chroma_8(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
+      top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  for (; uv_blk_col < uv_mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    av1_highbd_apply_temporal_filter_chroma_8(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; uv_blk_col < uv_last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    av1_highbd_apply_temporal_filter_chroma_8(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  // Right
+  if (ss_x && ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+  }
+
+  av1_highbd_apply_temporal_filter_chroma_8(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
+      top_weight, bottom_weight, NULL);
+}
+
+void av1_highbd_apply_temporal_filter_sse4_1(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
+    uint32_t *v_accum, uint16_t *v_count) {
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+
+  uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint16_t *y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
+                 *u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
+                 *v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
+  const uint16_t *y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
+                 *u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
+                 *v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
+
+  // Loop variables
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 4 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be positive");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must be less than 2");
+
+  // Precompute the difference squared
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 8) {
+      highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                          y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                          u_dist_ptr + blk_col);
+      highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                          v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
+  u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
+  v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
+  y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
+  u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
+  v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
+
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  av1_highbd_apply_temporal_filter_luma(
+      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
+      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
+      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, y_accum,
+      y_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
+
+  av1_highbd_apply_temporal_filter_chroma(
+      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
+      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
+      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum,
+      u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/libaom/av1/encoder/x86/pickrst_avx2.c b/libaom/av1/encoder/x86/pickrst_avx2.c
index 7a63c60..d00fca0 100644
--- a/libaom/av1/encoder/x86/pickrst_avx2.c
+++ b/libaom/av1/encoder/x86/pickrst_avx2.c
@@ -536,7 +536,7 @@ int64_t av1_lowbd_pixel_proj_error_avx2(
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
         int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -581,7 +581,7 @@ int64_t av1_lowbd_pixel_proj_error_avx2(
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
         int32_t v = xq_active * (flt[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -605,7 +605,7 @@ int64_t av1_lowbd_pixel_proj_error_avx2(
       }
       for (k = j; k < width; ++k) {
         const int32_t e = (int32_t)(dat[k]) - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -711,7 +711,7 @@ int64_t av1_highbd_pixel_proj_error_avx2(
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
         int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -788,7 +788,7 @@ int64_t av1_highbd_pixel_proj_error_avx2(
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
         int32_t v = xq_on * (flt[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -828,7 +828,7 @@ int64_t av1_highbd_pixel_proj_error_avx2(
       // Process remaining pixels (modulu 16)
       for (k = j; k < width; ++k) {
         const int32_t e = (int32_t)(dat[k]) - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
diff --git a/libaom/av1/encoder/x86/pickrst_sse4.c b/libaom/av1/encoder/x86/pickrst_sse4.c
index 2326736..a94e169 100644
--- a/libaom/av1/encoder/x86/pickrst_sse4.c
+++ b/libaom/av1/encoder/x86/pickrst_sse4.c
@@ -539,7 +539,7 @@ int64_t av1_lowbd_pixel_proj_error_sse4_1(
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
         int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -578,7 +578,7 @@ int64_t av1_lowbd_pixel_proj_error_sse4_1(
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
         int32_t v = xq_active * (flt[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -607,7 +607,7 @@ int64_t av1_lowbd_pixel_proj_error_sse4_1(
       }
       for (k = j; k < width; ++k) {
         const int32_t e = (int32_t)(dat[k]) - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -709,7 +709,7 @@ int64_t av1_highbd_pixel_proj_error_sse4_1(
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
         int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -777,7 +777,7 @@ int64_t av1_highbd_pixel_proj_error_sse4_1(
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
         int32_t v = xq_on * (flt[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
@@ -814,7 +814,7 @@ int64_t av1_highbd_pixel_proj_error_sse4_1(
       // Process remaining pixels (modulu 8)
       for (k = j; k < width; ++k) {
         const int32_t e = (int32_t)(dat[k]) - src[k];
-        err += e * e;
+        err += ((int64_t)e * e);
       }
       dat += dat_stride;
       src += src_stride;
diff --git a/libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm
deleted file mode 100644
index 30983d1..0000000
--- a/libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm
+++ /dev/null
@@ -1,217 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-SECTION .text
-
-; void av1_temporal_filter_apply_sse2 | arg
-;  (unsigned char  *frame1,           |  0
-;   unsigned int    stride,           |  1
-;   unsigned char  *frame2,           |  2
-;   unsigned int    block_width,      |  3
-;   unsigned int    block_height,     |  4
-;   int             strength,         |  5
-;   int             filter_weight,    |  6
-;   unsigned int   *accumulator,      |  7
-;   unsigned short *count)            |  8
-global sym(av1_temporal_filter_apply_sse2) PRIVATE
-sym(av1_temporal_filter_apply_sse2):
-
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ALIGN_STACK 16, rax
-    %define block_width    0
-    %define block_height  16
-    %define strength      32
-    %define filter_weight 48
-    %define rounding_bit  64
-    %define rbp_backup    80
-    %define stack_size    96
-    sub         rsp,           stack_size
-    mov         [rsp + rbp_backup], rbp
-    ; end prolog
-
-        mov         edx,            arg(3)
-        mov         [rsp + block_width], rdx
-        mov         edx,            arg(4)
-        mov         [rsp + block_height], rdx
-        movd        xmm6,           arg(5)
-        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
-
-        ; calculate the rounding bit outside the loop
-        ; 0x8000 >> (16 - strength)
-        mov         rdx,            16
-        sub         rdx,            arg(5) ; 16 - strength
-        movq        xmm4,           rdx    ; can't use rdx w/ shift
-        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
-        psrlw       xmm5,           xmm4
-        movdqa      [rsp + rounding_bit], xmm5
-
-        mov         rsi,            arg(0) ; src/frame1
-        mov         rdx,            arg(2) ; predictor frame
-        mov         rdi,            arg(7) ; accumulator
-        mov         rax,            arg(8) ; count
-
-        ; dup the filter weight and store for later
-        movd        xmm0,           arg(6) ; filter_weight
-        pshuflw     xmm0,           xmm0, 0
-        punpcklwd   xmm0,           xmm0
-        movdqa      [rsp + filter_weight], xmm0
-
-        mov         rbp,            arg(1) ; stride
-        pxor        xmm7,           xmm7   ; zero for extraction
-
-        mov         rcx,            [rsp + block_width]
-        imul        rcx,            [rsp + block_height]
-        add         rcx,            rdx
-        cmp         dword ptr [rsp + block_width], 8
-        jne         .temporal_filter_apply_load_16
-
-.temporal_filter_apply_load_8:
-        movq        xmm0,           [rsi]  ; first row
-        lea         rsi,            [rsi + rbp] ; += stride
-        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
-        movq        xmm1,           [rsi]  ; second row
-        lea         rsi,            [rsi + rbp] ; += stride
-        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
-        jmp         .temporal_filter_apply_load_finished
-
-.temporal_filter_apply_load_16:
-        movdqa      xmm0,           [rsi]  ; src (frame1)
-        lea         rsi,            [rsi + rbp] ; += stride
-        movdqa      xmm1,           xmm0
-        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
-        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
-
-.temporal_filter_apply_load_finished:
-        movdqa      xmm2,           [rdx]  ; predictor (frame2)
-        movdqa      xmm3,           xmm2
-        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
-        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
-
-        ; modifier = src_byte - pixel_value
-        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
-        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
-
-        ; modifier *= modifier
-        pmullw      xmm0,           xmm0   ; modifer[ 0- 7]^2
-        pmullw      xmm1,           xmm1   ; modifer[ 8-15]^2
-
-        ; modifier *= 3
-        pmullw      xmm0,           [GLOBAL(_const_3w)]
-        pmullw      xmm1,           [GLOBAL(_const_3w)]
-
-        ; modifer += 0x8000 >> (16 - strength)
-        paddw       xmm0,           [rsp + rounding_bit]
-        paddw       xmm1,           [rsp + rounding_bit]
-
-        ; modifier >>= strength
-        psrlw       xmm0,           [rsp + strength]
-        psrlw       xmm1,           [rsp + strength]
-
-        ; modifier = 16 - modifier
-        ; saturation takes care of modifier > 16
-        movdqa      xmm3,           [GLOBAL(_const_16w)]
-        movdqa      xmm2,           [GLOBAL(_const_16w)]
-        psubusw     xmm3,           xmm1
-        psubusw     xmm2,           xmm0
-
-        ; modifier *= filter_weight
-        pmullw      xmm2,           [rsp + filter_weight]
-        pmullw      xmm3,           [rsp + filter_weight]
-
-        ; count
-        movdqa      xmm4,           [rax]
-        movdqa      xmm5,           [rax+16]
-        ; += modifier
-        paddw       xmm4,           xmm2
-        paddw       xmm5,           xmm3
-        ; write back
-        movdqa      [rax],          xmm4
-        movdqa      [rax+16],       xmm5
-        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
-
-        ; load and extract the predictor up to shorts
-        pxor        xmm7,           xmm7
-        movdqa      xmm0,           [rdx]
-        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
-        movdqa      xmm1,           xmm0
-        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
-        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
-
-        ; modifier *= pixel_value
-        pmullw      xmm0,           xmm2
-        pmullw      xmm1,           xmm3
-
-        ; expand to double words
-        movdqa      xmm2,           xmm0
-        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
-        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
-        movdqa      xmm3,           xmm1
-        punpcklwd   xmm1,           xmm7   ; [ 8-11]
-        punpckhwd   xmm3,           xmm7   ; [12-15]
-
-        ; accumulator
-        movdqa      xmm4,           [rdi]
-        movdqa      xmm5,           [rdi+16]
-        movdqa      xmm6,           [rdi+32]
-        movdqa      xmm7,           [rdi+48]
-        ; += modifier
-        paddd       xmm4,           xmm0
-        paddd       xmm5,           xmm2
-        paddd       xmm6,           xmm1
-        paddd       xmm7,           xmm3
-        ; write back
-        movdqa      [rdi],          xmm4
-        movdqa      [rdi+16],       xmm5
-        movdqa      [rdi+32],       xmm6
-        movdqa      [rdi+48],       xmm7
-        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
-
-        cmp         rdx,            rcx
-        je          .temporal_filter_apply_epilog
-        pxor        xmm7,           xmm7   ; zero for extraction
-        cmp         dword ptr [rsp + block_width], 16
-        je          .temporal_filter_apply_load_16
-        jmp         .temporal_filter_apply_load_8
-
-.temporal_filter_apply_epilog:
-    ; begin epilog
-    mov         rbp,            [rsp + rbp_backup]
-    add         rsp,            stack_size
-    pop         rsp
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-_const_3w:
-    times 8 dw 3
-align 16
-_const_top_bit:
-    times 8 dw 1<<15
-align 16
-_const_16w:
-    times 8 dw 16
diff --git a/libaom/av1/encoder/x86/temporal_filter_constants.h b/libaom/av1/encoder/x86/temporal_filter_constants.h
new file mode 100644
index 0000000..b3a10dd
--- /dev/null
+++ b/libaom/av1/encoder/x86/temporal_filter_constants.h
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
+#define AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
+
+// Division using multiplication and shifting. The C implementation does:
+// modifier *= 3;
+// modifier /= index;
+// where 'modifier' is a set of summed values and 'index' is the number of
+// summed values.
+//
+// This equation works out to (m * 3) / i which reduces to:
+// m * 3/4
+// m * 1/2
+// m * 1/3
+//
+// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
+// m * C / 65536
+// we can create a C to replicate the division.
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32758 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152
+#define NEIGHBOR_CONSTANT_5 (int16_t)39322
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768
+#define NEIGHBOR_CONSTANT_7 (int16_t)28087
+#define NEIGHBOR_CONSTANT_8 (int16_t)24576
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846
+#define NEIGHBOR_CONSTANT_10 (int16_t)19661
+#define NEIGHBOR_CONSTANT_11 (int16_t)17874
+#define NEIGHBOR_CONSTANT_13 (int16_t)15124
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
+};
+
+static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
+  LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
+  RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+  LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+  RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+  LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+  RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
+  TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+  LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+  RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
+  TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4
+};
+
+#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U
+#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U
+#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U
+#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U
+#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U
+#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U
+#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U
+#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U
+#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7,
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8,
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11,
+  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
+  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
+  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13,
+  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
+  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
+  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
+};
+
+static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] =
+    { HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 };
+
+static const uint32_t
+    *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
+    };
+
+static const uint32_t *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] =
+    { HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2,
+      HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 };
+
+static const uint32_t *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] =
+    { HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4 };
+
+static const uint32_t
+    *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4
+    };
+
+static const uint32_t *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] =
+    { HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4,
+      HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4 };
+
+#define DIST_STRIDE ((BW) + 2)
+#endif  // AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
diff --git a/libaom/av1/encoder/x86/temporal_filter_sse4.c b/libaom/av1/encoder/x86/temporal_filter_sse4.c
new file mode 100644
index 0000000..556d00c
--- /dev/null
+++ b/libaom/av1/encoder/x86/temporal_filter_sse4.c
@@ -0,0 +1,1006 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/x86/temporal_filter_constants.h"
+
+// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
+// difference squared, and store as unsigned 16-bit integer to dst.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+                                uint16_t *dst) {
+  const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
+  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+
+  __m128i dist_first;
+
+  dist_first = _mm_sub_epi16(a_first, b_first);
+  dist_first = _mm_mullo_epi16(dist_first, dist_first);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+                                 uint16_t *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
+  const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero);
+  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+  const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero);
+
+  __m128i dist_first, dist_second;
+
+  dist_first = _mm_sub_epi16(a_first, b_first);
+  dist_second = _mm_sub_epi16(a_second, b_second);
+  dist_first = _mm_mullo_epi16(dist_first, dist_first);
+  dist_second = _mm_mullo_epi16(dist_second, dist_second);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+  _mm_storeu_si128((__m128i *)(dst + 8), dist_second);
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) {
+  *dist_reg = _mm_loadu_si128((const __m128i *)dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first,
+                                __m128i *reg_second) {
+  read_dist_8(dist, reg_first);
+  read_dist_8(dist + 8, reg_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static __m128i average_8(__m128i sum, const __m128i *mul_constants,
+                         const int strength, const int rounding,
+                         const int weight) {
+  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+  const __m128i weight_u16 = _mm_set1_epi16(weight);
+  const __m128i sixteen = _mm_set1_epi16(16);
+
+  // modifier * 3 / index;
+  sum = _mm_mulhi_epu16(sum, *mul_constants);
+
+  sum = _mm_adds_epu16(sum, rounding_u16);
+  sum = _mm_srl_epi16(sum, strength_u128);
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
+  // So this needs to use the epu16 version which did not come until SSE4.
+  sum = _mm_min_epu16(sum, sixteen);
+
+  sum = _mm_sub_epi16(sixteen, sum);
+
+  return _mm_mullo_epi16(sum, weight_u16);
+}
+
+static __m128i average_4_4(__m128i sum, const __m128i *mul_constants,
+                           const int strength, const int rounding,
+                           const int weight_0, const int weight_1) {
+  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+  const __m128i weight_u16 =
+      _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1,
+                     weight_1, weight_1);
+  const __m128i sixteen = _mm_set1_epi16(16);
+
+  // modifier * 3 / index;
+  sum = _mm_mulhi_epu16(sum, *mul_constants);
+
+  sum = _mm_adds_epu16(sum, rounding_u16);
+  sum = _mm_srl_epi16(sum, strength_u128);
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
+  // So this needs to use the epu16 version which did not come until SSE4.
+  sum = _mm_min_epu16(sum, sixteen);
+
+  sum = _mm_sub_epi16(sixteen, sum);
+
+  return _mm_mullo_epi16(sum, weight_u16);
+}
+
+static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
+                              const __m128i *mul_constants_0,
+                              const __m128i *mul_constants_1,
+                              const int strength, const int rounding,
+                              const int weight) {
+  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+  const __m128i weight_u16 = _mm_set1_epi16(weight);
+  const __m128i sixteen = _mm_set1_epi16(16);
+  __m128i input_0, input_1;
+
+  input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0);
+  input_0 = _mm_adds_epu16(input_0, rounding_u16);
+
+  input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1);
+  input_1 = _mm_adds_epu16(input_1, rounding_u16);
+
+  input_0 = _mm_srl_epi16(input_0, strength_u128);
+  input_1 = _mm_srl_epi16(input_1, strength_u128);
+
+  input_0 = _mm_min_epu16(input_0, sixteen);
+  input_1 = _mm_min_epu16(input_1, sixteen);
+  input_0 = _mm_sub_epi16(sixteen, input_0);
+  input_1 = _mm_sub_epi16(sixteen, input_1);
+
+  *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
+  *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
+                                   uint16_t *count, uint32_t *accumulator) {
+  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
+  __m128i pred_0_u32, pred_1_u32;
+  __m128i accum_0_u32, accum_1_u32;
+
+  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+  _mm_storeu_si128((__m128i *)count, count_u16);
+
+  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
+
+  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static INLINE void accumulate_and_store_16(const __m128i sum_0_u16,
+                                           const __m128i sum_1_u16,
+                                           const uint8_t *pred, uint16_t *count,
+                                           uint32_t *accumulator) {
+  const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
+          count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
+  __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
+          pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
+  __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
+  __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+  count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
+  _mm_storeu_si128((__m128i *)count, count_0_u16);
+
+  count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
+  _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);
+
+  pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
+  pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
+
+  pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
+  pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
+  pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
+  pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
+
+  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+  accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
+  accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
+
+  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+  accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
+  accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
+
+  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
+}
+
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
+static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) {
+  __m128i dist_reg, dist_left, dist_right;
+
+  dist_reg = _mm_loadu_si128((const __m128i *)y_dist);
+  dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1));
+  dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1));
+
+  *sum = _mm_adds_epu16(dist_reg, dist_left);
+  *sum = _mm_adds_epu16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and
+// the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first,
+                              __m128i *sum_second) {
+  get_sum_8(y_dist, sum_first);
+  get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in a row of chroma values corresponds to a row of 16 luma values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+                                           const uint16_t *v_dist,
+                                           __m128i *u_first, __m128i *u_second,
+                                           __m128i *v_first,
+                                           __m128i *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 16 entries from chroma.
+    read_dist_16(u_dist, u_first, u_second);
+    read_dist_16(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 8 entries
+    __m128i u_reg, v_reg;
+
+    read_dist_8(u_dist, &u_reg);
+
+    *u_first = _mm_unpacklo_epi16(u_reg, u_reg);
+    *u_second = _mm_unpackhi_epi16(u_reg, u_reg);
+
+    read_dist_8(v_dist, &v_reg);
+
+    *v_first = _mm_unpacklo_epi16(v_reg, v_reg);
+    *v_second = _mm_unpackhi_epi16(v_reg, v_reg);
+  }
+}
+
+// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit
+// int in dst.
+static INLINE void hadd_epu16(__m128i *src, __m128i *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i shift_right = _mm_srli_si128(*src, 2);
+
+  const __m128i odd = _mm_blend_epi16(shift_right, zero, 170);
+  const __m128i even = _mm_blend_epi16(*src, zero, 170);
+
+  *dst = _mm_add_epi32(even, odd);
+}
+
+// Add a row of luma distortion to 8 corresponding chroma mods.
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
+                                                 int ss_x, int ss_y,
+                                                 __m128i *u_mod,
+                                                 __m128i *v_mod) {
+  __m128i y_reg;
+  if (!ss_x) {
+    read_dist_8(y_dist, &y_reg);
+    if (ss_y == 1) {
+      __m128i y_tmp;
+      read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
+
+      y_reg = _mm_adds_epu16(y_reg, y_tmp);
+    }
+  } else {
+    __m128i y_first, y_second;
+    read_dist_16(y_dist, &y_first, &y_second);
+    if (ss_y == 1) {
+      __m128i y_tmp_0, y_tmp_1;
+      read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
+
+      y_first = _mm_adds_epu16(y_first, y_tmp_0);
+      y_second = _mm_adds_epu16(y_second, y_tmp_1);
+    }
+
+    hadd_epu16(&y_first, &y_first);
+    hadd_epu16(&y_second, &y_second);
+
+    y_reg = _mm_packus_epi32(y_first, y_second);
+  }
+
+  *u_mod = _mm_adds_epu16(*u_mod, y_reg);
+  *v_mod = _mm_adds_epu16(*v_mod, y_reg);
+}
+
+// Apply temporal filter to the luma components. This performs temporal
+// filtering on a luma block of 16 X block_height. Use blk_fw as an array of
+// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void av1_apply_temporal_filter_luma_16(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
+    uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist,
+    const uint16_t *v_dist, const int16_t *const *neighbors_first,
+    const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
+    const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  __m128i mul_first, mul_second;
+
+  __m128i sum_row_1_first, sum_row_1_second;
+  __m128i sum_row_2_first, sum_row_2_second;
+  __m128i sum_row_3_first, sum_row_3_second;
+
+  __m128i u_first, u_second;
+  __m128i v_first, v_second;
+
+  __m128i sum_row_first;
+  __m128i sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 0);
+  assert(strength <= 6);
+
+  assert(block_width == 16);
+
+  (void)block_width;
+
+  // First row
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+  // Add luma values
+  get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
+  get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first);
+  sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+                          &v_second);
+
+  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+
+  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    sum_row_first =
+        average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
+    sum_row_second =
+        average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
+  } else {
+    average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
+               strength, rounding, weight);
+  }
+  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                          y_accum);
+
+  y_src += y_src_stride;
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_src += uv_src_stride;
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_src += uv_src_stride;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      if (blk_fw) {
+        blk_fw += 2;
+      } else {
+        weight = bottom_weight;
+      }
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+    sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+    get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+    sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first);
+    sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second);
+
+    // Add chroma values to the modifier
+    if (ss_y == 0 || h % 2 == 0) {
+      // Only calculate the new chroma distortion if we are at a pixel that
+      // corresponds to a new chroma row
+      read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
+                              &v_first, &v_second);
+
+      u_src += uv_src_stride;
+      u_pre += uv_pre_stride;
+      u_dist += DIST_STRIDE;
+      v_src += uv_src_stride;
+      v_pre += uv_pre_stride;
+      v_dist += DIST_STRIDE;
+    }
+
+    sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+    sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+    sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+    sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+    // Get modifier and store result
+    if (blk_fw) {
+      sum_row_first =
+          average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
+      sum_row_second =
+          average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
+    } else {
+      average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
+                 strength, rounding, weight);
+    }
+    accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                            y_accum);
+
+    y_src += y_src_stride;
+    y_pre += y_pre_stride;
+    y_count += y_pre_stride;
+    y_accum += y_pre_stride;
+    y_dist += DIST_STRIDE;
+  }
+
+  // The last row
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+  // Shift the rows up
+  sum_row_1_first = sum_row_2_first;
+  sum_row_1_second = sum_row_2_second;
+  sum_row_2_first = sum_row_3_first;
+  sum_row_2_second = sum_row_3_second;
+
+  // Add luma values to the modifier
+  sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+  sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+  // Add chroma values to the modifier
+  if (ss_y == 0) {
+    // Only calculate the new chroma distortion if we are at a pixel that
+    // corresponds to a new chroma row
+    read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+                            &v_second);
+  }
+
+  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    sum_row_first =
+        average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
+    sum_row_second =
+        average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
+  } else {
+    average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
+               strength, rounding, weight);
+  }
+  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                          y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void av1_apply_temporal_filter_luma(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+    uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist,
+    const uint16_t *u_dist, const uint16_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const int16_t *const *neighbors_first;
+  const int16_t *const *neighbors_second;
+
+  if (block_width == 16) {
+    // Special Case: The blockwidth is 16 and we are operating on a row of 16
+    // chroma pixels. In this case, we can't use the usualy left-midle-right
+    // pattern. We also don't support splitting now.
+    neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+    neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+    if (use_whole_blk) {
+      av1_apply_temporal_filter_luma_16(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
+          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+          v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+          bottom_weight, NULL);
+    } else {
+      av1_apply_temporal_filter_luma_16(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
+          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+          v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  av1_apply_temporal_filter_luma_16(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    av1_apply_temporal_filter_luma_16(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
+        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+        bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    av1_apply_temporal_filter_luma_16(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
+        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+        bottom_weight, NULL);
+  }
+
+  // Right
+  neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+  av1_apply_temporal_filter_luma_16(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void av1_apply_temporal_filter_chroma_8(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int uv_block_width,
+    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+    const int16_t *const *neighbors, int top_weight, int bottom_weight,
+    const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  __m128i mul;
+
+  __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3;
+  __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3;
+
+  __m128i u_sum_row, v_sum_row;
+
+  // Loop variable
+  unsigned int h;
+
+  (void)uv_block_width;
+
+  // First row
+  mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
+
+  // Add chroma values
+  get_sum_8(u_dist, &u_sum_row_2);
+  get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+
+  u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3);
+
+  get_sum_8(v_dist, &v_sum_row_2);
+  get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+
+  v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3);
+
+  // Add luma values
+  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    u_sum_row =
+        average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+    v_sum_row =
+        average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+  } else {
+    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
+    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
+  }
+  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+  u_src += uv_src_stride;
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_src += uv_src_stride;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+  u_count += uv_pre_stride;
+  u_accum += uv_pre_stride;
+  v_count += uv_pre_stride;
+  v_accum += uv_pre_stride;
+
+  y_src += y_src_stride * (1 + ss_y);
+  y_pre += y_pre_stride * (1 + ss_y);
+  y_dist += DIST_STRIDE * (1 + ss_y);
+
+  // Then all the rows except the last one
+  mul = _mm_loadu_si128((const __m128i *)neighbors[1]);
+
+  for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight pointer to the bottom half of the blocks
+    if (h == uv_block_height / 2) {
+      if (blk_fw) {
+        blk_fw += 2;
+      } else {
+        weight = bottom_weight;
+      }
+    }
+
+    // Shift the rows up
+    u_sum_row_1 = u_sum_row_2;
+    u_sum_row_2 = u_sum_row_3;
+
+    v_sum_row_1 = v_sum_row_2;
+    v_sum_row_2 = v_sum_row_3;
+
+    // Add chroma values
+    u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+    get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+    u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3);
+
+    v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+    get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+    v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3);
+
+    // Add luma values
+    add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+    // Get modifier and store result
+    if (blk_fw) {
+      u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0],
+                              blk_fw[1]);
+      v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0],
+                              blk_fw[1]);
+    } else {
+      u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
+      v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
+    }
+
+    accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+    accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+    u_src += uv_src_stride;
+    u_pre += uv_pre_stride;
+    u_dist += DIST_STRIDE;
+    v_src += uv_src_stride;
+    v_pre += uv_pre_stride;
+    v_dist += DIST_STRIDE;
+    u_count += uv_pre_stride;
+    u_accum += uv_pre_stride;
+    v_count += uv_pre_stride;
+    v_accum += uv_pre_stride;
+
+    y_src += y_src_stride * (1 + ss_y);
+    y_pre += y_pre_stride * (1 + ss_y);
+    y_dist += DIST_STRIDE * (1 + ss_y);
+  }
+
+  // The last row
+  mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
+
+  // Shift the rows up
+  u_sum_row_1 = u_sum_row_2;
+  u_sum_row_2 = u_sum_row_3;
+
+  v_sum_row_1 = v_sum_row_2;
+  v_sum_row_2 = v_sum_row_3;
+
+  // Add chroma values
+  u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+  v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+
+  // Add luma values
+  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    u_sum_row =
+        average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+    v_sum_row =
+        average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+  } else {
+    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
+    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
+  }
+
+  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void av1_apply_temporal_filter_chroma(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+  const unsigned int uv_width = block_width >> ss_x,
+                     uv_height = block_height >> ss_y;
+
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+  const unsigned int uv_mid_width = uv_width >> 1,
+                     uv_last_width = uv_width - uv_blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const int16_t *const *neighbors;
+
+  if (uv_width == 8) {
+    // Special Case: We are subsampling in x direction on a 16x16 block. Since
+    // we are operating on a row of 8 chroma pixels, we can't use the usual
+    // left-middle-right pattern.
+    assert(ss_x);
+
+    if (ss_y) {
+      neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
+    } else {
+      neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
+    }
+
+    if (use_whole_blk) {
+      av1_apply_temporal_filter_chroma_8(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+          top_weight, bottom_weight, NULL);
+    } else {
+      av1_apply_temporal_filter_chroma_8(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+          0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+  }
+
+  av1_apply_temporal_filter_chroma_8(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+      bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  for (; uv_blk_col < uv_mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    av1_apply_temporal_filter_chroma_8(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+        top_weight, bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; uv_blk_col < uv_last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    av1_apply_temporal_filter_chroma_8(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+        top_weight, bottom_weight, NULL);
+  }
+
+  // Right
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+  }
+
+  av1_apply_temporal_filter_chroma_8(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+      bottom_weight, NULL);
+}
+
+void av1_apply_temporal_filter_sse4_1(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
+    uint32_t *v_accum, uint16_t *v_count) {
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+  const int *blk_fw_ptr = blk_fw;
+
+  uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+  const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+  // Loop variables
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be positive");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must be less than 2");
+
+  // Precompute the difference sqaured
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 16) {
+      store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                    y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                   u_dist_ptr + blk_col);
+      store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                   v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  av1_apply_temporal_filter_luma(
+      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
+      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+      strength, blk_fw_ptr, use_whole_blk, y_accum, y_count, y_dist_ptr,
+      u_dist_ptr, v_dist_ptr);
+
+  av1_apply_temporal_filter_chroma(
+      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
+      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+      strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
+      y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/libaom/build/cmake/aom_config_defaults.cmake b/libaom/build/cmake/aom_config_defaults.cmake
index feb9b5e..f498acd 100644
--- a/libaom/build/cmake/aom_config_defaults.cmake
+++ b/libaom/build/cmake/aom_config_defaults.cmake
@@ -101,8 +101,6 @@ set_aom_config_var(CONFIG_DENOISE 1 NUMBER
                    "Denoise/noise modeling support in encoder.")
 set_aom_config_var(CONFIG_FILEOPTIONS 1 NUMBER
                    "Enables encoder config file support.")
-set_aom_config_var(CONFIG_FIX_GF_LENGTH 1 NUMBER
-                   "Fix the GF length if possible")
 set_aom_config_var(CONFIG_INSPECTION 0 NUMBER "Enables bitstream inspection.")
 set_aom_config_var(CONFIG_INTERNAL_STATS 0 NUMBER
                    "Enables internal encoder stats.")
@@ -112,34 +110,29 @@ set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2 NUMBER
                    "Max profile to support decoding.")
 set_aom_config_var(CONFIG_NORMAL_TILE_MODE 0 NUMBER
                    "Only enables normal tile mode.")
-set_aom_config_var(
-  CONFIG_REDUCED_ENCODER_BORDER 0 NUMBER
-  "Enable reduced border extention for encoder. \
-                    Disables superres and resize support."
-  )
 set_aom_config_var(CONFIG_SIZE_LIMIT 0 NUMBER "Limit max decode width/height.")
 set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 NUMBER "Spatial resampling.")
 set_aom_config_var(DECODE_HEIGHT_LIMIT 0 NUMBER "Set limit for decode height.")
 set_aom_config_var(DECODE_WIDTH_LIMIT 0 NUMBER "Set limit for decode width.")
-set_aom_config_var(CONFIG_GLOBAL_MOTION_SEARCH 1 NUMBER
-                   "Global motion search flag.")
 
 # AV1 experiment flags.
-set_aom_config_var(CONFIG_COLLECT_INTER_MODE_RD_STATS 1 NUMBER
-                   "AV1 experiment flag.")
+set_aom_config_var(CONFIG_SPEED_STATS 0 NUMBER "AV1 experiment flag.")
 set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 NUMBER "AV1 experiment flag.")
 set_aom_config_var(CONFIG_DIST_8X8 0 NUMBER "AV1 experiment flag.")
 set_aom_config_var(CONFIG_ENTROPY_STATS 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_FP_MB_STATS 0 NUMBER "AV1 experiment flag.")
 set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 NUMBER "AV1 experiment flag.")
 set_aom_config_var(CONFIG_RD_DEBUG 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL 1 NUMBER
+set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1 NUMBER
+                   "AV1 experiment flag.")
+set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3 NUMBER
                    "AV1 experiment flag.")
 set_aom_config_var(CONFIG_SHARP_SETTINGS 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_ONE_PASS_SVM 0 NUMBER "AV1 experiment flag.")
 set_aom_config_var(CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1 NUMBER
-                   "Disable full_pixel_motion_search_based_split on BLOCK_8X8")
-
+                   "Disable full_pixel_motion_search_based_split on BLOCK_8X8.")
+set_aom_config_var(CONFIG_COLLECT_PARTITION_STATS 0 NUMBER
+                   "Collect stats on partition decisions.")
+set_aom_config_var(CONFIG_COLLECT_COMPONENT_TIMING 0 NUMBER
+                   "Collect encoding component timing information.")
 #
 # Variables in this section control optional features of the build system.
 #
diff --git a/libaom/build/cmake/aom_experiment_deps.cmake b/libaom/build/cmake/aom_experiment_deps.cmake
index 0688704..2e36157 100644
--- a/libaom/build/cmake/aom_experiment_deps.cmake
+++ b/libaom/build/cmake/aom_experiment_deps.cmake
@@ -21,10 +21,6 @@ macro(fix_experiment_configs)
     change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER)
   endif()
 
-  if(CONFIG_RD_DEBUG)
-    change_config_and_warn(CONFIG_RD_DEBUG 0 CONFIG_JNT_COMP)
-  endif()
-
   if(CONFIG_DIST_8X8 AND CONFIG_MULTITHREAD)
     change_config_and_warn(CONFIG_DIST_8X8 0 CONFIG_MULTITHREAD)
   endif()
diff --git a/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake b/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake
index b5b2ff1..bfeac92 100644
--- a/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake
+++ b/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake
@@ -27,6 +27,3 @@ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
 
 # No runtime cpu detect for arm64-mingw-gcc.
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
-
-# Disable the use of the gtest's CMake support.
-set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake b/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake
index 7d3d630..6cbc2a8 100644
--- a/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake
+++ b/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake
@@ -28,16 +28,13 @@ endif()
 set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
 set(AS_EXECUTABLE ${CROSS}as)
-set(CMAKE_C_COMPILER_ARG1
-    "-march=armv7-a -mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
-set(CMAKE_CXX_COMPILER_ARG1
-    "-march=armv7-a -mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
+set(CMAKE_C_COMPILER_ARG1 "-march=armv7-a ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
+set(CMAKE_CXX_COMPILER_ARG1 "-march=armv7-a ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
 set(AOM_AS_FLAGS --defsym ARCHITECTURE=7 -march=armv7-a -mfpu=neon
     ${AOM_EXTRA_TOOLCHAIN_FLAGS})
 set(CMAKE_SYSTEM_PROCESSOR "armv7")
 
-# No intrinsics flag required for armv7-linux-gcc.
-set(AOM_NEON_INTRIN_FLAG "")
+set(AOM_NEON_INTRIN_FLAG "-mfpu=neon")
 
 # No runtime cpu detect for armv7-linux-gcc.
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
diff --git a/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake b/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake
index cf06a11..eb488ec 100644
--- a/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake
+++ b/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake
@@ -27,6 +27,3 @@ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
 
 # No runtime cpu detect for armv7-mingw-gcc.
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
-
-# Disable the use of the gtest's CMake support.
-set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake b/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake
index c986c4e..4839c9d 100644
--- a/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake
+++ b/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake
@@ -26,6 +26,3 @@ set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
 set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
 set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
-
-# Disable the use of the gtest's CMake support.
-set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake b/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
index 00d94d5..4b2d28d 100644
--- a/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
+++ b/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
@@ -24,6 +24,3 @@ set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
 set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
 set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
-
-# Disable the use of the gtest's CMake support.
-set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/libaom/common/av1_config.c b/libaom/common/av1_config.c
index e8decf7..90955fb 100644
--- a/libaom/common/av1_config.c
+++ b/libaom/common/av1_config.c
@@ -322,7 +322,7 @@ static int parse_sequence_header(const uint8_t *const buffer, size_t length,
   AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_height_minus_1,
                                  frame_height_bits_minus_1 + 1);
 
-  int frame_id_numbers_present = 0;
+  uint8_t frame_id_numbers_present = 0;
   if (!reduced_still_picture_header) {
     AV1C_READ_BIT_OR_RETURN_ERROR(frame_id_numbers_present_flag);
     frame_id_numbers_present = frame_id_numbers_present_flag;
@@ -345,7 +345,7 @@ static int parse_sequence_header(const uint8_t *const buffer, size_t length,
 
     AV1C_READ_BIT_OR_RETURN_ERROR(enable_order_hint);
     if (enable_order_hint) {
-      AV1C_READ_BIT_OR_RETURN_ERROR(enable_jnt_comp);
+      AV1C_READ_BIT_OR_RETURN_ERROR(enable_dist_wtd_comp);
       AV1C_READ_BIT_OR_RETURN_ERROR(enable_ref_frame_mvs);
     }
 
diff --git a/libaom/common/rawenc.c b/libaom/common/rawenc.c
index 5a2731d..b72132c 100644
--- a/libaom/common/rawenc.c
+++ b/libaom/common/rawenc.c
@@ -9,36 +9,88 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <stdbool.h>
 #include "common/rawenc.h"
 
-void raw_write_image_file(const aom_image_t *img, const int *planes,
-                          const int num_planes, FILE *file) {
-  const int bytes_per_sample = ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
-  for (int i = 0; i < num_planes; ++i) {
-    const int plane = planes[i];
-    const unsigned char *buf = img->planes[plane];
-    const int stride = img->stride[plane];
-    const int w = aom_img_plane_width(img, plane);
-    const int h = aom_img_plane_height(img, plane);
-    for (int y = 0; y < h; ++y) {
-      fwrite(buf, bytes_per_sample, w, file);
-      buf += stride;
+#define BATCH_SIZE 8
+// When writing greyscale color, batch 8 writes for low bit-depth, 4 writes
+// for high bit-depth.
+static const uint8_t batched[BATCH_SIZE] = { 128, 128, 128, 128,
+                                             128, 128, 128, 128 };
+static const uint8_t batched_hbd[BATCH_SIZE] = {
+  0, 128, 0, 128, 0, 128, 0, 128
+};
+
+// Interface to writing to either a file or MD5Context. Takes a pointer to
+// either the file or MD5Context, the buffer, the size of each element, and
+// number of elements to write. Note that size and nmemb (last two args) must
+// be unsigned int, as the interface to MD5Update requires that.
+typedef void (*WRITER)(void *, const uint8_t *, unsigned int, unsigned int);
+
+static void write_file(void *fp, const uint8_t *buffer, unsigned int size,
+                       unsigned int nmemb) {
+  fwrite(buffer, size, nmemb, (FILE *)fp);
+}
+
+static void write_md5(void *md5, const uint8_t *buffer, unsigned int size,
+                      unsigned int nmemb) {
+  MD5Update((MD5Context *)md5, buffer, size * nmemb);
+}
+
+// Writes out n greyscale values.
+static void write_greyscale(const bool high_bitdepth, int n, WRITER writer_func,
+                            void *file_or_md5) {
+  const uint8_t *b = batched;
+  if (high_bitdepth) {
+    b = batched_hbd;
+  }
+  const int num_batched_writes =
+      high_bitdepth ? n / (BATCH_SIZE / 2) : n / BATCH_SIZE;
+  for (int i = 0; i < num_batched_writes; ++i) {
+    writer_func(file_or_md5, b, sizeof(uint8_t), BATCH_SIZE);
+  }
+  const int remaining = high_bitdepth ? n % (BATCH_SIZE / 2) : n % BATCH_SIZE;
+  for (int i = 0; i < remaining; ++i) {
+    if (high_bitdepth) {
+      writer_func(file_or_md5, batched_hbd, sizeof(uint8_t), 2);
+    } else {
+      writer_func(file_or_md5, batched, sizeof(uint8_t), 1);
     }
   }
 }
 
-void raw_update_image_md5(const aom_image_t *img, const int *planes,
-                          const int num_planes, MD5Context *md5) {
+// Encapsulates the logic for writing raw data to either an image file or
+// to an MD5 context.
+static void raw_write_image_file_or_md5(const aom_image_t *img,
+                                        const int *planes, const int num_planes,
+                                        void *file_or_md5, WRITER writer_func) {
+  const bool high_bitdepth = img->fmt & AOM_IMG_FMT_HIGHBITDEPTH;
+  const int bytes_per_sample = high_bitdepth ? 2 : 1;
   for (int i = 0; i < num_planes; ++i) {
     const int plane = planes[i];
+    const int w = aom_img_plane_width(img, plane);
+    const int h = aom_img_plane_height(img, plane);
+    // If we're on a color plane and the output is monochrome, write a greyscale
+    // value. Since there are only YUV planes, compare against Y.
+    if (img->monochrome && plane != AOM_PLANE_Y) {
+      write_greyscale(high_bitdepth, w * h, writer_func, file_or_md5);
+      continue;
+    }
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = aom_img_plane_width(img, plane) *
-                  ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
-    const int h = aom_img_plane_height(img, plane);
     for (int y = 0; y < h; ++y) {
-      MD5Update(md5, buf, w);
+      writer_func(file_or_md5, buf, bytes_per_sample, w);
       buf += stride;
     }
   }
 }
+
+void raw_write_image_file(const aom_image_t *img, const int *planes,
+                          const int num_planes, FILE *file) {
+  raw_write_image_file_or_md5(img, planes, num_planes, file, write_file);
+}
+
+void raw_update_image_md5(const aom_image_t *img, const int *planes,
+                          const int num_planes, MD5Context *md5) {
+  raw_write_image_file_or_md5(img, planes, num_planes, md5, write_md5);
+}
diff --git a/libaom/common/tools_common.c b/libaom/common/tools_common.c
index 2e32f61..51c1c52 100644
--- a/libaom/common/tools_common.c
+++ b/libaom/common/tools_common.c
@@ -149,6 +149,11 @@ const AvxInterface *get_aom_encoder_by_name(const char *name) {
 
   return NULL;
 }
+
+// large scale tile encoding
+static const AvxInterface aom_lst_encoder = { "av1", LST_FOURCC,
+                                              &aom_codec_av1_cx };
+const AvxInterface *get_aom_lst_encoder(void) { return &aom_lst_encoder; }
 #endif  // CONFIG_AV1_ENCODER
 
 #if CONFIG_AV1_DECODER
diff --git a/libaom/common/tools_common.h b/libaom/common/tools_common.h
index df3b62b..d9a68f0 100644
--- a/libaom/common/tools_common.h
+++ b/libaom/common/tools_common.h
@@ -18,6 +18,7 @@
 #include "aom/aom_codec.h"
 #include "aom/aom_image.h"
 #include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
 #include "aom_ports/msvc.h"
 
 #if CONFIG_AV1_ENCODER
@@ -78,11 +79,14 @@ enum VideoFileType {
 };
 
 // Used in lightfield example.
-typedef enum OUTPUT_FORMAT {
+enum {
   YUV1D,  // 1D tile output for conformance test.
   YUV,    // Tile output in YUV format.
   NV12,   // Tile output in NV12 format.
-} OUTPUT_FORMAT;
+} UENUM1BYTE(OUTPUT_FORMAT);
+
+// The fourcc for large_scale_tile encoding is "LSTC".
+#define LST_FOURCC 0x4354534c
 
 struct FileTypeDetectionBuffer {
   char buf[4];
@@ -149,6 +153,7 @@ typedef struct AvxInterface {
 int get_aom_encoder_count(void);
 const AvxInterface *get_aom_encoder_by_index(int i);
 const AvxInterface *get_aom_encoder_by_name(const char *name);
+const AvxInterface *get_aom_lst_encoder(void);
 
 int get_aom_decoder_count(void);
 const AvxInterface *get_aom_decoder_by_index(int i);
diff --git a/libaom/common/video_reader.c b/libaom/common/video_reader.c
index 47ad6e1..7b021bc 100644
--- a/libaom/common/video_reader.c
+++ b/libaom/common/video_reader.c
@@ -121,3 +121,7 @@ FILE *aom_video_reader_get_file(AvxVideoReader *reader) {
 const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) {
   return &reader->info;
 }
+
+void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc) {
+  reader->info.codec_fourcc = fourcc;
+}
diff --git a/libaom/common/video_reader.h b/libaom/common/video_reader.h
index 903deae..9ab439e 100644
--- a/libaom/common/video_reader.h
+++ b/libaom/common/video_reader.h
@@ -50,6 +50,9 @@ FILE *aom_video_reader_get_file(AvxVideoReader *reader);
 // Fills AvxVideoInfo with information from opened video file.
 const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader);
 
+// Set fourcc.
+void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libaom/common/video_writer.c b/libaom/common/video_writer.c
index a7ec309..2b42e36 100644
--- a/libaom/common/video_writer.c
+++ b/libaom/common/video_writer.c
@@ -75,3 +75,7 @@ int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,
 
   return 1;
 }
+
+void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc) {
+  writer->info.codec_fourcc = fourcc;
+}
diff --git a/libaom/common/video_writer.h b/libaom/common/video_writer.h
index 3e2b655..8712d47 100644
--- a/libaom/common/video_writer.h
+++ b/libaom/common/video_writer.h
@@ -14,7 +14,7 @@
 
 #include "common/video_common.h"
 
-typedef enum { kContainerIVF } AvxContainer;
+enum { kContainerIVF } UENUM1BYTE(AvxContainer);
 
 struct AvxVideoWriterStruct;
 typedef struct AvxVideoWriterStruct AvxVideoWriter;
@@ -37,6 +37,8 @@ void aom_video_writer_close(AvxVideoWriter *writer);
 // Writes frame bytes to the file.
 int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,
                                  size_t size, int64_t pts);
+// Set fourcc.
+void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libaom/common/webmenc.h b/libaom/common/webmenc.h
index 4cdfd68..a4aa992 100644
--- a/libaom/common/webmenc.h
+++ b/libaom/common/webmenc.h
@@ -30,13 +30,13 @@ struct WebmOutputContext {
 };
 
 /* Stereo 3D packed frame format */
-typedef enum stereo_format {
+enum {
   STEREO_FORMAT_MONO = 0,
   STEREO_FORMAT_LEFT_RIGHT = 1,
   STEREO_FORMAT_BOTTOM_TOP = 2,
   STEREO_FORMAT_TOP_BOTTOM = 3,
   STEREO_FORMAT_RIGHT_LEFT = 11
-} stereo_format_t;
+} UENUM1BYTE(stereo_format_t);
 
 // The following functions wrap libwebm's mkvmuxer. All functions return 0 upon
 // success, or -1 upon failure.
diff --git a/libaom/examples/analyzer.cc b/libaom/examples/analyzer.cc
index 6a42eca..261d085 100644
--- a/libaom/examples/analyzer.cc
+++ b/libaom/examples/analyzer.cc
@@ -162,7 +162,7 @@ bool AV1Decoder::setInspectionCallback() {
 
 void AV1Decoder::inspect(void *pbi, void *data) {
   AV1Decoder *decoder = (AV1Decoder *)data;
-  ifd_inspect(&decoder->frame_data, pbi);
+  ifd_inspect(&decoder->frame_data, pbi, 0);
 }
 
 #define MIN_ZOOM (1)
diff --git a/libaom/examples/av1_dec_fuzzer.cc b/libaom/examples/av1_dec_fuzzer.cc
new file mode 100644
index 0000000..96d16a8
--- /dev/null
+++ b/libaom/examples/av1_dec_fuzzer.cc
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*
+ * See build_av1_dec_fuzzer.sh for building instructions.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory>
+
+#include "config/aom_config.h"
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_ports/mem_ops.h"
+#include "common/ivfdec.h"
+
+static void close_file(FILE *file) { fclose(file); }
+
+extern "C" void usage_exit(void) { exit(EXIT_FAILURE); }
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  std::unique_ptr<FILE, decltype(&close_file)> file(
+      fmemopen((void *)data, size, "rb"), &close_file);
+  if (file == nullptr) {
+    return 0;
+  }
+
+  char header[32];
+  if (fread(header, 1, 32, file.get()) != 32) {
+    return 0;
+  }
+  const AvxInterface *decoder = get_aom_decoder_by_name("av1");
+  if (decoder == nullptr) {
+    return 0;
+  }
+
+  aom_codec_ctx_t codec;
+  // Set thread count in the range [1, 64].
+  const unsigned int threads = (header[0] & 0x3f) + 1;
+  aom_codec_dec_cfg_t cfg = { threads, 0, 0, CONFIG_LOWBITDEPTH };
+  if (aom_codec_dec_init(&codec, decoder->codec_interface(), &cfg, 0)) {
+    return 0;
+  }
+
+  uint8_t *buffer = nullptr;
+  size_t buffer_size = 0;
+  size_t frame_size = 0;
+  while (!ivf_read_frame(file.get(), &buffer, &frame_size, &buffer_size,
+                         nullptr)) {
+    const aom_codec_err_t err =
+        aom_codec_decode(&codec, buffer, frame_size, nullptr);
+    static_cast<void>(err);
+    aom_codec_iter_t iter = nullptr;
+    aom_image_t *img = nullptr;
+    while ((img = aom_codec_get_frame(&codec, &iter)) != nullptr) {
+    }
+  }
+  aom_codec_destroy(&codec);
+  free(buffer);
+  return 0;
+}
diff --git a/libaom/examples/build_av1_dec_fuzzer.sh b/libaom/examples/build_av1_dec_fuzzer.sh
new file mode 100755
index 0000000..86992a0
--- /dev/null
+++ b/libaom/examples/build_av1_dec_fuzzer.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#
+# Copyright (c) 2019, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+###############################################################################
+# Fuzzer for libaom decoder.
+# ==========================
+# Requirements
+# ---------------------
+# Clang6.0 or above (must support -fsanitize=fuzzer)
+#
+# References:
+# ---------------------
+# http://llvm.org/docs/LibFuzzer.html
+# https://github.com/google/oss-fuzz
+#
+# Steps to build / run
+# ---------------------
+
+set -eu
+
+# Have a copy of AOM and a build directory ready.
+if [[ $# -ne 2 ]]; then
+  echo "Pass in the AOM source tree as first argument, and a build directory "
+  echo "as the second argument. The AOM source tree can be obtained via: "
+  echo "  git clone https://aomedia.googlesource.com/aom"
+  exit 2
+fi
+if [[ -z "$CC" ]]; then
+  echo "Set the CC environment variable to point to your C compiler."
+  exit 2
+fi
+if [[ -z "$CXX" ]]; then
+  echo "Set the CXX environment variable to point to your C++ compiler."
+  exit 2
+fi
+
+AOM_DIR=$1
+BUILD_DIR=$2
+# Run CMake with address sanitizer enabled and build the codec.
+# Enable DO_RANGE_CHECK_CLAMP to suppress the noise of integer overflows
+# in the transform functions. Also set memory limits.
+EXTRA_C_FLAGS='-DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824'
+cd "${BUILD_DIR}"
+cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \
+  -DCONFIG_SCALABILITY=0 -DCONFIG_LOWBITDEPTH=1 -DCONFIG_AV1_ENCODER=0 \
+  -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 -DCONFIG_SIZE_LIMIT=1 \
+  -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \
+  -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \
+  -DAOM_EXTRA_CXX_FLAGS="${EXTRA_C_FLAGS}" -DSANITIZE=address
+
+# Build the codec.
+make -j$(nproc)
+
+# Build some libaom utils that are not part of the core lib.
+$CC -std=c99 -c -I${AOM_DIR} -I${BUILD_DIR} \
+  ${AOM_DIR}/common/ivfdec.c -o ${BUILD_DIR}/ivfdec.o
+
+$CC -std=c99 -c -I${AOM_DIR} -I${BUILD_DIR} \
+  ${AOM_DIR}/common/tools_common.c -o ${BUILD_DIR}/tools_common.o
+
+# Build the av1 fuzzer
+$CXX -std=c++11 -DDECODER=av1 -I${AOM_DIR} -I${BUILD_DIR} \
+    -fsanitize=fuzzer -Wl,--start-group \
+    ${AOM_DIR}/examples/av1_dec_fuzzer.cc -o ${BUILD_DIR}/av1_dec_fuzzer \
+    ${BUILD_DIR}/libaom.a ${BUILD_DIR}/ivfdec.o ${BUILD_DIR}/tools_common.o \
+    -Wl,--end-group
+
+echo "Fuzzer built at ${BUILD_DIR}/av1_dec_fuzzer."
+echo "Create a corpus directory, copy IVF files in there, and run:"
+echo "  av1_dec_fuzzer CORPUS_DIR"
diff --git a/libaom/examples/inspect.c b/libaom/examples/inspect.c
index 7b7b3cd..9ca2a02 100644
--- a/libaom/examples/inspect.c
+++ b/libaom/examples/inspect.c
@@ -62,7 +62,10 @@ typedef enum {
   SEGMENT_ID_LAYER = 1 << 14,
   MOTION_MODE_LAYER = 1 << 15,
   COMPOUND_TYPE_LAYER = 1 << 16,
-  ALL_LAYERS = (1 << 17) - 1
+  INTRABC_LAYER = 1 << 17,
+  PALETTE_LAYER = 1 << 18,
+  UV_PALETTE_LAYER = 1 << 19,
+  ALL_LAYERS = (1 << 20) - 1
 } LayerType;
 
 static LayerType layers = 0;
@@ -106,7 +109,20 @@ static const arg_def_t dump_delta_q_arg =
     ARG_DEF("dq", "delta_q", 0, "Dump QIndex");
 static const arg_def_t dump_seg_id_arg =
     ARG_DEF("si", "seg_id", 0, "Dump Segment ID");
+static const arg_def_t dump_intrabc_arg =
+    ARG_DEF("ibc", "intrabc", 0, "Dump If IntraBC Is Used");
+static const arg_def_t dump_palette_arg =
+    ARG_DEF("plt", "palette", 0, "Dump Palette Size");
+static const arg_def_t dump_uv_palette_arg =
+    ARG_DEF("uvp", "uv_palette", 0, "Dump UV Palette Size");
 static const arg_def_t usage_arg = ARG_DEF("h", "help", 0, "Help");
+static const arg_def_t skip_non_transform_arg = ARG_DEF(
+    "snt", "skip_non_transform", 1, "Skip is counted as a non transform.");
+static const arg_def_t combined_arg =
+    ARG_DEF("comb", "combined", 1, "combinining parameters into one output.");
+
+int combined_parm_list[15];
+int combined_parm_count = 0;
 
 static const arg_def_t *main_args[] = { &limit_arg,
                                         &dump_all_arg,
@@ -130,7 +146,12 @@ static const arg_def_t *main_args[] = { &limit_arg,
                                         &dump_motion_vectors_arg,
                                         &dump_delta_q_arg,
                                         &dump_seg_id_arg,
+                                        &dump_intrabc_arg,
+                                        &dump_palette_arg,
+                                        &dump_uv_palette_arg,
                                         &usage_arg,
+                                        &skip_non_transform_arg,
+                                        &combined_arg,
                                         NULL };
 #define ENUM(name) \
   { #name, name }
@@ -158,6 +179,8 @@ const map_entry block_size_map[] = {
   ENUM(BLOCK_64X16),   LAST_ENUM
 };
 
+#define TX_SKIP -1
+
 const map_entry tx_size_map[] = {
   ENUM(TX_4X4),   ENUM(TX_8X8),   ENUM(TX_16X16), ENUM(TX_32X32),
   ENUM(TX_64X64), ENUM(TX_4X8),   ENUM(TX_8X4),   ENUM(TX_8X16),
@@ -225,10 +248,57 @@ const map_entry uv_prediction_mode_map[] = {
 
 const map_entry skip_map[] = { ENUM(SKIP), ENUM(NO_SKIP), LAST_ENUM };
 
+const map_entry intrabc_map[] = {
+  { "INTRABC", 1 }, { "NO_INTRABC", 0 }, LAST_ENUM
+};
+
+const map_entry palette_map[] = {
+  { "ZERO_COLORS", 0 },  { "TWO_COLORS", 2 },   { "THREE_COLORS", 3 },
+  { "FOUR_COLORS", 4 },  { "FIVE_COLORS", 5 },  { "SIX_COLORS", 6 },
+  { "SEVEN_COLORS", 7 }, { "EIGHT_COLORS", 8 }, LAST_ENUM
+};
+
 const map_entry config_map[] = { ENUM(MI_SIZE), LAST_ENUM };
 
 static const char *exec_name;
 
+struct parm_offset {
+  char parm[60];
+  char offset;
+};
+struct parm_offset parm_offsets[] = {
+  { "blockSize", offsetof(insp_mi_data, sb_type) },
+  { "transformSize", offsetof(insp_mi_data, tx_size) },
+  { "transformType", offsetof(insp_mi_data, tx_type) },
+  { "dualFilterType", offsetof(insp_mi_data, dual_filter_type) },
+  { "mode", offsetof(insp_mi_data, mode) },
+  { "uv_mode", offsetof(insp_mi_data, uv_mode) },
+  { "motion_mode", offsetof(insp_mi_data, motion_mode) },
+  { "compound_type", offsetof(insp_mi_data, compound_type) },
+  { "referenceFrame", offsetof(insp_mi_data, ref_frame) },
+  { "skip", offsetof(insp_mi_data, skip) },
+};
+int parm_count = sizeof(parm_offsets) / sizeof(parm_offsets[0]);
+
+int convert_to_indices(char *str, int *indices, int maxCount, int *count) {
+  *count = 0;
+  do {
+    char *comma = strchr(str, ',');
+    int length = (comma ? (int)(comma - str) : (int)strlen(str));
+    int i;
+    for (i = 0; i < parm_count; ++i) {
+      if (!strncmp(str, parm_offsets[i].parm, length)) {
+        break;
+      }
+    }
+    if (i == parm_count) return 0;
+    indices[(*count)++] = i;
+    if (*count > maxCount) return 0;
+    str += length + 1;
+  } while (strlen(str) > 0);
+  return 1;
+}
+
 insp_frame_data frame_data;
 int frame_count = 0;
 int decoded_frame_count = 0;
@@ -399,6 +469,38 @@ int put_motion_vectors(char *buffer) {
   return (int)(buf - buffer);
 }
 
+int put_combined(char *buffer) {
+  const int mi_rows = frame_data.mi_rows;
+  const int mi_cols = frame_data.mi_cols;
+  char *buf = buffer;
+  int r, c, p;
+  buf += put_str(buf, "  \"");
+  for (p = 0; p < combined_parm_count; ++p) {
+    if (p) buf += put_str(buf, "&");
+    buf += put_str(buf, parm_offsets[combined_parm_list[p]].parm);
+  }
+  buf += put_str(buf, "\": [");
+  for (r = 0; r < mi_rows; ++r) {
+    *(buf++) = '[';
+    for (c = 0; c < mi_cols; ++c) {
+      insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c];
+      *(buf++) = '[';
+      for (p = 0; p < combined_parm_count; ++p) {
+        if (p) *(buf++) = ',';
+        int16_t *v = (int16_t *)(((int8_t *)mi) +
+                                 parm_offsets[combined_parm_list[p]].offset);
+        buf += put_num(buf, 0, v[0], 0);
+      }
+      *(buf++) = ']';
+      if (c < mi_cols - 1) *(buf++) = ',';
+    }
+    *(buf++) = ']';
+    if (r < mi_rows - 1) *(buf++) = ',';
+  }
+  buf += put_str(buf, "],\n");
+  return (int)(buf - buffer);
+}
+
 int put_block_info(char *buffer, const map_entry *map, const char *name,
                    size_t offset, int len) {
   const int mi_rows = frame_data.mi_rows;
@@ -507,9 +609,11 @@ int put_accounting(char *buffer) {
 }
 #endif
 
+int skip_non_transform = 0;
+
 void inspect(void *pbi, void *data) {
   /* Fetch frame data. */
-  ifd_inspect(&frame_data, pbi);
+  ifd_inspect(&frame_data, pbi, skip_non_transform);
 
   // Show existing frames just show a reference buffer we've already decoded.
   // There's no information to show.
@@ -584,6 +688,19 @@ void inspect(void *pbi, void *data) {
   if (layers & MOTION_VECTORS_LAYER) {
     buf += put_motion_vectors(buf);
   }
+  if (layers & INTRABC_LAYER) {
+    buf += put_block_info(buf, intrabc_map, "intrabc",
+                          offsetof(insp_mi_data, intrabc), 0);
+  }
+  if (layers & PALETTE_LAYER) {
+    buf += put_block_info(buf, palette_map, "palette",
+                          offsetof(insp_mi_data, palette), 0);
+  }
+  if (layers & UV_PALETTE_LAYER) {
+    buf += put_block_info(buf, palette_map, "uv_palette",
+                          offsetof(insp_mi_data, uv_palette), 0);
+  }
+  if (combined_parm_count > 0) buf += put_combined(buf);
   if (layers & REFERENCE_FRAME_LAYER) {
     buf += put_block_info(buf, refs_map, "referenceFrame",
                           offsetof(insp_mi_data, ref_frame), 2);
@@ -775,6 +892,12 @@ static void parse_args(char **argv) {
       layers |= Q_INDEX_LAYER;
     else if (arg_match(&arg, &dump_seg_id_arg, argi))
       layers |= SEGMENT_ID_LAYER;
+    else if (arg_match(&arg, &dump_intrabc_arg, argi))
+      layers |= INTRABC_LAYER;
+    else if (arg_match(&arg, &dump_palette_arg, argi))
+      layers |= PALETTE_LAYER;
+    else if (arg_match(&arg, &dump_uv_palette_arg, argi))
+      layers |= UV_PALETTE_LAYER;
     else if (arg_match(&arg, &dump_all_arg, argi))
       layers |= ALL_LAYERS;
     else if (arg_match(&arg, &compress_arg, argi))
@@ -783,6 +906,13 @@ static void parse_args(char **argv) {
       usage_exit();
     else if (arg_match(&arg, &limit_arg, argi))
       stop_after = arg_parse_uint(&arg);
+    else if (arg_match(&arg, &skip_non_transform_arg, argi))
+      skip_non_transform = arg_parse_uint(&arg);
+    else if (arg_match(&arg, &combined_arg, argi))
+      convert_to_indices(
+          (char *)arg.val, combined_parm_list,
+          sizeof(combined_parm_list) / sizeof(combined_parm_list[0]),
+          &combined_parm_count);
     else
       argj++;
   }
diff --git a/libaom/examples/lightfield_bitstream_parsing.c b/libaom/examples/lightfield_bitstream_parsing.c
index 9c90671..afacf44 100644
--- a/libaom/examples/lightfield_bitstream_parsing.c
+++ b/libaom/examples/lightfield_bitstream_parsing.c
@@ -211,6 +211,8 @@ int main(int argc, char **argv) {
   num_references = (int)strtol(argv[3], NULL, 0);
   info = aom_video_reader_get_info(reader);
 
+  aom_video_reader_set_fourcc(reader, AV1_FOURCC);
+
   // The writer to write out ivf file in tile list OBU, which can be decoded by
   // AV1 decoder.
   writer = aom_video_writer_open(argv[2], kContainerIVF, info);
diff --git a/libaom/examples/lightfield_decoder.c b/libaom/examples/lightfield_decoder.c
index 23dac98..7a445f0 100644
--- a/libaom/examples/lightfield_decoder.c
+++ b/libaom/examples/lightfield_decoder.c
@@ -188,8 +188,10 @@ int main(int argc, char **argv) {
 
   info = aom_video_reader_get_info(reader);
 
-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
-  if (!decoder) die("Unknown input codec.");
+  if (info->codec_fourcc == LST_FOURCC)
+    decoder = get_aom_decoder_by_fourcc(AV1_FOURCC);
+  else
+    die("Unknown input codec.");
   printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
 
   if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
@@ -218,7 +220,7 @@ int main(int argc, char **argv) {
       // Allocate memory to store decoded references. Allocate memory with the
       // border so that it can be used as a reference.
       for (j = 0; j < num_references; j++) {
-        unsigned int border = AOM_BORDER_IN_PIXELS;
+        unsigned int border = AOM_DEC_BORDER_IN_PIXELS;
         if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
                                        frame_res[0], frame_res[1], 32, 8,
                                        border)) {
diff --git a/libaom/examples/lightfield_encoder.c b/libaom/examples/lightfield_encoder.c
index e55cd5c..4dd71ca 100644
--- a/libaom/examples/lightfield_encoder.c
+++ b/libaom/examples/lightfield_encoder.c
@@ -275,9 +275,13 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
   aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
   if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
   // Allocate memory with the border so that it can be used as a reference.
+  int border_in_pixels =
+      (codec.config.enc->rc_resize_mode || codec.config.enc->rc_superres_mode)
+          ? AOM_BORDER_IN_PIXELS
+          : AOM_ENC_NO_SCALE_BORDER;
   for (i = 0; i < reference_image_num; i++) {
     if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, cfg->g_w,
-                                   cfg->g_h, 32, 8, AOM_BORDER_IN_PIXELS)) {
+                                   cfg->g_h, 32, 8, border_in_pixels)) {
       die("Failed to allocate image.");
     }
   }
@@ -393,6 +397,10 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
   for (i = 0; i < reference_image_num; i++) aom_img_free(&reference_images[i]);
 
   if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  // Modify large_scale_file fourcc.
+  if (cfg->large_scale_tile == 1)
+    aom_video_writer_set_fourcc(writer, LST_FOURCC);
   aom_video_writer_close(writer);
 
   printf("\nSecond pass complete. Processed %d frames.\n", frame_count);
diff --git a/libaom/examples/lightfield_tile_list_decoder.c b/libaom/examples/lightfield_tile_list_decoder.c
index 4aabde1..87a8b43 100644
--- a/libaom/examples/lightfield_tile_list_decoder.c
+++ b/libaom/examples/lightfield_tile_list_decoder.c
@@ -160,7 +160,7 @@ int main(int argc, char **argv) {
       // Allocate memory to store decoded references. Allocate memory with the
       // border so that it can be used as a reference.
       for (j = 0; j < num_references; j++) {
-        unsigned int border = AOM_BORDER_IN_PIXELS;
+        unsigned int border = AOM_DEC_BORDER_IN_PIXELS;
         if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
                                        frame_res[0], frame_res[1], 32, 8,
                                        border)) {
diff --git a/libaom/test/av1_convolve_2d_test.cc b/libaom/test/av1_convolve_2d_test.cc
index 825cef2..b0cef81 100644
--- a/libaom/test/av1_convolve_2d_test.cc
+++ b/libaom/test/av1_convolve_2d_test.cc
@@ -19,6 +19,7 @@ using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DSrTest;
 using libaom_test::AV1HighbdConvolve2D::AV1HighbdJntConvolve2DTest;
 using ::testing::make_tuple;
 using ::testing::tuple;
+
 namespace {
 
 TEST_P(AV1Convolve2DSrTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
@@ -89,72 +90,72 @@ INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1Convolve2DSrTest,
 TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
 TEST_P(AV1JntConvolve2DTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
 
-INSTANTIATE_TEST_CASE_P(
-    C_COPY, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_copy_c, 0, 0));
+INSTANTIATE_TEST_CASE_P(C_COPY, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_dist_wtd_convolve_2d_copy_c, 0, 0));
 
 INSTANTIATE_TEST_CASE_P(
     C_X, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_c, 1, 0));
+    libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_x_c, 1, 0));
 
 INSTANTIATE_TEST_CASE_P(
     C_Y, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_c, 0, 1));
+    libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_y_c, 0, 1));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1JntConvolve2DTest,
                         libaom_test::AV1Convolve2D::BuildParams(
-                            av1_jnt_convolve_2d_copy_sse2, 0, 0));
-INSTANTIATE_TEST_CASE_P(
-    SSE2, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_sse2, 1, 1));
+                            av1_dist_wtd_convolve_2d_copy_sse2, 0, 0));
+INSTANTIATE_TEST_CASE_P(SSE2, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_dist_wtd_convolve_2d_sse2, 1, 1));
 
-INSTANTIATE_TEST_CASE_P(
-    SSE2_X, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_sse2, 1, 0));
+INSTANTIATE_TEST_CASE_P(SSE2_X, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_dist_wtd_convolve_x_sse2, 1, 0));
 
-INSTANTIATE_TEST_CASE_P(
-    SSE2_Y, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_sse2, 0, 1));
+INSTANTIATE_TEST_CASE_P(SSE2_Y, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_dist_wtd_convolve_y_sse2, 0, 1));
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_ssse3, 1, 1));
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_dist_wtd_convolve_2d_ssse3, 1, 1));
 
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1JntConvolve2DTest,
                         libaom_test::AV1Convolve2D::BuildParams(
-                            av1_jnt_convolve_2d_copy_avx2, 0, 0));
-INSTANTIATE_TEST_CASE_P(
-    AVX2_X, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_avx2, 1, 0));
+                            av1_dist_wtd_convolve_2d_copy_avx2, 0, 0));
+INSTANTIATE_TEST_CASE_P(AVX2_X, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_dist_wtd_convolve_x_avx2, 1, 0));
 
-INSTANTIATE_TEST_CASE_P(
-    AVX2_Y, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_avx2, 0, 1));
+INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_dist_wtd_convolve_y_avx2, 0, 1));
 
-INSTANTIATE_TEST_CASE_P(
-    AVX2, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_avx2, 1, 1));
+INSTANTIATE_TEST_CASE_P(AVX2, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_dist_wtd_convolve_2d_avx2, 1, 1));
 #endif  // HAVE_AVX2
 #endif  // HAVE_SSSE3
 #endif  // HAVE_SSE2
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1JntConvolve2DTest,
                         libaom_test::AV1Convolve2D::BuildParams(
-                            av1_jnt_convolve_2d_copy_neon, 0, 0));
+                            av1_dist_wtd_convolve_2d_copy_neon, 0, 0));
 
-INSTANTIATE_TEST_CASE_P(
-    NEON, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_neon, 1, 1));
-INSTANTIATE_TEST_CASE_P(
-    NEON_X, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_neon, 1, 0));
+INSTANTIATE_TEST_CASE_P(NEON, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_dist_wtd_convolve_2d_neon, 1, 1));
+INSTANTIATE_TEST_CASE_P(NEON_X, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_dist_wtd_convolve_x_neon, 1, 0));
 
-INSTANTIATE_TEST_CASE_P(
-    NEON_Y, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_neon, 0, 1));
+INSTANTIATE_TEST_CASE_P(NEON_Y, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_dist_wtd_convolve_y_neon, 0, 1));
 #endif  // HAVE_NEON
 
 TEST_P(AV1HighbdConvolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
@@ -213,41 +214,41 @@ TEST_P(AV1HighbdJntConvolve2DTest, DISABLED_Speed) {
 
 INSTANTIATE_TEST_CASE_P(C_X, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_jnt_convolve_x_c, 1, 0));
+                            av1_highbd_dist_wtd_convolve_x_c, 1, 0));
 
 INSTANTIATE_TEST_CASE_P(C_Y, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_jnt_convolve_y_c, 0, 1));
+                            av1_highbd_dist_wtd_convolve_y_c, 0, 1));
 
 INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_jnt_convolve_2d_copy_c, 0, 0));
+                            av1_highbd_dist_wtd_convolve_2d_copy_c, 0, 0));
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_jnt_convolve_2d_copy_sse4_1, 0, 0));
+                            av1_highbd_dist_wtd_convolve_2d_copy_sse4_1, 0, 0));
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_jnt_convolve_2d_sse4_1, 1, 1));
+                            av1_highbd_dist_wtd_convolve_2d_sse4_1, 1, 1));
 INSTANTIATE_TEST_CASE_P(SSE4_1_X, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_jnt_convolve_x_sse4_1, 1, 0));
+                            av1_highbd_dist_wtd_convolve_x_sse4_1, 1, 0));
 INSTANTIATE_TEST_CASE_P(SSE4_1_Y, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_jnt_convolve_y_sse4_1, 0, 1));
+                            av1_highbd_dist_wtd_convolve_y_sse4_1, 0, 1));
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_jnt_convolve_2d_copy_avx2, 0, 0));
+                            av1_highbd_dist_wtd_convolve_2d_copy_avx2, 0, 0));
 INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_jnt_convolve_2d_avx2, 1, 1));
+                            av1_highbd_dist_wtd_convolve_2d_avx2, 1, 1));
 INSTANTIATE_TEST_CASE_P(AVX2_X, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_jnt_convolve_x_avx2, 1, 0));
+                            av1_highbd_dist_wtd_convolve_x_avx2, 1, 0));
 INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_jnt_convolve_y_avx2, 0, 1));
+                            av1_highbd_dist_wtd_convolve_y_avx2, 0, 1));
 #endif  // HAVE_AVX2
 #endif  // HAVE_SSE4_1
 }  // namespace
diff --git a/libaom/test/av1_convolve_2d_test_util.cc b/libaom/test/av1_convolve_2d_test_util.cc
index 409fd23..9cfe3e6 100644
--- a/libaom/test/av1_convolve_2d_test_util.cc
+++ b/libaom/test/av1_convolve_2d_test_util.cc
@@ -200,9 +200,9 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
         ConvolveParams conv_params2 =
             get_conv_params_no_round(do_average, 0, output2, MAX_SB_SIZE, 1, 8);
 
-        // Test special case where jnt_comp_avg is not used
-        conv_params1.use_jnt_comp_avg = 0;
-        conv_params2.use_jnt_comp_avg = 0;
+        // Test special case where dist_wtd_comp_avg is not used
+        conv_params1.use_dist_wtd_comp_avg = 0;
+        conv_params2.use_dist_wtd_comp_avg = 0;
 
         const int subx_range = has_subx ? 16 : 1;
         const int suby_range = has_suby ? 16 : 1;
@@ -211,9 +211,10 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
             // Choose random locations within the source block
             const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
             const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-            av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w, output8_1,
-                                  MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                                  filter_params_y, subx, suby, &conv_params1);
+            av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w,
+                                       output8_1, MAX_SB_SIZE, out_w, out_h,
+                                       filter_params_x, filter_params_y, subx,
+                                       suby, &conv_params1);
             test_impl(input + offset_r * w + offset_c, w, output8_2,
                       MAX_SB_SIZE, out_w, out_h, filter_params_x,
                       filter_params_y, subx, suby, &conv_params2);
@@ -222,7 +223,7 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
               for (int j = 0; j < out_w; ++j) {
                 int idx = i * MAX_SB_SIZE + j;
                 ASSERT_EQ(output1[idx], output2[idx])
-                    << "Mismatch at unit tests for av1_jnt_convolve_2d\n"
+                    << "Mismatch at unit tests for av1_dist_wtd_convolve_2d\n"
                     << out_w << "x" << out_h << " Pixel mismatch at index "
                     << idx << " = (" << i << ", " << j
                     << "), sub pixel offset = (" << suby << ", " << subx << ")";
@@ -247,8 +248,8 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
         // Test different combination of fwd and bck offset weights
         for (int k = 0; k < 2; ++k) {
           for (int l = 0; l < 4; ++l) {
-            conv_params1.use_jnt_comp_avg = 1;
-            conv_params2.use_jnt_comp_avg = 1;
+            conv_params1.use_dist_wtd_comp_avg = 1;
+            conv_params2.use_dist_wtd_comp_avg = 1;
             conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
             conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
             conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
@@ -259,10 +260,10 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
                 // Choose random locations within the source block
                 const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
                 const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-                av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w,
-                                      output8_1, MAX_SB_SIZE, out_w, out_h,
-                                      filter_params_x, filter_params_y, subx,
-                                      suby, &conv_params1);
+                av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w,
+                                           output8_1, MAX_SB_SIZE, out_w, out_h,
+                                           filter_params_x, filter_params_y,
+                                           subx, suby, &conv_params1);
                 test_impl(input + offset_r * w + offset_c, w, output8_2,
                           MAX_SB_SIZE, out_w, out_h, filter_params_x,
                           filter_params_y, subx, suby, &conv_params2);
@@ -272,7 +273,7 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
                     int idx = i * MAX_SB_SIZE + j;
                     ASSERT_EQ(output1[idx], output2[idx])
                         << "Mismatch at unit tests for "
-                           "av1_jnt_convolve_2d\n"
+                           "av1_dist_wtd_convolve_2d\n"
                         << out_w << "x" << out_h << " Pixel mismatch at index "
                         << idx << " = (" << i << ", " << j
                         << "), sub pixel offset = (" << suby << ", " << subx
@@ -333,7 +334,7 @@ void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) {
   ConvolveParams conv_params =
       get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, 8);
 
-  conv_params.use_jnt_comp_avg = 0;
+  conv_params.use_dist_wtd_comp_avg = 0;
 
   // Choose random locations within the source block
   const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
@@ -540,8 +541,8 @@ void AV1HighbdJntConvolve2DTest::RunSpeedTest(
   ConvolveParams conv_params =
       get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, bd);
 
-  // Test special case where jnt_comp_avg is not used
-  conv_params.use_jnt_comp_avg = 0;
+  // Test special case where dist_wtd_comp_avg is not used
+  conv_params.use_dist_wtd_comp_avg = 0;
 
   subx = 0;
   suby = 0;
@@ -601,9 +602,9 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
         ConvolveParams conv_params2 = get_conv_params_no_round(
             do_average, 0, output2, MAX_SB_SIZE, 1, bd);
 
-        // Test special case where jnt_comp_avg is not used
-        conv_params1.use_jnt_comp_avg = 0;
-        conv_params2.use_jnt_comp_avg = 0;
+        // Test special case where dist_wtd_comp_avg is not used
+        conv_params1.use_dist_wtd_comp_avg = 0;
+        conv_params2.use_dist_wtd_comp_avg = 0;
 
         const int subx_range = has_subx ? 16 : 1;
         const int suby_range = has_suby ? 16 : 1;
@@ -612,10 +613,10 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
             // Choose random locations within the source block
             const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
             const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-            av1_highbd_jnt_convolve_2d_c(input + offset_r * w + offset_c, w,
-                                         output16_1, MAX_SB_SIZE, out_w, out_h,
-                                         filter_params_x, filter_params_y, subx,
-                                         suby, &conv_params1, bd);
+            av1_highbd_dist_wtd_convolve_2d_c(
+                input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE,
+                out_w, out_h, filter_params_x, filter_params_y, subx, suby,
+                &conv_params1, bd);
             test_impl(input + offset_r * w + offset_c, w, output16_2,
                       MAX_SB_SIZE, out_w, out_h, filter_params_x,
                       filter_params_y, subx, suby, &conv_params2, bd);
@@ -648,8 +649,8 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
         // Test different combination of fwd and bck offset weights
         for (int k = 0; k < 2; ++k) {
           for (int l = 0; l < 4; ++l) {
-            conv_params1.use_jnt_comp_avg = 1;
-            conv_params2.use_jnt_comp_avg = 1;
+            conv_params1.use_dist_wtd_comp_avg = 1;
+            conv_params2.use_dist_wtd_comp_avg = 1;
             conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
             conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
             conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
@@ -662,7 +663,7 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
                 // Choose random locations within the source block
                 const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
                 const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-                av1_highbd_jnt_convolve_2d_c(
+                av1_highbd_dist_wtd_convolve_2d_c(
                     input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE,
                     out_w, out_h, filter_params_x, filter_params_y, subx, suby,
                     &conv_params1, bd);
diff --git a/libaom/test/av1_convolve_scale_test.cc b/libaom/test/av1_convolve_scale_test.cc
index 1929c49..a933fc9 100644
--- a/libaom/test/av1_convolve_scale_test.cc
+++ b/libaom/test/av1_convolve_scale_test.cc
@@ -286,13 +286,13 @@ class ConvolveScaleTestBase : public ::testing::Test {
   }
 
   void SetConvParamOffset(int i, int j, int is_compound, int do_average,
-                          int use_jnt_comp_avg) {
+                          int use_dist_wtd_comp_avg) {
     if (i == -1 && j == -1) {
-      convolve_params_.use_jnt_comp_avg = use_jnt_comp_avg;
+      convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg;
       convolve_params_.is_compound = is_compound;
       convolve_params_.do_average = do_average;
     } else {
-      convolve_params_.use_jnt_comp_avg = use_jnt_comp_avg;
+      convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg;
       convolve_params_.fwd_offset = quant_dist_lookup_table[i][j][0];
       convolve_params_.bck_offset = quant_dist_lookup_table[i][j][1];
       convolve_params_.is_compound = is_compound;
@@ -312,12 +312,12 @@ class ConvolveScaleTestBase : public ::testing::Test {
 
       is_compound = 1;
       for (int do_average = 0; do_average < 2; do_average++) {
-        for (int use_jnt_comp_avg = 0; use_jnt_comp_avg < 2;
-             use_jnt_comp_avg++) {
+        for (int use_dist_wtd_comp_avg = 0; use_dist_wtd_comp_avg < 2;
+             use_dist_wtd_comp_avg++) {
           for (int j = 0; j < 2; ++j) {
             for (int k = 0; k < 4; ++k) {
               SetConvParamOffset(j, k, is_compound, do_average,
-                                 use_jnt_comp_avg);
+                                 use_dist_wtd_comp_avg);
               Prep(&rnd);
               RunOne(true);
               RunOne(false);
diff --git a/libaom/test/av1_fwd_txfm2d_test.cc b/libaom/test/av1_fwd_txfm2d_test.cc
index c1b97f7..eb09cb1 100644
--- a/libaom/test/av1_fwd_txfm2d_test.cc
+++ b/libaom/test/av1_fwd_txfm2d_test.cc
@@ -288,6 +288,68 @@ void AV1FwdTxfm2dMatchTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
   }
 }
 
+void AV1FwdTxfm2dSpeedTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
+  TxfmParam param;
+  memset(&param, 0, sizeof(param));
+  const int rows = tx_size_high[tx_size];
+  const int cols = tx_size_wide[tx_size];
+  const int num_loops = 1000000 / (rows * cols);
+
+  for (int i = 0; i < 2; ++i) {
+    const int bd = 8;
+    for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      if (libaom_test::IsTxSizeTypeValid(
+              tx_size, static_cast<TX_TYPE>(tx_type)) == false) {
+        continue;
+      }
+
+      FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+      if (ref_func != NULL) {
+        DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+        DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+        DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+        int input_stride = 64;
+        ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+        for (int r = 0; r < rows; ++r) {
+          for (int c = 0; c < cols; ++c) {
+            input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+          }
+        }
+
+        param.tx_type = (TX_TYPE)tx_type;
+        param.tx_size = (TX_SIZE)tx_size;
+        param.tx_set_type = EXT_TX_SET_ALL16;
+        param.bd = bd;
+
+        aom_usec_timer ref_timer, test_timer;
+
+        aom_usec_timer_start(&ref_timer);
+        for (int i = 0; i < num_loops; ++i) {
+          ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+        }
+        aom_usec_timer_mark(&ref_timer);
+        const int elapsed_time_c =
+            static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+        aom_usec_timer_start(&test_timer);
+        for (int i = 0; i < num_loops; ++i) {
+          target_func(input, output, input_stride, &param);
+        }
+        aom_usec_timer_mark(&test_timer);
+        const int elapsed_time_simd =
+            static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+        printf(
+            "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t "
+            "gain=%d \n",
+            tx_size, tx_type, elapsed_time_c, elapsed_time_simd,
+            (elapsed_time_c / elapsed_time_simd));
+      }
+    }
+  }
+}
+
 typedef ::testing::tuple<TX_SIZE, lowbd_fwd_txfm_func> LbdFwdTxfm2dParam;
 
 class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {};
@@ -295,7 +357,9 @@ class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {};
 TEST_P(AV1FwdTxfm2dTest, match) {
   AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
 }
-
+TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) {
+  AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
 using ::testing::Combine;
 using ::testing::Values;
 using ::testing::ValuesIn;
@@ -507,5 +571,12 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdFwdTxfm2dTest,
                         Combine(ValuesIn(Highbd_fwd_txfm_for_sse4_1),
                                 Values(av1_highbd_fwd_txfm)));
 #endif  // HAVE_SSE4_1
+#if HAVE_AVX2
+static TX_SIZE Highbd_fwd_txfm_for_avx2[] = { TX_8X8,   TX_16X16, TX_32X32,
+                                              TX_64X64, TX_8X16,  TX_16X8 };
 
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdFwdTxfm2dTest,
+                        Combine(ValuesIn(Highbd_fwd_txfm_for_avx2),
+                                Values(av1_highbd_fwd_txfm)));
+#endif  // HAVE_AVX2
 }  // namespace
diff --git a/libaom/test/av1_highbd_iht_test.cc b/libaom/test/av1_highbd_iht_test.cc
index 7f077b6..6d77cbf 100644
--- a/libaom/test/av1_highbd_iht_test.cc
+++ b/libaom/test/av1_highbd_iht_test.cc
@@ -308,7 +308,8 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvTxfm2d,
                         ::testing::Values(av1_highbd_inv_txfm_add_sse4_1));
 #endif
 
-#if HAVE_AVX2
+// TODO(http://crbug.com/aomedia/2350): these cause test vector mismatches.
+#if 0  // HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdInvTxfm2d,
                         ::testing::Values(av1_highbd_inv_txfm_add_avx2));
 #endif
diff --git a/libaom/test/av1_round_shift_array_test.cc b/libaom/test/av1_round_shift_array_test.cc
index 181a394..61dbed5 100644
--- a/libaom/test/av1_round_shift_array_test.cc
+++ b/libaom/test/av1_round_shift_array_test.cc
@@ -13,7 +13,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/aom_timer.h"
diff --git a/libaom/test/av1_txfm_test.h b/libaom/test/av1_txfm_test.h
index a181647..5a56d28 100644
--- a/libaom/test/av1_txfm_test.h
+++ b/libaom/test/av1_txfm_test.h
@@ -29,14 +29,14 @@
 #include "av1/common/enums.h"
 
 namespace libaom_test {
-typedef enum {
+enum {
   TYPE_DCT = 0,
   TYPE_ADST,
   TYPE_IDTX,
   TYPE_IDCT,
   TYPE_IADST,
   TYPE_LAST
-} TYPE_TXFM;
+} UENUM1BYTE(TYPE_TXFM);
 
 int get_txfm1d_size(TX_SIZE tx_size);
 
diff --git a/libaom/test/comp_avg_pred_test.cc b/libaom/test/comp_avg_pred_test.cc
index 9c6ed90..3e5632e 100644
--- a/libaom/test/comp_avg_pred_test.cc
+++ b/libaom/test/comp_avg_pred_test.cc
@@ -12,61 +12,65 @@
 #include "test/comp_avg_pred_test.h"
 
 using libaom_test::ACMRandom;
-using libaom_test::AV1JNTCOMPAVG::AV1HighBDJNTCOMPAVGTest;
-using libaom_test::AV1JNTCOMPAVG::AV1HighBDJNTCOMPAVGUPSAMPLEDTest;
-using libaom_test::AV1JNTCOMPAVG::AV1JNTCOMPAVGTest;
-using libaom_test::AV1JNTCOMPAVG::AV1JNTCOMPAVGUPSAMPLEDTest;
+using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest;
+using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest;
+using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest;
+using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest;
 using ::testing::make_tuple;
 using ::testing::tuple;
 
 namespace {
 
-TEST_P(AV1JNTCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+TEST_P(AV1DISTWTDCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
 
-TEST_P(AV1JNTCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+TEST_P(AV1DISTWTDCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, AV1JNTCOMPAVGTest,
-    libaom_test::AV1JNTCOMPAVG::BuildParams(aom_jnt_comp_avg_pred_ssse3));
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1DISTWTDCOMPAVGTest,
+                        libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+                            aom_dist_wtd_comp_avg_pred_ssse3));
 #endif
 
-TEST_P(AV1JNTCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
+TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
   RunSpeedTest(GET_PARAM(0));
 }
 
-TEST_P(AV1JNTCOMPAVGUPSAMPLEDTest, CheckOutput) {
+TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) {
   RunCheckOutput(GET_PARAM(0));
 }
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, AV1JNTCOMPAVGUPSAMPLEDTest,
-                        libaom_test::AV1JNTCOMPAVG::BuildParams(
-                            aom_jnt_comp_avg_upsampled_pred_ssse3));
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest,
+                        libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+                            aom_dist_wtd_comp_avg_upsampled_pred_ssse3));
 #endif
 
-TEST_P(AV1HighBDJNTCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(1)); }
+TEST_P(AV1HighBDDISTWTDCOMPAVGTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(1));
+}
 
-TEST_P(AV1HighBDJNTCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
+TEST_P(AV1HighBDDISTWTDCOMPAVGTest, CheckOutput) {
+  RunCheckOutput(GET_PARAM(1));
+}
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDJNTCOMPAVGTest,
-                        libaom_test::AV1JNTCOMPAVG::BuildParams(
-                            aom_highbd_jnt_comp_avg_pred_sse2, 1));
+INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDDISTWTDCOMPAVGTest,
+                        libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+                            aom_highbd_dist_wtd_comp_avg_pred_sse2, 1));
 #endif
 
-TEST_P(AV1HighBDJNTCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
+TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
   RunSpeedTest(GET_PARAM(1));
 }
 
-TEST_P(AV1HighBDJNTCOMPAVGUPSAMPLEDTest, CheckOutput) {
+TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) {
   RunCheckOutput(GET_PARAM(1));
 }
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDJNTCOMPAVGUPSAMPLEDTest,
-                        libaom_test::AV1JNTCOMPAVG::BuildParams(
-                            aom_highbd_jnt_comp_avg_upsampled_pred_sse2));
+INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest,
+                        libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+                            aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2));
 #endif
 
 }  // namespace
diff --git a/libaom/test/comp_avg_pred_test.h b/libaom/test/comp_avg_pred_test.h
index 65a0153..01ea35d 100644
--- a/libaom/test/comp_avg_pred_test.h
+++ b/libaom/test/comp_avg_pred_test.h
@@ -25,72 +25,73 @@
 namespace libaom_test {
 const int kMaxSize = 128 + 32;  // padding
 
-namespace AV1JNTCOMPAVG {
+namespace AV1DISTWTDCOMPAVG {
 
-typedef void (*jntcompavg_func)(uint8_t *comp_pred, const uint8_t *pred,
-                                int width, int height, const uint8_t *ref,
-                                int ref_stride,
-                                const JNT_COMP_PARAMS *jcp_param);
+typedef void (*distwtdcompavg_func)(uint8_t *comp_pred, const uint8_t *pred,
+                                    int width, int height, const uint8_t *ref,
+                                    int ref_stride,
+                                    const DIST_WTD_COMP_PARAMS *jcp_param);
 
-typedef void (*jntcompavgupsampled_func)(
+typedef void (*distwtdcompavgupsampled_func)(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search);
+    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
 
-typedef void (*highbdjntcompavgupsampled_func)(
+typedef void (*highbddistwtdcompavgupsampled_func)(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
     int subpel_search);
 
-typedef ::testing::tuple<jntcompavg_func, BLOCK_SIZE> JNTCOMPAVGParam;
+typedef ::testing::tuple<distwtdcompavg_func, BLOCK_SIZE> DISTWTDCOMPAVGParam;
 
-typedef ::testing::tuple<jntcompavgupsampled_func, BLOCK_SIZE>
-    JNTCOMPAVGUPSAMPLEDParam;
+typedef ::testing::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE>
+    DISTWTDCOMPAVGUPSAMPLEDParam;
 
-typedef ::testing::tuple<int, jntcompavg_func, BLOCK_SIZE>
-    HighbdJNTCOMPAVGParam;
+typedef ::testing::tuple<int, distwtdcompavg_func, BLOCK_SIZE>
+    HighbdDISTWTDCOMPAVGParam;
 
-typedef ::testing::tuple<int, highbdjntcompavgupsampled_func, BLOCK_SIZE>
-    HighbdJNTCOMPAVGUPSAMPLEDParam;
+typedef ::testing::tuple<int, highbddistwtdcompavgupsampled_func, BLOCK_SIZE>
+    HighbdDISTWTDCOMPAVGUPSAMPLEDParam;
 
-::testing::internal::ParamGenerator<JNTCOMPAVGParam> BuildParams(
-    jntcompavg_func filter) {
+::testing::internal::ParamGenerator<DISTWTDCOMPAVGParam> BuildParams(
+    distwtdcompavg_func filter) {
   return ::testing::Combine(::testing::Values(filter),
                             ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
 }
 
-::testing::internal::ParamGenerator<JNTCOMPAVGUPSAMPLEDParam> BuildParams(
-    jntcompavgupsampled_func filter) {
+::testing::internal::ParamGenerator<DISTWTDCOMPAVGUPSAMPLEDParam> BuildParams(
+    distwtdcompavgupsampled_func filter) {
   return ::testing::Combine(::testing::Values(filter),
                             ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
 }
 
-::testing::internal::ParamGenerator<HighbdJNTCOMPAVGParam> BuildParams(
-    jntcompavg_func filter, int is_hbd) {
+::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGParam> BuildParams(
+    distwtdcompavg_func filter, int is_hbd) {
   (void)is_hbd;
   return ::testing::Combine(::testing::Range(8, 13, 2),
                             ::testing::Values(filter),
                             ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
 }
 
-::testing::internal::ParamGenerator<HighbdJNTCOMPAVGUPSAMPLEDParam> BuildParams(
-    highbdjntcompavgupsampled_func filter) {
+::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGUPSAMPLEDParam>
+BuildParams(highbddistwtdcompavgupsampled_func filter) {
   return ::testing::Combine(::testing::Range(8, 13, 2),
                             ::testing::Values(filter),
                             ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
 }
 
-class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> {
+class AV1DISTWTDCOMPAVGTest
+    : public ::testing::TestWithParam<DISTWTDCOMPAVGParam> {
  public:
-  ~AV1JNTCOMPAVGTest() {}
+  ~AV1DISTWTDCOMPAVGTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
   void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
-  void RunCheckOutput(jntcompavg_func test_impl) {
+  void RunCheckOutput(distwtdcompavg_func test_impl) {
     const int w = kMaxSize, h = kMaxSize;
     const int block_idx = GET_PARAM(1);
 
@@ -107,27 +108,27 @@ class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> {
     const int in_w = block_size_wide[block_idx];
     const int in_h = block_size_high[block_idx];
 
-    JNT_COMP_PARAMS jnt_comp_params;
-    jnt_comp_params.use_jnt_comp_avg = 1;
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
     for (int ii = 0; ii < 2; ii++) {
       for (int jj = 0; jj < 4; jj++) {
-        jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-        jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+        dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+        dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
 
         const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
         const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
-        aom_jnt_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c, in_w,
-                                in_h, ref8 + offset_r * w + offset_c, in_w,
-                                &jnt_comp_params);
+        aom_dist_wtd_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c,
+                                     in_w, in_h, ref8 + offset_r * w + offset_c,
+                                     in_w, &dist_wtd_comp_params);
         test_impl(output2, pred8 + offset_r * w + offset_c, in_w, in_h,
-                  ref8 + offset_r * w + offset_c, in_w, &jnt_comp_params);
+                  ref8 + offset_r * w + offset_c, in_w, &dist_wtd_comp_params);
 
         for (int i = 0; i < in_h; ++i) {
           for (int j = 0; j < in_w; ++j) {
             int idx = i * in_w + j;
             ASSERT_EQ(output[idx], output2[idx])
-                << "Mismatch at unit tests for AV1JNTCOMPAVGTest\n"
+                << "Mismatch at unit tests for AV1DISTWTDCOMPAVGTest\n"
                 << in_w << "x" << in_h << " Pixel mismatch at index " << idx
                 << " = (" << i << ", " << j << ")";
           }
@@ -135,7 +136,7 @@ class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> {
       }
     }
   }
-  void RunSpeedTest(jntcompavg_func test_impl) {
+  void RunSpeedTest(distwtdcompavg_func test_impl) {
     const int w = kMaxSize, h = kMaxSize;
     const int block_idx = GET_PARAM(1);
 
@@ -152,49 +153,49 @@ class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> {
     const int in_w = block_size_wide[block_idx];
     const int in_h = block_size_high[block_idx];
 
-    JNT_COMP_PARAMS jnt_comp_params;
-    jnt_comp_params.use_jnt_comp_avg = 1;
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
 
     const int num_loops = 1000000000 / (in_w + in_h);
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
 
     for (int i = 0; i < num_loops; ++i)
-      aom_jnt_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w,
-                              &jnt_comp_params);
+      aom_dist_wtd_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w,
+                                   &dist_wtd_comp_params);
 
     aom_usec_timer_mark(&timer);
     const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("jntcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+    printf("distwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
            1000.0 * elapsed_time / num_loops);
 
     aom_usec_timer timer1;
     aom_usec_timer_start(&timer1);
 
     for (int i = 0; i < num_loops; ++i)
-      test_impl(output2, pred8, in_w, in_h, ref8, in_w, &jnt_comp_params);
+      test_impl(output2, pred8, in_w, in_h, ref8, in_w, &dist_wtd_comp_params);
 
     aom_usec_timer_mark(&timer1);
     const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
-    printf("jntcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+    printf("distwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
            1000.0 * elapsed_time1 / num_loops);
   }
 
   libaom_test::ACMRandom rnd_;
-};  // class AV1JNTCOMPAVGTest
+};  // class AV1DISTWTDCOMPAVGTest
 
-class AV1JNTCOMPAVGUPSAMPLEDTest
-    : public ::testing::TestWithParam<JNTCOMPAVGUPSAMPLEDParam> {
+class AV1DISTWTDCOMPAVGUPSAMPLEDTest
+    : public ::testing::TestWithParam<DISTWTDCOMPAVGUPSAMPLEDParam> {
  public:
-  ~AV1JNTCOMPAVGUPSAMPLEDTest() {}
+  ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
   void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
-  void RunCheckOutput(jntcompavgupsampled_func test_impl) {
+  void RunCheckOutput(distwtdcompavgupsampled_func test_impl) {
     const int w = kMaxSize, h = kMaxSize;
     const int block_idx = GET_PARAM(1);
 
@@ -211,8 +212,8 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
     const int in_w = block_size_wide[block_idx];
     const int in_h = block_size_high[block_idx];
 
-    JNT_COMP_PARAMS jnt_comp_params;
-    jnt_comp_params.use_jnt_comp_avg = 1;
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
     int sub_x_q3, sub_y_q3;
     int subpel_search;
     for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
@@ -221,28 +222,30 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
         for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
           for (int ii = 0; ii < 2; ii++) {
             for (int jj = 0; jj < 4; jj++) {
-              jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-              jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+              dist_wtd_comp_params.fwd_offset =
+                  quant_dist_lookup_table[ii][jj][0];
+              dist_wtd_comp_params.bck_offset =
+                  quant_dist_lookup_table[ii][jj][1];
 
               const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
               const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
 
-              aom_jnt_comp_avg_upsampled_pred_c(
+              aom_dist_wtd_comp_avg_upsampled_pred_c(
                   NULL, NULL, 0, 0, NULL, output,
                   pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
                   sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
-                  &jnt_comp_params, subpel_search);
+                  &dist_wtd_comp_params, subpel_search);
               test_impl(NULL, NULL, 0, 0, NULL, output2,
                         pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
                         sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
-                        &jnt_comp_params, subpel_search);
+                        &dist_wtd_comp_params, subpel_search);
 
               for (int i = 0; i < in_h; ++i) {
                 for (int j = 0; j < in_w; ++j) {
                   int idx = i * in_w + j;
                   ASSERT_EQ(output[idx], output2[idx])
                       << "Mismatch at unit tests for "
-                         "AV1JNTCOMPAVGUPSAMPLEDTest\n"
+                         "AV1DISTWTDCOMPAVGUPSAMPLEDTest\n"
                       << in_w << "x" << in_h << " Pixel mismatch at index "
                       << idx << " = (" << i << ", " << j
                       << "), sub pixel offset = (" << sub_y_q3 << ", "
@@ -255,7 +258,7 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
       }
     }
   }
-  void RunSpeedTest(jntcompavgupsampled_func test_impl) {
+  void RunSpeedTest(distwtdcompavgupsampled_func test_impl) {
     const int w = kMaxSize, h = kMaxSize;
     const int block_idx = GET_PARAM(1);
 
@@ -272,11 +275,11 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
     const int in_w = block_size_wide[block_idx];
     const int in_h = block_size_high[block_idx];
 
-    JNT_COMP_PARAMS jnt_comp_params;
-    jnt_comp_params.use_jnt_comp_avg = 1;
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
 
     int sub_x_q3 = 0;
     int sub_y_q3 = 0;
@@ -287,13 +290,13 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
     int subpel_search = USE_8_TAPS;  // set to USE_4_TAPS to test 4-tap filter.
 
     for (int i = 0; i < num_loops; ++i)
-      aom_jnt_comp_avg_upsampled_pred_c(NULL, NULL, 0, 0, NULL, output, pred8,
-                                        in_w, in_h, sub_x_q3, sub_y_q3, ref8,
-                                        in_w, &jnt_comp_params, subpel_search);
+      aom_dist_wtd_comp_avg_upsampled_pred_c(
+          NULL, NULL, 0, 0, NULL, output, pred8, in_w, in_h, sub_x_q3, sub_y_q3,
+          ref8, in_w, &dist_wtd_comp_params, subpel_search);
 
     aom_usec_timer_mark(&timer);
     const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("jntcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+    printf("distwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
            1000.0 * elapsed_time / num_loops);
 
     aom_usec_timer timer1;
@@ -301,27 +304,27 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
 
     for (int i = 0; i < num_loops; ++i)
       test_impl(NULL, NULL, 0, 0, NULL, output2, pred8, in_w, in_h, sub_x_q3,
-                sub_y_q3, ref8, in_w, &jnt_comp_params, subpel_search);
+                sub_y_q3, ref8, in_w, &dist_wtd_comp_params, subpel_search);
 
     aom_usec_timer_mark(&timer1);
     const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
-    printf("jntcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+    printf("distwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
            1000.0 * elapsed_time1 / num_loops);
   }
 
   libaom_test::ACMRandom rnd_;
-};  // class AV1JNTCOMPAVGUPSAMPLEDTest
+};  // class AV1DISTWTDCOMPAVGUPSAMPLEDTest
 
-class AV1HighBDJNTCOMPAVGTest
-    : public ::testing::TestWithParam<HighbdJNTCOMPAVGParam> {
+class AV1HighBDDISTWTDCOMPAVGTest
+    : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGParam> {
  public:
-  ~AV1HighBDJNTCOMPAVGTest() {}
+  ~AV1HighBDDISTWTDCOMPAVGTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
   void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
-  void RunCheckOutput(jntcompavg_func test_impl) {
+  void RunCheckOutput(distwtdcompavg_func test_impl) {
     const int w = kMaxSize, h = kMaxSize;
     const int block_idx = GET_PARAM(2);
     const int bd = GET_PARAM(0);
@@ -338,31 +341,31 @@ class AV1HighBDJNTCOMPAVGTest
     const int in_w = block_size_wide[block_idx];
     const int in_h = block_size_high[block_idx];
 
-    JNT_COMP_PARAMS jnt_comp_params;
-    jnt_comp_params.use_jnt_comp_avg = 1;
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
     for (int ii = 0; ii < 2; ii++) {
       for (int jj = 0; jj < 4; jj++) {
-        jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-        jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+        dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+        dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
 
         const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
         const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
-        aom_highbd_jnt_comp_avg_pred_c(
+        aom_highbd_dist_wtd_comp_avg_pred_c(
             CONVERT_TO_BYTEPTR(output),
             CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h,
             CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
-            &jnt_comp_params);
+            &dist_wtd_comp_params);
         test_impl(CONVERT_TO_BYTEPTR(output2),
                   CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
                   in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c,
-                  in_w, &jnt_comp_params);
+                  in_w, &dist_wtd_comp_params);
 
         for (int i = 0; i < in_h; ++i) {
           for (int j = 0; j < in_w; ++j) {
             int idx = i * in_w + j;
             ASSERT_EQ(output[idx], output2[idx])
-                << "Mismatch at unit tests for AV1HighBDJNTCOMPAVGTest\n"
+                << "Mismatch at unit tests for AV1HighBDDISTWTDCOMPAVGTest\n"
                 << in_w << "x" << in_h << " Pixel mismatch at index " << idx
                 << " = (" << i << ", " << j << ")";
           }
@@ -370,7 +373,7 @@ class AV1HighBDJNTCOMPAVGTest
       }
     }
   }
-  void RunSpeedTest(jntcompavg_func test_impl) {
+  void RunSpeedTest(distwtdcompavg_func test_impl) {
     const int w = kMaxSize, h = kMaxSize;
     const int block_idx = GET_PARAM(2);
     const int bd = GET_PARAM(0);
@@ -387,24 +390,24 @@ class AV1HighBDJNTCOMPAVGTest
     const int in_w = block_size_wide[block_idx];
     const int in_h = block_size_high[block_idx];
 
-    JNT_COMP_PARAMS jnt_comp_params;
-    jnt_comp_params.use_jnt_comp_avg = 1;
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
 
     const int num_loops = 1000000000 / (in_w + in_h);
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
 
     for (int i = 0; i < num_loops; ++i)
-      aom_highbd_jnt_comp_avg_pred_c(
+      aom_highbd_dist_wtd_comp_avg_pred_c(
           CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8), in_w, in_h,
-          CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params);
+          CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params);
 
     aom_usec_timer_mark(&timer);
     const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("highbdjntcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+    printf("highbddistwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
            1000.0 * elapsed_time / num_loops);
 
     aom_usec_timer timer1;
@@ -412,26 +415,26 @@ class AV1HighBDJNTCOMPAVGTest
 
     for (int i = 0; i < num_loops; ++i)
       test_impl(CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8), in_w,
-                in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params);
+                in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params);
 
     aom_usec_timer_mark(&timer1);
     const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
-    printf("highbdjntcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+    printf("highbddistwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
            1000.0 * elapsed_time1 / num_loops);
   }
 
   libaom_test::ACMRandom rnd_;
-};  // class AV1HighBDJNTCOMPAVGTest
+};  // class AV1HighBDDISTWTDCOMPAVGTest
 
-class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
-    : public ::testing::TestWithParam<HighbdJNTCOMPAVGUPSAMPLEDParam> {
+class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
+    : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> {
  public:
-  ~AV1HighBDJNTCOMPAVGUPSAMPLEDTest() {}
+  ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
   void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
-  void RunCheckOutput(highbdjntcompavgupsampled_func test_impl) {
+  void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) {
     const int w = kMaxSize, h = kMaxSize;
     const int block_idx = GET_PARAM(2);
     const int bd = GET_PARAM(0);
@@ -448,8 +451,8 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
     const int in_w = block_size_wide[block_idx];
     const int in_h = block_size_high[block_idx];
 
-    JNT_COMP_PARAMS jnt_comp_params;
-    jnt_comp_params.use_jnt_comp_avg = 1;
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
     int sub_x_q3, sub_y_q3;
     int subpel_search;
     for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
@@ -458,30 +461,32 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
         for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
           for (int ii = 0; ii < 2; ii++) {
             for (int jj = 0; jj < 4; jj++) {
-              jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-              jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+              dist_wtd_comp_params.fwd_offset =
+                  quant_dist_lookup_table[ii][jj][0];
+              dist_wtd_comp_params.bck_offset =
+                  quant_dist_lookup_table[ii][jj][1];
 
               const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
               const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
 
-              aom_highbd_jnt_comp_avg_upsampled_pred_c(
+              aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
                   NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output),
                   CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
                   in_h, sub_x_q3, sub_y_q3,
                   CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd,
-                  &jnt_comp_params, subpel_search);
+                  &dist_wtd_comp_params, subpel_search);
               test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2),
                         CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c,
                         in_w, in_h, sub_x_q3, sub_y_q3,
                         CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c,
-                        in_w, bd, &jnt_comp_params, subpel_search);
+                        in_w, bd, &dist_wtd_comp_params, subpel_search);
 
               for (int i = 0; i < in_h; ++i) {
                 for (int j = 0; j < in_w; ++j) {
                   int idx = i * in_w + j;
                   ASSERT_EQ(output[idx], output2[idx])
                       << "Mismatch at unit tests for "
-                         "AV1HighBDJNTCOMPAVGUPSAMPLEDTest\n"
+                         "AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest\n"
                       << in_w << "x" << in_h << " Pixel mismatch at index "
                       << idx << " = (" << i << ", " << j
                       << "), sub pixel offset = (" << sub_y_q3 << ", "
@@ -494,7 +499,7 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
       }
     }
   }
-  void RunSpeedTest(highbdjntcompavgupsampled_func test_impl) {
+  void RunSpeedTest(highbddistwtdcompavgupsampled_func test_impl) {
     const int w = kMaxSize, h = kMaxSize;
     const int block_idx = GET_PARAM(2);
     const int bd = GET_PARAM(0);
@@ -511,11 +516,11 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
     const int in_w = block_size_wide[block_idx];
     const int in_h = block_size_high[block_idx];
 
-    JNT_COMP_PARAMS jnt_comp_params;
-    jnt_comp_params.use_jnt_comp_avg = 1;
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
     int sub_x_q3 = 0;
     int sub_y_q3 = 0;
     const int num_loops = 1000000000 / (in_w + in_h);
@@ -523,15 +528,16 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
     aom_usec_timer_start(&timer);
     int subpel_search = USE_8_TAPS;  // set to USE_4_TAPS to test 4-tap filter.
     for (int i = 0; i < num_loops; ++i)
-      aom_highbd_jnt_comp_avg_upsampled_pred_c(
+      aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
           NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output),
           CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3,
-          CONVERT_TO_BYTEPTR(ref8), in_w, bd, &jnt_comp_params, subpel_search);
+          CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params,
+          subpel_search);
 
     aom_usec_timer_mark(&timer);
     const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("highbdjntcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
-           1000.0 * elapsed_time / num_loops);
+    printf("highbddistwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w,
+           in_h, 1000.0 * elapsed_time / num_loops);
 
     aom_usec_timer timer1;
     aom_usec_timer_start(&timer1);
@@ -539,19 +545,19 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
     for (int i = 0; i < num_loops; ++i)
       test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2),
                 CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3,
-                CONVERT_TO_BYTEPTR(ref8), in_w, bd, &jnt_comp_params,
+                CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params,
                 subpel_search);
 
     aom_usec_timer_mark(&timer1);
     const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
-    printf("highbdjntcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w,
+    printf("highbddistwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w,
            in_h, 1000.0 * elapsed_time1 / num_loops);
   }
 
   libaom_test::ACMRandom rnd_;
-};  // class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
+};  // class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
 
-}  // namespace AV1JNTCOMPAVG
+}  // namespace AV1DISTWTDCOMPAVG
 }  // namespace libaom_test
 
 #endif  // AOM_TEST_COMP_AVG_PRED_TEST_H_
diff --git a/libaom/test/corner_match_test.cc b/libaom/test/corner_match_test.cc
index 58e3139..af2baa7 100644
--- a/libaom/test/corner_match_test.cc
+++ b/libaom/test/corner_match_test.cc
@@ -24,9 +24,13 @@ namespace AV1CornerMatch {
 
 using libaom_test::ACMRandom;
 
+typedef double (*ComputeCrossCorrFunc)(unsigned char *im1, int stride1, int x1,
+                                       int y1, unsigned char *im2, int stride2,
+                                       int x2, int y2);
+
 using ::testing::make_tuple;
 using ::testing::tuple;
-typedef tuple<int> CornerMatchParam;
+typedef tuple<int, ComputeCrossCorrFunc> CornerMatchParam;
 
 class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
  public:
@@ -36,19 +40,24 @@ class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
   virtual void TearDown();
 
  protected:
-  void RunCheckOutput();
+  void RunCheckOutput(int run_times);
+  ComputeCrossCorrFunc target_func;
 
   libaom_test::ACMRandom rnd_;
 };
 
 AV1CornerMatchTest::~AV1CornerMatchTest() {}
-void AV1CornerMatchTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+void AV1CornerMatchTest::SetUp() {
+  rnd_.Reset(ACMRandom::DeterministicSeed());
+  target_func = GET_PARAM(1);
+}
 void AV1CornerMatchTest::TearDown() { libaom_test::ClearSystemState(); }
 
-void AV1CornerMatchTest::RunCheckOutput() {
+void AV1CornerMatchTest::RunCheckOutput(int run_times) {
   const int w = 128, h = 128;
   const int num_iters = 10000;
   int i, j;
+  aom_usec_timer ref_timer, test_timer;
 
   uint8_t *input1 = new uint8_t[w * h];
   uint8_t *input2 = new uint8_t[w * h];
@@ -80,21 +89,54 @@ void AV1CornerMatchTest::RunCheckOutput() {
 
     double res_c =
         compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
-    double res_sse4 =
-        compute_cross_correlation_sse4_1(input1, w, x1, y1, input2, w, x2, y2);
+    double res_simd = target_func(input1, w, x1, y1, input2, w, x2, y2);
 
-    ASSERT_EQ(res_sse4, res_c);
-  }
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (j = 0; j < run_times; j++) {
+        compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
 
+      aom_usec_timer_start(&test_timer);
+      for (j = 0; j < run_times; j++) {
+        target_func(input1, w, x1, y1, input2, w, x2, y2);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%d\n",
+          elapsed_time_c, elapsed_time_simd,
+          (elapsed_time_c / elapsed_time_simd));
+    } else {
+      ASSERT_EQ(res_simd, res_c);
+    }
+  }
   delete[] input1;
   delete[] input2;
 }
 
-TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(); }
-
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1CornerMatchTest,
-                        ::testing::Values(make_tuple(0), make_tuple(1)));
-
+TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(1); }
+TEST_P(AV1CornerMatchTest, DISABLED_Speed) { RunCheckOutput(100000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, AV1CornerMatchTest,
+    ::testing::Values(make_tuple(0, compute_cross_correlation_sse4_1),
+                      make_tuple(1, compute_cross_correlation_sse4_1)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, AV1CornerMatchTest,
+    ::testing::Values(make_tuple(0, compute_cross_correlation_avx2),
+                      make_tuple(1, compute_cross_correlation_avx2)));
+#endif
 }  // namespace AV1CornerMatch
 
 }  // namespace test_libaom
diff --git a/libaom/test/dr_prediction_test.cc b/libaom/test/dr_prediction_test.cc
index a64d39b..4be8489 100644
--- a/libaom/test/dr_prediction_test.cc
+++ b/libaom/test/dr_prediction_test.cc
@@ -59,7 +59,9 @@ typedef void (*Z1_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
 template <Z1_Lbd fn>
 void z1_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                 const uint8_t *above, const uint8_t *left, int upsample_above,
-                int /*upsample_left*/, int dx, int dy, int /*bd*/) {
+                int upsample_left, int dx, int dy, int bd) {
+  (void)bd;
+  (void)upsample_left;
   fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy);
 }
 
@@ -69,7 +71,9 @@ typedef void (*Z2_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
 template <Z2_Lbd fn>
 void z2_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                 const uint8_t *above, const uint8_t *left, int upsample_above,
-                int upsample_left, int dx, int dy, int /*bd*/) {
+                int upsample_left, int dx, int dy, int bd) {
+  (void)bd;
+  (void)upsample_left;
   fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy);
 }
 
@@ -78,9 +82,10 @@ typedef void (*Z3_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                        int upsample_left, int dx, int dy);
 template <Z3_Lbd fn>
 void z3_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
-                const uint8_t *above, const uint8_t *left,
-                int /*upsample_above*/, int upsample_left, int dx, int dy,
-                int /*bd*/) {
+                const uint8_t *above, const uint8_t *left, int upsample_above,
+                int upsample_left, int dx, int dy, int bd) {
+  (void)bd;
+  (void)upsample_above;
   fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy);
 }
 
@@ -90,8 +95,10 @@ typedef void (*Z1_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
 template <Z1_Hbd fn>
 void z1_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
                     const uint16_t *above, const uint16_t *left,
-                    int upsample_above, int /*upsample_left*/, int dx, int dy,
+                    int upsample_above, int upsample_left, int dx, int dy,
                     int bd) {
+  (void)bd;
+  (void)upsample_left;
   fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy, bd);
 }
 
@@ -104,6 +111,7 @@ void z2_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
                     const uint16_t *above, const uint16_t *left,
                     int upsample_above, int upsample_left, int dx, int dy,
                     int bd) {
+  (void)bd;
   fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy,
      bd);
 }
@@ -114,8 +122,10 @@ typedef void (*Z3_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
 template <Z3_Hbd fn>
 void z3_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
                     const uint16_t *above, const uint16_t *left,
-                    int /*upsample_above*/, int upsample_left, int dx, int dy,
+                    int upsample_above, int upsample_left, int dx, int dy,
                     int bd) {
+  (void)bd;
+  (void)upsample_above;
   fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy, bd);
 }
 
@@ -135,7 +145,7 @@ struct DrPredFunc {
 template <typename Pixel, typename FuncType>
 class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
  protected:
-  static const int kMaxNumTests = 100000;
+  static const int kMaxNumTests = 10000;
   static const int kIterations = 10;
   static const int kDstStride = 64;
   static const int kDstSize = kDstStride * kDstStride;
@@ -171,6 +181,9 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
   void Predict(bool speedtest, int tx) {
     const int kNumTests = speedtest ? kMaxNumTests : 1;
     aom_usec_timer timer;
+    int tst_time = 0;
+
+    bd_ = params_.bit_depth;
 
     aom_usec_timer_start(&timer);
     for (int k = 0; k < kNumTests; ++k) {
@@ -180,25 +193,27 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
     aom_usec_timer_mark(&timer);
     const int ref_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
 
-    aom_usec_timer_start(&timer);
     if (params_.tst_fn) {
+      aom_usec_timer_start(&timer);
       for (int k = 0; k < kNumTests; ++k) {
         ASM_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_,
                                                 above_, left_, upsample_above_,
                                                 upsample_left_, dx_, dy_, bd_));
       }
+      aom_usec_timer_mark(&timer);
+      tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
     } else {
       for (int i = 0; i < kDstSize; ++i) {
         dst_ref_[i] = dst_tst_[i];
       }
     }
-    aom_usec_timer_mark(&timer);
-    const int tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
 
     OutputTimes(kNumTests, ref_time, tst_time, tx);
   }
 
   void RunTest(bool speedtest, bool needsaturation, int p_angle) {
+    bd_ = params_.bit_depth;
+
     if (needsaturation) {
       for (int i = 0; i < kBufSize; ++i) {
         above_data_[i] = left_data_[i] = (1 << bd_) - 1;
@@ -290,8 +305,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
 class LowbdDrPredTest : public DrPredTest<uint8_t, DrPred> {};
 
 TEST_P(LowbdDrPredTest, SaturatedValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    enable_upsample_ = iter & 1;
+  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
     for (int angle = start_angle_; angle < stop_angle_; ++angle) {
       dx_ = av1_get_dx(angle);
       dy_ = av1_get_dy(angle);
@@ -300,20 +314,6 @@ TEST_P(LowbdDrPredTest, SaturatedValues) {
   }
 }
 
-TEST_P(LowbdDrPredTest, DISABLED_Speed) {
-  const int angles[] = { 3, 45, 87 };
-  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
-    for (int i = 0; i < 3; ++i) {
-      const int angle = angles[i] + start_angle_;
-      dx_ = av1_get_dx(angle);
-      dy_ = av1_get_dy(angle);
-      printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
-             enable_upsample_, angle);
-      if (dx_ && dy_) RunTest(true, false, angle);
-    }
-  }
-}
-
 using ::testing::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
@@ -328,8 +328,7 @@ INSTANTIATE_TEST_CASE_P(
 class HighbdDrPredTest : public DrPredTest<uint16_t, DrPred_Hbd> {};
 
 TEST_P(HighbdDrPredTest, SaturatedValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    enable_upsample_ = iter & 1;
+  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
     for (int angle = start_angle_; angle < stop_angle_; ++angle) {
       dx_ = av1_get_dx(angle);
       dy_ = av1_get_dy(angle);
@@ -362,6 +361,46 @@ INSTANTIATE_TEST_CASE_P(
 
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(
+    AVX2, LowbdDrPredTest,
+    ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
+                                         &z1_wrapper<av1_dr_prediction_z1_avx2>,
+                                         AOM_BITS_8, kZ1Start),
+                      /* TODO(niva213@gmail.com): Re-enable this test after
+                      fixing valgrind issue: https://crbug.com/aomedia/2316
+                      DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
+                                         &z2_wrapper<av1_dr_prediction_z2_avx2>,
+                                         AOM_BITS_8, kZ2Start), */
+                      DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
+                                         &z3_wrapper<av1_dr_prediction_z3_avx2>,
+                                         AOM_BITS_8, kZ3Start)));
+
+TEST_P(LowbdDrPredTest, DISABLED_Speed) {
+  const int angles[] = { 3, 45, 87 };
+  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+    for (int i = 0; i < 3; ++i) {
+      const int angle = angles[i] + start_angle_;
+      dx_ = av1_get_dx(angle);
+      dy_ = av1_get_dy(angle);
+      printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
+             enable_upsample_, angle);
+      if (dx_ && dy_) RunTest(true, false, angle);
+    }
+  }
+}
+
+TEST_P(LowbdDrPredTest, OperationCheck) {
+  if (params_.tst_fn == NULL) return;
+  // const int angles[] = { 3, 45, 81, 87, 93, 100, 145, 187, 199, 260 };
+  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+    for (int angle = start_angle_; angle < stop_angle_; ++angle) {
+      dx_ = av1_get_dx(angle);
+      dy_ = av1_get_dy(angle);
+      if (dx_ && dy_) RunTest(false, false, angle);
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
     AVX2, HighbdDrPredTest,
     ::testing::Values(DrPredFunc<DrPred_Hbd>(
                           &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
@@ -375,7 +414,9 @@ INSTANTIATE_TEST_CASE_P(
                           &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
                           &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>,
                           AOM_BITS_12, kZ1Start),
-                      /*DrPredFunc<DrPred_Hbd>(
+                      /* TODO(niva213@gmail.com): Re-enable these tests after
+                      fixing valgrind issue: https://crbug.com/aomedia/2316
+                      DrPredFunc<DrPred_Hbd>(
                           &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
                           &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>,
                           AOM_BITS_8, kZ2Start),
diff --git a/libaom/test/edge_detect_test.cc b/libaom/test/edge_detect_test.cc
index 47466cb..77a731f 100644
--- a/libaom/test/edge_detect_test.cc
+++ b/libaom/test/edge_detect_test.cc
@@ -185,8 +185,9 @@ TEST_P(EdgeDetectBrightnessTest, DetectUniformBrightness) {
   const bool high_bd = GET_PARAM(3);
   const int bd = GET_PARAM(4);
 
-  ASSERT_EQ(0, av1_edge_exists(input_, stride_8tap(width), width, height,
-                               high_bd, bd));
+  ASSERT_EQ(
+      0, av1_edge_exists(input_, stride_8tap(width), width, height, high_bd, bd)
+             .magnitude);
 }
 
 INSTANTIATE_TEST_CASE_P(ImageBrightnessTests, EdgeDetectBrightnessTest,
@@ -245,9 +246,11 @@ TEST_P(EdgeDetectImageTest, BlackWhite) {
   free(orig);
   // Value should be between 556 and 560.
   ASSERT_LE(556, av1_edge_exists(padded, stride_8tap(width), width, height,
-                                 high_bd, bd));
+                                 high_bd, bd)
+                     .magnitude);
   ASSERT_GE(560, av1_edge_exists(padded, stride_8tap(width), width, height,
-                                 high_bd, bd));
+                                 high_bd, bd)
+                     .magnitude);
 
   free_pad_8tap(padded, width, high_bd);
 }
diff --git a/libaom/test/encode_api_test.cc b/libaom/test/encode_api_test.cc
index c26f572..235480a 100644
--- a/libaom/test/encode_api_test.cc
+++ b/libaom/test/encode_api_test.cc
@@ -50,7 +50,7 @@ TEST(EncodeAPI, InvalidParams) {
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
               aom_codec_enc_init(&enc, kCodecs[i], NULL, 0));
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_enc_config_default(kCodecs[i], &cfg, 1));
+              aom_codec_enc_config_default(kCodecs[i], &cfg, 2));
 
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(kCodecs[i], &cfg, 0));
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
diff --git a/libaom/test/end_to_end_test.cc b/libaom/test/end_to_end_test.cc
index 9aa44c6..6ea09a6 100644
--- a/libaom/test/end_to_end_test.cc
+++ b/libaom/test/end_to_end_test.cc
@@ -53,6 +53,13 @@ typedef struct {
   unsigned int profile;
 } TestVideoParam;
 
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " input_bit_depth:" << test_arg.input_bit_depth
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << "}";
+}
+
 const TestVideoParam kTestVectors[] = {
   { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
   { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
diff --git a/libaom/test/error_block_test.cc b/libaom/test/error_block_test.cc
index 353947c..3664ccf 100644
--- a/libaom/test/error_block_test.cc
+++ b/libaom/test/error_block_test.cc
@@ -156,6 +156,70 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
       << "First failed at test case " << first_failure;
 }
 
+TEST_P(ErrorBlockTest, DISABLED_Speed) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+  intptr_t block_size;
+  int64_t ssz;
+  int num_iters = 100000;
+  int64_t ref_ssz;
+  int k;
+  const int msb = bit_depth_ + 8 - 1;
+  for (int i = 0; i < 9; ++i) {
+    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
+    for (k = 0; k < 9; k++) {
+      for (int j = 0; j < block_size; j++) {
+        if (k < 5) {
+          if (rnd(2)) {
+            // Positive number
+            coeff[j] = rnd(1 << msb);
+            dqcoeff[j] = rnd(1 << msb);
+          } else {
+            // Negative number
+            coeff[j] = -rnd(1 << msb);
+            dqcoeff[j] = -rnd(1 << msb);
+          }
+        } else {
+          if (rnd(2)) {
+            // Positive number
+            coeff[j] = rnd(1 << 14);
+            dqcoeff[j] = rnd(1 << 14);
+          } else {
+            // Negative number
+            coeff[j] = -rnd(1 << 14);
+            dqcoeff[j] = -rnd(1 << 14);
+          }
+        }
+      }
+      aom_usec_timer ref_timer, test_timer;
+
+      aom_usec_timer_start(&ref_timer);
+      for (int i = 0; i < num_iters; ++i) {
+        ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (int i = 0; i < num_iters; ++i) {
+        error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_);
+      }
+      aom_usec_timer_mark(&test_timer);
+
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          " c_time=%d \t simd_time=%d \t "
+          "gain=%d \n",
+          elapsed_time_c, elapsed_time_simd,
+          (elapsed_time_c / elapsed_time_simd));
+    }
+  }
+}
+
 #if (HAVE_SSE2 || HAVE_AVX)
 using ::testing::make_tuple;
 
@@ -168,4 +232,17 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(&av1_highbd_block_error_sse2,
                                  &av1_highbd_block_error_c, AOM_BITS_8)));
 #endif  // HAVE_SSE2
+
+#if (HAVE_AVX2)
+using ::testing::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    AVX2, ErrorBlockTest,
+    ::testing::Values(make_tuple(&av1_highbd_block_error_avx2,
+                                 &av1_highbd_block_error_c, AOM_BITS_10),
+                      make_tuple(&av1_highbd_block_error_avx2,
+                                 &av1_highbd_block_error_c, AOM_BITS_12),
+                      make_tuple(&av1_highbd_block_error_avx2,
+                                 &av1_highbd_block_error_c, AOM_BITS_8)));
+#endif  // HAVE_AVX2
 }  // namespace
diff --git a/libaom/test/external_frame_buffer_test.cc b/libaom/test/external_frame_buffer_test.cc
index 6fcd9e7..4938a64 100644
--- a/libaom/test/external_frame_buffer_test.cc
+++ b/libaom/test/external_frame_buffer_test.cc
@@ -58,7 +58,7 @@ class ExternalFrameBufferList {
 
   // Searches the frame buffer list for a free frame buffer. Makes sure
   // that the frame buffer is at least |min_size| in bytes. Marks that the
-  // frame buffer is in use by libvpx. Finally sets |fb| to point to the
+  // frame buffer is in use by libaom. Finally sets |fb| to point to the
   // external frame buffer. Returns < 0 on an error.
   int GetFreeFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
     EXPECT_TRUE(fb != NULL);
@@ -114,9 +114,9 @@ class ExternalFrameBufferList {
     return 0;
   }
 
-  // Checks that the ximage data is contained within the external frame buffer
-  // private data passed back in the ximage.
-  void CheckXImageFrameBuffer(const aom_image_t *img) {
+  // Checks that the aom_image_t data is contained within the external frame
+  // buffer private data passed back in the aom_image_t.
+  void CheckImageFrameBuffer(const aom_image_t *img) {
     if (img->fb_priv != NULL) {
       const struct ExternalFrameBuffer *const ext_fb =
           reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
@@ -158,7 +158,7 @@ class ExternalFrameBufferList {
 
 #if CONFIG_WEBM_IO
 
-// Callback used by libvpx to request the application to return a frame
+// Callback used by libaom to request the application to return a frame
 // buffer of at least |min_size| in bytes.
 int get_aom_frame_buffer(void *user_priv, size_t min_size,
                          aom_codec_frame_buffer_t *fb) {
@@ -167,7 +167,7 @@ int get_aom_frame_buffer(void *user_priv, size_t min_size,
   return fb_list->GetFreeFrameBuffer(min_size, fb);
 }
 
-// Callback used by libvpx to tell the application that |fb| is not needed
+// Callback used by libaom to tell the application that |fb| is not needed
 // anymore.
 int release_aom_frame_buffer(void *user_priv, aom_codec_frame_buffer_t *fb) {
   ExternalFrameBufferList *const fb_list =
@@ -218,7 +218,7 @@ class ExternalFrameBufferMD5Test
       const libaom_test::CompressedVideoSource &video,
       libaom_test::Decoder *decoder) {
     if (num_buffers_ > 0 && video.frame_number() == 0) {
-      // Have libvpx use frame buffers we create.
+      // Have libaom use frame buffers we create.
       ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
       ASSERT_EQ(AOM_CODEC_OK,
                 decoder->SetFrameBufferFunctions(GetAV1FrameBuffer,
@@ -299,7 +299,7 @@ class ExternalFrameBufferMD5Test
 const char kAV1TestFile[] = "av1-1-b8-03-sizeup.mkv";
 const char kAV1NonRefTestFile[] = "av1-1-b8-01-size-226x226.ivf";
 
-// Class for testing passing in external frame buffers to libvpx.
+// Class for testing passing in external frame buffers to libaom.
 class ExternalFrameBufferTest : public ::testing::Test {
  protected:
   ExternalFrameBufferTest() : video_(NULL), decoder_(NULL), num_buffers_(0) {}
@@ -322,7 +322,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
     video_ = NULL;
   }
 
-  // Passes the external frame buffer information to libvpx.
+  // Passes the external frame buffer information to libaom.
   aom_codec_err_t SetFrameBufferFunctions(
       int num_buffers, aom_get_frame_buffer_cb_fn_t cb_get,
       aom_release_frame_buffer_cb_fn_t cb_release) {
@@ -359,7 +359,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
 
     // Get decompressed data
     while ((img = dec_iter.Next()) != NULL) {
-      fb_list_.CheckXImageFrameBuffer(img);
+      fb_list_.CheckImageFrameBuffer(img);
     }
   }
 
@@ -390,7 +390,7 @@ class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
 #endif  // CONFIG_WEBM_IO
 
 // This test runs through the set of test vectors, and decodes them.
-// Libvpx will call into the application to allocate a frame buffer when
+// Libaom will call into the application to allocate a frame buffer when
 // needed. The md5 checksums are computed for each frame in the video file.
 // If md5 checksums match the correct md5 data, then the test is passed.
 // Otherwise, the test failed.
diff --git a/libaom/test/fwd_kf_test.cc b/libaom/test/fwd_kf_test.cc
new file mode 100644
index 0000000..6c428d9
--- /dev/null
+++ b/libaom/test/fwd_kf_test.cc
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+typedef struct {
+  const int max_kf_dist;
+  const double psnr_thresh;
+} FwdKfTestParam;
+
+const FwdKfTestParam kTestParams[] = {
+  { 4, 37.3 },  { 6, 36.5 },  { 8, 35.8 },
+  { 12, 34.3 }, { 16, 34.3 }, { 18, 33.7 }
+};
+
+// Params: encoding mode and index into the kMaxKfDists array to control
+// kf-max-dist
+class ForwardKeyTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ForwardKeyTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        kf_max_dist_ind_(GET_PARAM(2)) {}
+  virtual ~ForwardKeyTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cpu_used_ = 2;
+    kf_max_dist_ = kTestParams[kf_max_dist_ind_].max_kf_dist;
+    psnr_threshold_ = kTestParams[kf_max_dist_ind_].psnr_thresh;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.g_lag_in_frames = 10;
+    cfg_.fwd_kf_enabled = 1;
+    cfg_.kf_max_dist = kf_max_dist_;
+    cfg_.g_threads = 0;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      }
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() { return psnr_threshold_; }
+
+  ::libaom_test::TestMode encoding_mode_;
+  const int kf_max_dist_ind_;
+  double psnr_threshold_;
+  int kf_max_dist_;
+  int cpu_used_;
+  int nframes_;
+  double psnr_;
+};
+
+TEST_P(ForwardKeyTest, ForwardKeyEncodeTest) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 20);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // TODO(sarahparker) Add functionality to assert the minimum number of
+  // keyframes were placed.
+  EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold())
+      << "kf max dist = " << kf_max_dist_;
+}
+
+AV1_INSTANTIATE_TEST_CASE(
+    ForwardKeyTest, ::testing::Values(::libaom_test::kTwoPassGood),
+    ::testing::Range(0, static_cast<int>(GTEST_ARRAY_SIZE_(kTestParams))));
+}  // namespace
diff --git a/libaom/test/gf_max_pyr_height_test.cc b/libaom/test/gf_max_pyr_height_test.cc
new file mode 100644
index 0000000..2d78493
--- /dev/null
+++ b/libaom/test/gf_max_pyr_height_test.cc
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+static const struct GFMaxPyrHeightTestParam {
+  int gf_max_pyr_height;
+  double psnr_thresh;
+} kTestParams[] = {
+  { 0, 34.75 }, { 1, 34.75 }, { 2, 35.25 }, { 3, 35.50 }, { 4, 35.50 },
+};
+
+// Compiler may decide to add some padding to the struct above for alignment,
+// which the gtest may try to print (on error for example). This would cause
+// valgrind to complain that the padding is uninitialized. To avoid that, we
+// provide our own function to print the struct.
+// This also makes '--gtest_list_tests' output more understandable.
+std::ostream &operator<<(std::ostream &os, const GFMaxPyrHeightTestParam &p) {
+  os << "GFMaxPyrHeightTestParam { "
+     << "gf_max_pyr_height = " << p.gf_max_pyr_height << ", "
+     << "psnr_thresh = " << p.psnr_thresh << " }";
+  return os;
+}
+
+// Params: encoding mode and GFMaxPyrHeightTestParam object.
+class GFMaxPyrHeightTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 GFMaxPyrHeightTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  GFMaxPyrHeightTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)) {
+    gf_max_pyr_height_ = GET_PARAM(2).gf_max_pyr_height;
+    psnr_threshold_ = GET_PARAM(2).psnr_thresh;
+  }
+  virtual ~GFMaxPyrHeightTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cpu_used_ = 4;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.g_lag_in_frames = 19;
+    cfg_.g_threads = 0;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      }
+      encoder->Control(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, gf_max_pyr_height_);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() { return psnr_threshold_; }
+
+  ::libaom_test::TestMode encoding_mode_;
+  double psnr_threshold_;
+  int gf_max_pyr_height_;
+  int cpu_used_;
+  int nframes_;
+  double psnr_;
+};
+
+TEST_P(GFMaxPyrHeightTest, EncodeAndVerifyPSNR) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 32);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold())
+      << "GF Max Pyramid Height = " << gf_max_pyr_height_;
+}
+
+AV1_INSTANTIATE_TEST_CASE(GFMaxPyrHeightTest,
+                          ::testing::Values(::libaom_test::kTwoPassGood),
+                          ::testing::ValuesIn(kTestParams));
+}  // namespace
diff --git a/libaom/test/hiprec_convolve_test_util.cc b/libaom/test/hiprec_convolve_test_util.cc
index f5bf56e..2672bce 100644
--- a/libaom/test/hiprec_convolve_test_util.cc
+++ b/libaom/test/hiprec_convolve_test_util.cc
@@ -31,7 +31,7 @@ static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
   hkernel[2] = hkernel[4] =
       WIENER_FILT_TAP2_MINV +
       rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
-  hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+  hkernel[3] = -(hkernel[0] + hkernel[1] + hkernel[2]);
   hkernel[7] = 0;
 
   vkernel[0] = vkernel[6] =
@@ -43,7 +43,7 @@ static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
   vkernel[2] = vkernel[4] =
       WIENER_FILT_TAP2_MINV +
       rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
-  vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
+  vkernel[3] = -(vkernel[0] + vkernel[1] + vkernel[2]);
   vkernel[7] = 0;
 }
 
diff --git a/libaom/test/horz_superres_test.cc b/libaom/test/horz_superres_test.cc
index 1627684..f2c2115 100644
--- a/libaom/test/horz_superres_test.cc
+++ b/libaom/test/horz_superres_test.cc
@@ -28,13 +28,8 @@ using ::testing::tuple;
 
 /* TESTING PARAMETERS */
 
-#define NUM_TEST_VIDEOS 3
-
 const int kBitrate = 40;
 
-// PSNR thresholds found by experiment
-const double kPSNRThresholds[] = { 26.0, 28.0, 20.0 };
-
 typedef struct {
   const char *filename;
   aom_img_fmt fmt;
@@ -42,18 +37,20 @@ typedef struct {
   unsigned int profile;
   unsigned int limit;
   unsigned int screen_content;
+  double psnr_threshold;
 } TestVideoParam;
 
 const TestVideoParam kTestVideoVectors[] = {
-  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0 },
-  { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0 },
-  { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1 },
+  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 26.0 },
+  { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 28.0 },
+  { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 20.0 },
+  // Image coding (single frame).
+  { "niklas_1280_720_30.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 1, 0, 32.0 },
 };
 
-// Superres modes tested
-// SUPERRES_QTHRESH is not included, as it has its own test
-const SUPERRES_MODE kSuperresModesNotQThresh[] = { SUPERRES_FIXED,
-                                                   SUPERRES_RANDOM };
+// Modes with extra params have their own tests.
+const SUPERRES_MODE kSuperresModesWithoutParams[] = { SUPERRES_RANDOM,
+                                                      SUPERRES_AUTO };
 
 // Superres denominators and superres kf denominators to be tested
 typedef tuple<int, int> SuperresDenominatorPair;
@@ -74,10 +71,8 @@ const SuperresQThresholdPair kSuperresQThresholds[] = {
 /* END (TESTING PARAMETERS) */
 
 // Test parameter list:
-//  <[needed for EncoderTest], test_video_idx_, superres_mode_,
-//  tuple(superres_denom_, superres_kf_denom_)>
-typedef tuple<const libaom_test::CodecFactory *, int, SUPERRES_MODE,
-              SuperresDenominatorPair>
+//  <[needed for EncoderTest], test_video_param_, superres_mode_>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam, SUPERRES_MODE>
     HorzSuperresTestParam;
 
 class HorzSuperresEndToEndTest
@@ -85,16 +80,113 @@ class HorzSuperresEndToEndTest
       public ::libaom_test::EncoderTest {
  protected:
   HorzSuperresEndToEndTest()
-      : EncoderTest(GET_PARAM(0)), test_video_idx_(GET_PARAM(1)),
-        superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {
-    test_video_param_ = kTestVideoVectors[test_video_idx_];
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {}
+
+  virtual ~HorzSuperresEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_Q;
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    // Set superres parameters
+    cfg_.rc_superres_mode = superres_mode_;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    frame_count_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    frame_count_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+
+      // Set cpu-used = 8 for speed
+      encoder->Control(AOME_SET_CPUUSED, 8);
+
+      // Test screen coding tools
+      if (test_video_param_.screen_content)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      else
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
 
-    SuperresDenominatorPair denoms = GET_PARAM(3);
+  double GetAveragePsnr() const {
+    if (frame_count_) return psnr_ / frame_count_;
+    return 0.0;
+  }
+
+  void DoTest() {
+    std::unique_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                test_video_param_.limit));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, test_video_param_.psnr_threshold)
+        << "superres_mode_ = " << superres_mode_;
+
+    EXPECT_EQ(test_video_param_.limit, frame_count_)
+        << "superres_mode_ = " << superres_mode_;
+  }
+
+  TestVideoParam test_video_param_;
+  SUPERRES_MODE superres_mode_;
+
+ private:
+  double psnr_;
+  unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest,
+                          ::testing::ValuesIn(kTestVideoVectors),
+                          ::testing::ValuesIn(kSuperresModesWithoutParams));
+
+// Test parameter list:
+//  <[needed for EncoderTest], test_video_param_, tuple(superres_denom_,
+//  superres_kf_denom_)>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
+              SuperresDenominatorPair>
+    HorzSuperresFixedTestParam;
+
+class HorzSuperresFixedEndToEndTest
+    : public ::testing::TestWithParam<HorzSuperresFixedTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  HorzSuperresFixedEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        superres_mode_(SUPERRES_FIXED), psnr_(0.0), frame_count_(0) {
+    SuperresDenominatorPair denoms = GET_PARAM(2);
     superres_denom_ = ::testing::get<0>(denoms);
     superres_kf_denom_ = ::testing::get<1>(denoms);
   }
 
-  virtual ~HorzSuperresEndToEndTest() {}
+  virtual ~HorzSuperresFixedEndToEndTest() {}
 
   virtual void SetUp() {
     InitializeConfig();
@@ -151,8 +243,6 @@ class HorzSuperresEndToEndTest
     return 0.0;
   }
 
-  double GetPsnrThreshold() { return kPSNRThresholds[test_video_idx_]; }
-
   void DoTest() {
     std::unique_ptr<libaom_test::VideoSource> video;
     video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
@@ -161,7 +251,7 @@ class HorzSuperresEndToEndTest
 
     ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
     const double psnr = GetAveragePsnr();
-    EXPECT_GT(psnr, GetPsnrThreshold())
+    EXPECT_GT(psnr, test_video_param_.psnr_threshold)
         << "superres_mode_ = " << superres_mode_
         << ", superres_denom_ = " << superres_denom_
         << ", superres_kf_denom_ = " << superres_kf_denom_;
@@ -172,7 +262,6 @@ class HorzSuperresEndToEndTest
         << ", superres_kf_denom_ = " << superres_kf_denom_;
   }
 
-  int test_video_idx_;
   TestVideoParam test_video_param_;
   SUPERRES_MODE superres_mode_;
   int superres_denom_;
@@ -183,17 +272,16 @@ class HorzSuperresEndToEndTest
   unsigned int frame_count_;
 };
 
-TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); }
+TEST_P(HorzSuperresFixedEndToEndTest, HorzSuperresFixedTestParam) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest,
-                          ::testing::Range(0, NUM_TEST_VIDEOS),
-                          ::testing::ValuesIn(kSuperresModesNotQThresh),
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresFixedEndToEndTest,
+                          ::testing::ValuesIn(kTestVideoVectors),
                           ::testing::ValuesIn(kSuperresDenominators));
 
 // Test parameter list:
-//  <[needed for EncoderTest], test_video_idx_, tuple(superres_denom_,
-//  superres_kf_denom_), tuple(superres_qthresh_,superres_kf_qthresh_)>
-typedef tuple<const libaom_test::CodecFactory *, int, SuperresDenominatorPair,
+//  <[needed for EncoderTest], test_video_param_,
+//  tuple(superres_qthresh_,superres_kf_qthresh_)>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
               SuperresQThresholdPair>
     HorzSuperresQThreshTestParam;
 
@@ -202,15 +290,9 @@ class HorzSuperresQThreshEndToEndTest
       public ::libaom_test::EncoderTest {
  protected:
   HorzSuperresQThreshEndToEndTest()
-      : EncoderTest(GET_PARAM(0)), test_video_idx_(GET_PARAM(1)),
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
         superres_mode_(SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
-    test_video_param_ = kTestVideoVectors[test_video_idx_];
-
-    SuperresDenominatorPair denoms = GET_PARAM(2);
-    superres_denom_ = ::testing::get<0>(denoms);
-    superres_kf_denom_ = ::testing::get<1>(denoms);
-
-    SuperresQThresholdPair qthresholds = GET_PARAM(3);
+    SuperresQThresholdPair qthresholds = GET_PARAM(2);
     superres_qthresh_ = ::testing::get<0>(qthresholds);
     superres_kf_qthresh_ = ::testing::get<1>(qthresholds);
   }
@@ -232,8 +314,6 @@ class HorzSuperresQThreshEndToEndTest
 
     // Set superres parameters
     cfg_.rc_superres_mode = superres_mode_;
-    cfg_.rc_superres_denominator = superres_denom_;
-    cfg_.rc_superres_kf_denominator = superres_kf_denom_;
     cfg_.rc_superres_qthresh = superres_qthresh_;
     cfg_.rc_superres_kf_qthresh = superres_kf_qthresh_;
   }
@@ -274,8 +354,6 @@ class HorzSuperresQThreshEndToEndTest
     return 0.0;
   }
 
-  double GetPsnrThreshold() { return kPSNRThresholds[test_video_idx_]; }
-
   void DoTest() {
     std::unique_ptr<libaom_test::VideoSource> video;
     video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
@@ -284,26 +362,19 @@ class HorzSuperresQThreshEndToEndTest
 
     ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
     const double psnr = GetAveragePsnr();
-    EXPECT_GT(psnr, GetPsnrThreshold())
+    EXPECT_GT(psnr, test_video_param_.psnr_threshold)
         << "superres_mode_ = " << superres_mode_
-        << ", superres_denom_ = " << superres_denom_
-        << ", superres_kf_denom_ = " << superres_kf_denom_
         << ", superres_qthresh_ = " << superres_qthresh_
         << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
 
     EXPECT_EQ(test_video_param_.limit, frame_count_)
         << "superres_mode_ = " << superres_mode_
-        << ", superres_denom_ = " << superres_denom_
-        << ", superres_kf_denom_ = " << superres_kf_denom_
         << ", superres_qthresh_ = " << superres_qthresh_
         << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
   }
 
-  int test_video_idx_;
   TestVideoParam test_video_param_;
   SUPERRES_MODE superres_mode_;
-  int superres_denom_;
-  int superres_kf_denom_;
   int superres_qthresh_;
   int superres_kf_qthresh_;
 
@@ -317,8 +388,7 @@ TEST_P(HorzSuperresQThreshEndToEndTest, HorzSuperresQThreshEndToEndPSNRTest) {
 }
 
 AV1_INSTANTIATE_TEST_CASE(HorzSuperresQThreshEndToEndTest,
-                          ::testing::Range(0, NUM_TEST_VIDEOS),
-                          ::testing::ValuesIn(kSuperresDenominators),
+                          ::testing::ValuesIn(kTestVideoVectors),
                           ::testing::ValuesIn(kSuperresQThresholds));
 
 }  // namespace
diff --git a/libaom/test/level_test.cc b/libaom/test/level_test.cc
new file mode 100644
index 0000000..e3b0ef1
--- /dev/null
+++ b/libaom/test/level_test.cc
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+// Speed settings tested
+static const int kCpuUsedVectors[] = {
+  1,
+  2,
+  3,
+  4,
+};
+
+class LevelTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  LevelTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), target_level_(31) {}
+
+  virtual ~LevelTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libaom_test::kRealTime) {
+      cfg_.g_lag_in_frames = 5;
+      cfg_.rc_end_usage = AOM_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = AOM_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level_);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      }
+    }
+  }
+
+  libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int target_level_;
+};
+
+TEST_P(LevelTest, TestTargetLevelApi) {
+  static const aom_codec_iface_t *codec = &aom_codec_av1_cx_algo;
+  aom_codec_ctx_t enc;
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(codec, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, codec, &cfg, 0));
+  for (int operating_point = 0; operating_point <= 32; ++operating_point) {
+    for (int level = 0; level <= 32; ++level) {
+      const int target_level = operating_point * 100 + level;
+      if ((level >= 0 && level <= 23) || level == 31 || operating_point > 31) {
+        EXPECT_EQ(AOM_CODEC_OK,
+                  aom_codec_control(&enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX,
+                                    target_level));
+      } else {
+        EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+                  aom_codec_control(&enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX,
+                                    target_level));
+      }
+    }
+  }
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST_P(LevelTest, TestTargetLevel19) {
+  std::unique_ptr<libaom_test::VideoSource> video;
+  video.reset(new libaom_test::Y4mVideoSource("park_joy_90p_8_420.y4m", 0, 10));
+  ASSERT_TRUE(video.get() != NULL);
+  // Level index 19 corresponding to level 6.3.
+  target_level_ = 19;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+AV1_INSTANTIATE_TEST_CASE(LevelTest,
+                          ::testing::Values(::libaom_test::kTwoPassGood),
+                          ::testing::ValuesIn(kCpuUsedVectors));
+}  // namespace
diff --git a/libaom/test/quantize_func_test.cc b/libaom/test/quantize_func_test.cc
index 8dee864..067a981 100644
--- a/libaom/test/quantize_func_test.cc
+++ b/libaom/test/quantize_func_test.cc
@@ -63,7 +63,7 @@ void highbd_quan64x64_wrapper(QUAN_PARAM_LIST) {
   HBD_QUAN_FUNC;
 }
 
-typedef enum { TYPE_B, TYPE_DC, TYPE_FP } QuantType;
+enum { TYPE_B, TYPE_DC, TYPE_FP } UENUM1BYTE(QuantType);
 
 using ::testing::tuple;
 typedef tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, QuantType, aom_bit_depth_t>
@@ -191,6 +191,13 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
     }
   }
 
+  void FillCoeffRandomRows(int num) {
+    FillCoeffZero();
+    for (int i = 0; i < num; ++i) {
+      coeff_[i] = GetRandomCoeff();
+    }
+  }
+
   void FillCoeffZero() { FillCoeff(0); }
 
   void FillCoeffConstant() {
@@ -287,28 +294,31 @@ TEST_P(QuantizeTest, DISABLED_Speed) {
   const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
   const int kNumTests = 5000000;
   aom_usec_timer timer, simd_timer;
+  int rows = tx_size_high[tx_size_];
+  int cols = tx_size_wide[tx_size_];
+  for (int cnt = 0; cnt <= rows; cnt++) {
+    FillCoeffRandomRows(cnt * cols);
+
+    aom_usec_timer_start(&timer);
+    for (int n = 0; n < kNumTests; ++n) {
+      quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift,
+                 qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan);
+    }
+    aom_usec_timer_mark(&timer);
 
-  FillCoeffRandom();
-
-  aom_usec_timer_start(&timer);
-  for (int n = 0; n < kNumTests; ++n) {
-    quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift,
-               qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan);
-  }
-  aom_usec_timer_mark(&timer);
+    aom_usec_timer_start(&simd_timer);
+    for (int n = 0; n < kNumTests; ++n) {
+      quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
+             dqcoeff, dequant, eob, sc->scan, sc->iscan);
+    }
+    aom_usec_timer_mark(&simd_timer);
 
-  aom_usec_timer_start(&simd_timer);
-  for (int n = 0; n < kNumTests; ++n) {
-    quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
-           dqcoeff, dequant, eob, sc->scan, sc->iscan);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    const int simd_elapsed_time =
+        static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
+    printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time,
+           simd_elapsed_time, (elapsed_time / simd_elapsed_time));
   }
-  aom_usec_timer_mark(&simd_timer);
-
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  const int simd_elapsed_time =
-      static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
-  printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time,
-         simd_elapsed_time, (elapsed_time / simd_elapsed_time));
 }
 
 using ::testing::make_tuple;
@@ -398,6 +408,24 @@ const QuantizeParam kQParamArraySSE2[] = {
              TX_32X32, TYPE_B, AOM_BITS_10),
   make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
              TX_32X32, TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+             TX_64X64, TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+             TX_64X64, TYPE_B, AOM_BITS_10),
+  make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+             TX_64X64, TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2,
+             TX_16X16, TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, TX_8X8,
+             TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, TX_4X4,
+             TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_32x32_adaptive_c,
+             &aom_quantize_b_32x32_adaptive_sse2, TX_32X16, TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_32x32_adaptive_c,
+             &aom_quantize_b_32x32_adaptive_sse2, TX_16X32, TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_32x32_adaptive_c,
+             &aom_quantize_b_32x32_adaptive_sse2, TX_32X32, TYPE_B, AOM_BITS_8)
 };
 
 INSTANTIATE_TEST_CASE_P(SSE2, QuantizeTest,
@@ -411,6 +439,9 @@ INSTANTIATE_TEST_CASE_P(
                                  TX_16X16, TYPE_B, AOM_BITS_8),
                       make_tuple(&aom_quantize_b_32x32_c,
                                  &aom_quantize_b_32x32_ssse3, TX_32X32, TYPE_B,
+                                 AOM_BITS_8),
+                      make_tuple(&aom_quantize_b_64x64_c,
+                                 &aom_quantize_b_64x64_ssse3, TX_64X64, TYPE_B,
                                  AOM_BITS_8)));
 
 #endif  // HAVE_SSSE3 && ARCH_X86_64
diff --git a/libaom/test/resize_test.cc b/libaom/test/resize_test.cc
index b270b83..39e7d1b 100644
--- a/libaom/test/resize_test.cc
+++ b/libaom/test/resize_test.cc
@@ -297,7 +297,7 @@ class ResizeInternalTestLarge : public ResizeTest {
 
   virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
-    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.5);
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 3.0);
   }
 
 #if WRITE_COMPRESSED_STREAM
@@ -374,6 +374,7 @@ class ResizeRealtimeTest
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_AQ_MODE, 3);
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
     }
 
     if (change_bitrate_ && video->frame() == 120) {
diff --git a/libaom/test/rt_end_to_end_test.cc b/libaom/test/rt_end_to_end_test.cc
new file mode 100644
index 0000000..9c3e96b
--- /dev/null
+++ b/libaom/test/rt_end_to_end_test.cc
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+
+// List of psnr thresholds for speed settings 0-8
+const double kPsnrThreshold[9] = { 36.9, 36.9, 36.85, 36.8, 36.6,
+                                   36.4, 36.0, 35.5,  35.0 };
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " input_bit_depth:" << test_arg.input_bit_depth
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << "}";
+}
+
+// TODO(kyslov): Add more test vectors
+const TestVideoParam kTestVectors[] = {
+  { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+};
+
+// Speed settings tested
+const int kCpuUsedVectors[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
+
+class RTEndToEndTest
+    : public ::libaom_test::CodecTestWith2Params<TestVideoParam, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  RTEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
+
+  virtual ~RTEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kRealTime);
+
+    cfg_.g_usage = 1;  // TODO(kyslov): Move it to encode_test_driver.cc
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() { return kPsnrThreshold[cpu_used_]; }
+
+  void DoTest() {
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    std::unique_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                kFrames));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold()) << "cpu used = " << cpu_used_;
+  }
+
+  TestVideoParam test_video_param_;
+  int cpu_used_;
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+};
+
+class RTEndToEndTestLarge : public RTEndToEndTest {};
+
+TEST_P(RTEndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); }
+
+TEST_P(RTEndToEndTest, EndtoEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(RTEndToEndTestLarge,
+                          ::testing::ValuesIn(kTestVectors),
+                          ::testing::ValuesIn(kCpuUsedVectors));
+
+AV1_INSTANTIATE_TEST_CASE(RTEndToEndTest, ::testing::Values(kTestVectors[0]),
+                          ::testing::Values(kCpuUsedVectors[8]));
+}  // namespace
diff --git a/libaom/test/sad_test.cc b/libaom/test/sad_test.cc
index 845fe79..87dbb33 100644
--- a/libaom/test/sad_test.cc
+++ b/libaom/test/sad_test.cc
@@ -35,22 +35,25 @@ typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *second_pred);
 typedef ::testing::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
 
-typedef void (*JntCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
-                               int width, int height, const uint8_t *ref,
-                               int ref_stride,
-                               const JNT_COMP_PARAMS *jcp_param);
-typedef ::testing::tuple<int, int, JntCompAvgFunc, int> JntCompAvgParam;
-
-typedef unsigned int (*JntSadMxhFunc)(const uint8_t *src_ptr, int src_stride,
-                                      const uint8_t *ref_ptr, int ref_stride,
-                                      int width, int height);
-typedef ::testing::tuple<int, int, JntSadMxhFunc, int> JntSadMxhParam;
-
-typedef uint32_t (*JntSadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *ref_ptr, int ref_stride,
-                                     const uint8_t *second_pred,
-                                     const JNT_COMP_PARAMS *jcp_param);
-typedef ::testing::tuple<int, int, JntSadMxNAvgFunc, int> JntSadMxNAvgParam;
+typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
+                                   int width, int height, const uint8_t *ref,
+                                   int ref_stride,
+                                   const DIST_WTD_COMP_PARAMS *jcp_param);
+typedef ::testing::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam;
+
+typedef unsigned int (*DistWtdSadMxhFunc)(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride, int width,
+                                          int height);
+typedef ::testing::tuple<int, int, DistWtdSadMxhFunc, int> DistWtdSadMxhParam;
+
+typedef uint32_t (*DistWtdSadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
+                                         const uint8_t *second_pred,
+                                         const DIST_WTD_COMP_PARAMS *jcp_param);
+typedef ::testing::tuple<int, int, DistWtdSadMxNAvgFunc, int>
+    DistWtdSadMxNAvgParam;
 
 typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *const ref_ptr[], int ref_stride,
@@ -203,7 +206,7 @@ class SADTestBase : public ::testing::Test {
     return sad;
   }
 
-  void ReferenceJntCompAvg(int block_idx) {
+  void ReferenceDistWtdCompAvg(int block_idx) {
     const uint8_t *const reference8 = GetReference(block_idx);
     const uint8_t *const second_pred8 = second_pred_;
     uint8_t *const comp_pred8 = comp_pred_;
@@ -228,7 +231,7 @@ class SADTestBase : public ::testing::Test {
     }
   }
 
-  unsigned int ReferenceJntSADavg(int block_idx) {
+  unsigned int ReferenceDistWtdSADavg(int block_idx) {
     unsigned int sad = 0;
     const uint8_t *const reference8 = GetReference(block_idx);
     const uint8_t *const source8 = source_data_;
@@ -305,7 +308,7 @@ class SADTestBase : public ::testing::Test {
   static uint8_t *comp_pred_test_;
   static uint8_t *comp_pred8_test_;
   static uint16_t *comp_pred16_test_;
-  JNT_COMP_PARAMS jcp_param_;
+  DIST_WTD_COMP_PARAMS jcp_param_;
 
   ACMRandom rnd_;
 };
@@ -391,13 +394,15 @@ class SADavgTest : public ::testing::WithParamInterface<SadMxNAvgParam>,
   }
 };
 
-class JntCompAvgTest : public ::testing::WithParamInterface<JntCompAvgParam>,
-                       public SADTestBase {
+class DistWtdCompAvgTest
+    : public ::testing::WithParamInterface<DistWtdCompAvgParam>,
+      public SADTestBase {
  public:
-  JntCompAvgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+  DistWtdCompAvgTest()
+      : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
 
  protected:
-  void jnt_comp_avg(int block_idx) {
+  void dist_wtd_comp_avg(int block_idx) {
     const uint8_t *const reference = GetReference(block_idx);
 
     ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
@@ -411,8 +416,8 @@ class JntCompAvgTest : public ::testing::WithParamInterface<JntCompAvgParam>,
         jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
         jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
 
-        ReferenceJntCompAvg(0);
-        jnt_comp_avg(0);
+        ReferenceDistWtdCompAvg(0);
+        dist_wtd_comp_avg(0);
 
         for (int y = 0; y < height_; ++y)
           for (int x = 0; x < width_; ++x)
@@ -423,10 +428,10 @@ class JntCompAvgTest : public ::testing::WithParamInterface<JntCompAvgParam>,
   }
 };
 
-class JntSADTest : public ::testing::WithParamInterface<JntSadMxhParam>,
-                   public SADTestBase {
+class DistWtdSADTest : public ::testing::WithParamInterface<DistWtdSadMxhParam>,
+                       public SADTestBase {
  public:
-  JntSADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+  DistWtdSADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
 
  protected:
   unsigned int SAD(int block_idx) {
@@ -455,13 +460,14 @@ class JntSADTest : public ::testing::WithParamInterface<JntSadMxhParam>,
   }
 };
 
-class JntSADavgTest : public ::testing::WithParamInterface<JntSadMxNAvgParam>,
-                      public SADTestBase {
+class DistWtdSADavgTest
+    : public ::testing::WithParamInterface<DistWtdSadMxNAvgParam>,
+      public SADTestBase {
  public:
-  JntSADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+  DistWtdSADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
 
  protected:
-  unsigned int jnt_SAD_avg(int block_idx) {
+  unsigned int dist_wtd_SAD_avg(int block_idx) {
     unsigned int ret;
     const uint8_t *const reference = GetReference(block_idx);
 
@@ -477,8 +483,8 @@ class JntSADavgTest : public ::testing::WithParamInterface<JntSadMxNAvgParam>,
         jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
         jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
 
-        const unsigned int reference_sad = ReferenceJntSADavg(0);
-        const unsigned int exp_sad = jnt_SAD_avg(0);
+        const unsigned int reference_sad = ReferenceDistWtdSADavg(0);
+        const unsigned int exp_sad = dist_wtd_SAD_avg(0);
 
         ASSERT_EQ(reference_sad, exp_sad);
       }
@@ -608,19 +614,19 @@ TEST_P(SADavgTest, ShortSrc) {
   source_stride_ = tmp_stride;
 }
 
-TEST_P(JntCompAvgTest, MaxRef) {
+TEST_P(DistWtdCompAvgTest, MaxRef) {
   FillConstant(reference_data_, reference_stride_, mask_);
   FillConstant(second_pred_, width_, 0);
   CheckCompAvg();
 }
 
-TEST_P(JntCompAvgTest, MaxSecondPred) {
+TEST_P(DistWtdCompAvgTest, MaxSecondPred) {
   FillConstant(reference_data_, reference_stride_, 0);
   FillConstant(second_pred_, width_, mask_);
   CheckCompAvg();
 }
 
-TEST_P(JntCompAvgTest, ShortRef) {
+TEST_P(DistWtdCompAvgTest, ShortRef) {
   const int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
   FillRandom(reference_data_, reference_stride_);
@@ -629,7 +635,7 @@ TEST_P(JntCompAvgTest, ShortRef) {
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(JntCompAvgTest, UnalignedRef) {
+TEST_P(DistWtdCompAvgTest, UnalignedRef) {
   // The reference frame, but not the source frame, may be unaligned for
   // certain types of searches.
   const int tmp_stride = reference_stride_;
@@ -640,19 +646,19 @@ TEST_P(JntCompAvgTest, UnalignedRef) {
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(JntSADTest, MaxRef) {
+TEST_P(DistWtdSADTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(reference_data_, reference_stride_, mask_);
   CheckSAD();
 }
 
-TEST_P(JntSADTest, MaxSrc) {
+TEST_P(DistWtdSADTest, MaxSrc) {
   FillConstant(source_data_, source_stride_, mask_);
   FillConstant(reference_data_, reference_stride_, 0);
   CheckSAD();
 }
 
-TEST_P(JntSADTest, ShortRef) {
+TEST_P(DistWtdSADTest, ShortRef) {
   const int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
@@ -661,7 +667,7 @@ TEST_P(JntSADTest, ShortRef) {
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(JntSADTest, UnalignedRef) {
+TEST_P(DistWtdSADTest, UnalignedRef) {
   // The reference frame, but not the source frame, may be unaligned for
   // certain types of searches.
   const int tmp_stride = reference_stride_;
@@ -672,7 +678,7 @@ TEST_P(JntSADTest, UnalignedRef) {
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(JntSADTest, ShortSrc) {
+TEST_P(DistWtdSADTest, ShortSrc) {
   const int tmp_stride = source_stride_;
   source_stride_ >>= 1;
   int test_count = 2000;
@@ -685,20 +691,20 @@ TEST_P(JntSADTest, ShortSrc) {
   source_stride_ = tmp_stride;
 }
 
-TEST_P(JntSADavgTest, MaxRef) {
+TEST_P(DistWtdSADavgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(reference_data_, reference_stride_, mask_);
   FillConstant(second_pred_, width_, 0);
   CheckSAD();
 }
-TEST_P(JntSADavgTest, MaxSrc) {
+TEST_P(DistWtdSADavgTest, MaxSrc) {
   FillConstant(source_data_, source_stride_, mask_);
   FillConstant(reference_data_, reference_stride_, 0);
   FillConstant(second_pred_, width_, 0);
   CheckSAD();
 }
 
-TEST_P(JntSADavgTest, ShortRef) {
+TEST_P(DistWtdSADavgTest, ShortRef) {
   const int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
@@ -708,7 +714,7 @@ TEST_P(JntSADavgTest, ShortRef) {
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(JntSADavgTest, UnalignedRef) {
+TEST_P(DistWtdSADavgTest, UnalignedRef) {
   // The reference frame, but not the source frame, may be unaligned for
   // certain types of searches.
   const int tmp_stride = reference_stride_;
@@ -720,7 +726,7 @@ TEST_P(JntSADavgTest, UnalignedRef) {
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(JntSADavgTest, ShortSrc) {
+TEST_P(DistWtdSADavgTest, ShortSrc) {
   const int tmp_stride = source_stride_;
   source_stride_ >>= 1;
   int test_count = 2000;
@@ -947,47 +953,48 @@ const SadMxNAvgParam avg_c_tests[] = {
 INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
 // TODO(chengchen): add highbd tests
-const JntCompAvgParam jnt_comp_avg_c_tests[] = {
-  make_tuple(128, 128, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(128, 64, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(64, 128, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(64, 64, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(64, 32, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(32, 64, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(32, 32, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(32, 16, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(16, 32, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(16, 16, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(16, 8, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(8, 16, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(8, 8, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(8, 4, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(4, 8, &aom_jnt_comp_avg_pred_c, -1),
-  make_tuple(4, 4, &aom_jnt_comp_avg_pred_c, -1),
+const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = {
+  make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
 };
 
-INSTANTIATE_TEST_CASE_P(C, JntCompAvgTest,
-                        ::testing::ValuesIn(jnt_comp_avg_c_tests));
-
-const JntSadMxNAvgParam jnt_avg_c_tests[] = {
-  make_tuple(128, 128, &aom_jnt_sad128x128_avg_c, -1),
-  make_tuple(128, 64, &aom_jnt_sad128x64_avg_c, -1),
-  make_tuple(64, 128, &aom_jnt_sad64x128_avg_c, -1),
-  make_tuple(64, 64, &aom_jnt_sad64x64_avg_c, -1),
-  make_tuple(64, 32, &aom_jnt_sad64x32_avg_c, -1),
-  make_tuple(32, 64, &aom_jnt_sad32x64_avg_c, -1),
-  make_tuple(32, 32, &aom_jnt_sad32x32_avg_c, -1),
-  make_tuple(32, 16, &aom_jnt_sad32x16_avg_c, -1),
-  make_tuple(16, 32, &aom_jnt_sad16x32_avg_c, -1),
-  make_tuple(16, 16, &aom_jnt_sad16x16_avg_c, -1),
-  make_tuple(16, 8, &aom_jnt_sad16x8_avg_c, -1),
-  make_tuple(8, 16, &aom_jnt_sad8x16_avg_c, -1),
-  make_tuple(8, 8, &aom_jnt_sad8x8_avg_c, -1),
-  make_tuple(8, 4, &aom_jnt_sad8x4_avg_c, -1),
-  make_tuple(4, 8, &aom_jnt_sad4x8_avg_c, -1),
-  make_tuple(4, 4, &aom_jnt_sad4x4_avg_c, -1),
+INSTANTIATE_TEST_CASE_P(C, DistWtdCompAvgTest,
+                        ::testing::ValuesIn(dist_wtd_comp_avg_c_tests));
+
+const DistWtdSadMxNAvgParam dist_wtd_avg_c_tests[] = {
+  make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_c, -1),
+  make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_c, -1),
+  make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_c, -1),
+  make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_c, -1),
+  make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_c, -1),
+  make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_c, -1),
+  make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_c, -1),
+  make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_c, -1),
+  make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_c, -1),
+  make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_c, -1),
+  make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_c, -1),
+  make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_c, -1),
+  make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_c, -1),
+  make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_c, -1),
+  make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_c, -1),
+  make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_c, -1),
 };
-INSTANTIATE_TEST_CASE_P(C, JntSADavgTest, ::testing::ValuesIn(jnt_avg_c_tests));
+INSTANTIATE_TEST_CASE_P(C, DistWtdSADavgTest,
+                        ::testing::ValuesIn(dist_wtd_avg_c_tests));
 
 const SadMxNx4Param x4d_c_tests[] = {
   make_tuple(128, 128, &aom_sad128x128x4d_c, -1),
@@ -1251,7 +1258,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
 #if HAVE_SSSE3
 // Note: These are named sse2, but part of ssse3 file and only built and linked
 // when ssse3 is enabled.
-const JntSadMxhParam jnt_sad_sse2_tests[] = {
+const DistWtdSadMxhParam dist_wtd_sad_sse2_tests[] = {
   make_tuple(4, 4, &aom_sad4xh_sse2, -1),
   make_tuple(4, 8, &aom_sad4xh_sse2, -1),
   make_tuple(8, 4, &aom_sad8xh_sse2, -1),
@@ -1275,8 +1282,8 @@ const JntSadMxhParam jnt_sad_sse2_tests[] = {
   make_tuple(16, 64, &aom_sad16xh_sse2, -1),
   make_tuple(64, 16, &aom_sad64xh_sse2, -1),
 };
-INSTANTIATE_TEST_CASE_P(SSE2, JntSADTest,
-                        ::testing::ValuesIn(jnt_sad_sse2_tests));
+INSTANTIATE_TEST_CASE_P(SSE2, DistWtdSADTest,
+                        ::testing::ValuesIn(dist_wtd_sad_sse2_tests));
 
 #endif  // HAVE_SSSE3
 
@@ -1285,49 +1292,49 @@ INSTANTIATE_TEST_CASE_P(SSE2, JntSADTest,
 #endif  // HAVE_SSE3
 
 #if HAVE_SSSE3
-const JntCompAvgParam jnt_comp_avg_ssse3_tests[] = {
-  make_tuple(128, 128, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(128, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(64, 128, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(64, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(64, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(32, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(32, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(32, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(8, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(8, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(8, 4, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(4, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(4, 4, &aom_jnt_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
+const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = {
+  make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
 };
 
-INSTANTIATE_TEST_CASE_P(SSSE3, JntCompAvgTest,
-                        ::testing::ValuesIn(jnt_comp_avg_ssse3_tests));
-
-const JntSadMxNAvgParam jnt_avg_ssse3_tests[] = {
-  make_tuple(128, 128, &aom_jnt_sad128x128_avg_ssse3, -1),
-  make_tuple(128, 64, &aom_jnt_sad128x64_avg_ssse3, -1),
-  make_tuple(64, 128, &aom_jnt_sad64x128_avg_ssse3, -1),
-  make_tuple(64, 64, &aom_jnt_sad64x64_avg_ssse3, -1),
-  make_tuple(64, 32, &aom_jnt_sad64x32_avg_ssse3, -1),
-  make_tuple(32, 64, &aom_jnt_sad32x64_avg_ssse3, -1),
-  make_tuple(32, 32, &aom_jnt_sad32x32_avg_ssse3, -1),
-  make_tuple(32, 16, &aom_jnt_sad32x16_avg_ssse3, -1),
-  make_tuple(16, 32, &aom_jnt_sad16x32_avg_ssse3, -1),
-  make_tuple(16, 16, &aom_jnt_sad16x16_avg_ssse3, -1),
-  make_tuple(16, 8, &aom_jnt_sad16x8_avg_ssse3, -1),
-  make_tuple(8, 16, &aom_jnt_sad8x16_avg_ssse3, -1),
-  make_tuple(8, 8, &aom_jnt_sad8x8_avg_ssse3, -1),
-  make_tuple(8, 4, &aom_jnt_sad8x4_avg_ssse3, -1),
-  make_tuple(4, 8, &aom_jnt_sad4x8_avg_ssse3, -1),
-  make_tuple(4, 4, &aom_jnt_sad4x4_avg_ssse3, -1),
+INSTANTIATE_TEST_CASE_P(SSSE3, DistWtdCompAvgTest,
+                        ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests));
+
+const DistWtdSadMxNAvgParam dist_wtd_avg_ssse3_tests[] = {
+  make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_ssse3, -1),
+  make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_ssse3, -1),
+  make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_ssse3, -1),
+  make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_ssse3, -1),
+  make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_ssse3, -1),
+  make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_ssse3, -1),
+  make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_ssse3, -1),
+  make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_ssse3, -1),
+  make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_ssse3, -1),
+  make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_ssse3, -1),
+  make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_ssse3, -1),
+  make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_ssse3, -1),
+  make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_ssse3, -1),
+  make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_ssse3, -1),
+  make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_ssse3, -1),
+  make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_ssse3, -1),
 };
-INSTANTIATE_TEST_CASE_P(SSSE3, JntSADavgTest,
-                        ::testing::ValuesIn(jnt_avg_ssse3_tests));
+INSTANTIATE_TEST_CASE_P(SSSE3, DistWtdSADavgTest,
+                        ::testing::ValuesIn(dist_wtd_avg_ssse3_tests));
 #endif  // HAVE_SSSE3
 
 #if HAVE_SSE4_1
diff --git a/libaom/test/sum_squares_test.cc b/libaom/test/sum_squares_test.cc
index cb518c8..f26a646 100644
--- a/libaom/test/sum_squares_test.cc
+++ b/libaom/test/sum_squares_test.cc
@@ -255,7 +255,7 @@ class SSETest : public ::testing::TestWithParam<SSETestParam> {
     aom_free(src_);
     aom_free(ref_);
   }
-  void RunTest(int isRandom, int width, int height);
+  void RunTest(int isRandom, int width, int height, int run_times);
 
   void GenRandomData(int width, int height, int stride) {
     uint16_t *pSrc = (uint16_t *)src_;
@@ -298,8 +298,9 @@ class SSETest : public ::testing::TestWithParam<SSETestParam> {
   ACMRandom rnd_;
 };
 
-void SSETest::RunTest(int isRandom, int width, int height) {
+void SSETest::RunTest(int isRandom, int width, int height, int run_times) {
   int failed = 0;
+  aom_usec_timer ref_timer, test_timer;
   for (int k = 0; k < 3; k++) {
     int stride = 4 << rnd_(7);  // Up to 256 stride
     while (stride < width) {    // Make sure it's valid
@@ -326,31 +327,58 @@ void SSETest::RunTest(int isRandom, int width, int height) {
       pRef = CONVERT_TO_BYTEPTR(ref_);
     }
     res_ref = params_.ref_func(pSrc, stride, pRef, stride, width, height);
+    res_tst = params_.tst_func(pSrc, stride, pRef, stride, width, height);
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.ref_func(pSrc, stride, pRef, stride, width, height);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
 
-    ASM_REGISTER_STATE_CHECK(
-        res_tst = params_.tst_func(pSrc, stride, pRef, stride, width, height));
-
-    if (!failed) {
-      failed = res_ref != res_tst;
-      EXPECT_EQ(res_ref, res_tst)
-          << "Error:" << (isHbd_ ? "hbd " : " ") << k << " SSE Test [" << width
-          << "x" << height << "] C output does not match optimized output.";
+      aom_usec_timer_start(&test_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.tst_func(pSrc, stride, pRef, stride, width, height);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%d\n",
+          elapsed_time_c, elapsed_time_simd,
+          (elapsed_time_c / elapsed_time_simd));
+    } else {
+      if (!failed) {
+        failed = res_ref != res_tst;
+        EXPECT_EQ(res_ref, res_tst)
+            << "Error:" << (isHbd_ ? "hbd " : " ") << k << " SSE Test ["
+            << width << "x" << height
+            << "] C output does not match optimized output.";
+      }
     }
   }
 }
 
 TEST_P(SSETest, OperationCheck) {
   for (int height = 4; height <= 128; height += 4) {
-    RunTest(1, width_, height);  // GenRandomData
+    RunTest(1, width_, height, 1);  // GenRandomData
   }
 }
 
 TEST_P(SSETest, ExtremeValues) {
   for (int height = 4; height <= 128; height += 4) {
-    RunTest(0, width_, height);
+    RunTest(0, width_, height, 1);
   }
 }
 
+TEST_P(SSETest, DISABLED_Speed) {
+  for (int height = 4; height <= 128; height += 4) {
+    RunTest(1, width_, height, 100);
+  }
+}
 #if HAVE_SSE4_1
 TestSSEFuncs sse_sse4[] = { TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1),
                             TestSSEFuncs(&aom_highbd_sse_c,
diff --git a/libaom/test/test-data.sha1 b/libaom/test/test-data.sha1
index 95342a8..bd63206 100644
--- a/libaom/test/test-data.sha1
+++ b/libaom/test/test-data.sha1
@@ -532,3 +532,9 @@ e94687eb0e90179b3800b6d5e11eb7e9bfb34eec *av1-1-b8-22-svc-L1T2.ivf
 2bc12b16385ea14323bc79607fb8dfbd7edaf8ef *av1-1-b8-22-svc-L1T2.ivf.md5
 32ef2f14ee9cb11a24a22934f4c065e926e5d236 *av1-1-b8-22-svc-L2T2.ivf
 f476a10ff06d750129f8229755d51e17ff141b2a *av1-1-b8-22-svc-L2T2.ivf.md5
+afca5502a489692b0a3c120370b0f43b8fc572a1 *av1-1-b8-04-cdfupdate.ivf
+13b9423155a08d5e3a2fd9ae4a973bb046718cdf *av1-1-b8-04-cdfupdate.ivf.md5
+f064290d7fcd3b3de19020e8aec6c43c88d3a505 *av1-1-b8-05-mv.ivf
+bff316e63ded5559116bdc2fa4aa97ad7b1a1761 *av1-1-b8-05-mv.ivf.md5
+b48a717c7c003b8dd23c3c2caed1ac673380fdb3 *av1-1-b8-06-mfmv.ivf
+1424e3cb53e00eb56b94f4c725826274212c42b6 *av1-1-b8-06-mfmv.ivf.md5
diff --git a/libaom/test/test.cmake b/libaom/test/test.cmake
index 12f2319..a44737a 100644
--- a/libaom/test/test.cmake
+++ b/libaom/test/test.cmake
@@ -64,10 +64,14 @@ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
             "${AOM_ROOT}/test/encode_test_driver.cc"
             "${AOM_ROOT}/test/encode_test_driver.h"
             "${AOM_ROOT}/test/end_to_end_test.cc"
+            "${AOM_ROOT}/test/fwd_kf_test.cc"
+            "${AOM_ROOT}/test/gf_max_pyr_height_test.cc"
+            "${AOM_ROOT}/test/rt_end_to_end_test.cc"
             "${AOM_ROOT}/test/error_resilience_test.cc"
             "${AOM_ROOT}/test/frame_size_tests.cc"
             "${AOM_ROOT}/test/horz_superres_test.cc"
             "${AOM_ROOT}/test/i420_video_source.h"
+            "${AOM_ROOT}/test/level_test.cc"
             "${AOM_ROOT}/test/lossless_test.cc"
             "${AOM_ROOT}/test/monochrome_test.cc"
             "${AOM_ROOT}/test/qm_test.cc"
@@ -120,7 +124,8 @@ if(NOT BUILD_SHARED_LIBS)
                 "${AOM_ROOT}/test/film_grain_table_test.cc"
                 "${AOM_ROOT}/test/segment_binarization_sync.cc"
                 "${AOM_ROOT}/test/superframe_test.cc"
-                "${AOM_ROOT}/test/tile_independence_test.cc")
+                "${AOM_ROOT}/test/tile_independence_test.cc"
+                "${AOM_ROOT}/test/yuv_temporal_filter_test.cc")
   endif()
 
   list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON
@@ -233,13 +238,6 @@ if(ENABLE_TESTS)
               "make sure it's in your PATH.")
   endif()
 
-  if(MSVC) # Force static run time to avoid collisions with googletest.
-    include("${AOM_ROOT}/build/cmake/msvc_runtime.cmake")
-    if(BUILD_SHARED_LIBS)
-      set(AOM_DISABLE_GTEST_CMAKE 1)
-    endif()
-  endif()
-
   if(BUILD_SHARED_LIBS AND APPLE) # Silence an RPATH warning.
     set(CMAKE_MACOSX_RPATH 1)
   endif()
@@ -247,15 +245,16 @@ if(ENABLE_TESTS)
   include_directories(
     "${AOM_ROOT}/third_party/googletest/src/googletest/include")
 
-  if(AOM_DISABLE_GTEST_CMAKE)
-    include_directories("${AOM_ROOT}/third_party/googletest/src/googletest")
-    add_library(
-      gtest
-      STATIC
-      "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
+  include_directories("${AOM_ROOT}/third_party/googletest/src/googletest")
+  add_library(
+    aom_gtest
+    STATIC "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
+  if(MSVC OR WIN32)
+    target_compile_definitions(aom_gtest PRIVATE GTEST_OS_WINDOWS=1)
+  elseif(CONFIG_MULTITHREAD AND CMAKE_USE_PTHREADS_INIT)
+    target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=1)
   else()
-    add_subdirectory("${AOM_ROOT}/third_party/googletest/src/googletest"
-                     EXCLUDE_FROM_ALL)
+    target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=0)
   endif()
 endif()
 
@@ -307,12 +306,12 @@ function(setup_aom_test_targets)
       add_executable(test_intra_pred_speed ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
                      $<TARGET_OBJECTS:aom_common_app_util>)
       target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom
-                            gtest)
+                            aom_gtest)
       list(APPEND AOM_APP_TARGETS test_intra_pred_speed)
     endif()
   endif()
 
-  target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom gtest)
+  target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom aom_gtest)
 
   if(CONFIG_LIBYUV)
     target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:yuv>)
diff --git a/libaom/test/test_data_util.cmake b/libaom/test/test_data_util.cmake
index 6d684cb..c3c86aa 100644
--- a/libaom/test/test_data_util.cmake
+++ b/libaom/test/test_data_util.cmake
@@ -500,6 +500,12 @@ if(CONFIG_AV1_DECODER)
               "av1-1-b8-03-sizeup.mkv.md5"
               "av1-1-b8-03-sizedown.mkv"
               "av1-1-b8-03-sizedown.mkv.md5"
+              "av1-1-b8-04-cdfupdate.ivf"
+              "av1-1-b8-04-cdfupdate.ivf.md5"
+              "av1-1-b8-05-mv.ivf"
+              "av1-1-b8-05-mv.ivf.md5"
+              "av1-1-b8-06-mfmv.ivf"
+              "av1-1-b8-06-mfmv.ivf.md5"
               "av1-1-b8-22-svc-L2T1.ivf"
               "av1-1-b8-22-svc-L2T1.ivf.md5"
               "av1-1-b8-22-svc-L1T2.ivf"
diff --git a/libaom/test/test_vectors.cc b/libaom/test/test_vectors.cc
index d2f333f..d2cd901 100644
--- a/libaom/test/test_vectors.cc
+++ b/libaom/test/test_vectors.cc
@@ -16,125 +16,243 @@ namespace libaom_test {
 #define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
 
 #if CONFIG_AV1_DECODER
-const char *const kAV1TestVectors[] = {
-  "av1-1-b8-00-quantizer-00.ivf",  "av1-1-b8-00-quantizer-01.ivf",
-  "av1-1-b8-00-quantizer-02.ivf",  "av1-1-b8-00-quantizer-03.ivf",
-  "av1-1-b8-00-quantizer-04.ivf",  "av1-1-b8-00-quantizer-05.ivf",
-  "av1-1-b8-00-quantizer-06.ivf",  "av1-1-b8-00-quantizer-07.ivf",
-  "av1-1-b8-00-quantizer-08.ivf",  "av1-1-b8-00-quantizer-09.ivf",
-  "av1-1-b8-00-quantizer-10.ivf",  "av1-1-b8-00-quantizer-11.ivf",
-  "av1-1-b8-00-quantizer-12.ivf",  "av1-1-b8-00-quantizer-13.ivf",
-  "av1-1-b8-00-quantizer-14.ivf",  "av1-1-b8-00-quantizer-15.ivf",
-  "av1-1-b8-00-quantizer-16.ivf",  "av1-1-b8-00-quantizer-17.ivf",
-  "av1-1-b8-00-quantizer-18.ivf",  "av1-1-b8-00-quantizer-19.ivf",
-  "av1-1-b8-00-quantizer-20.ivf",  "av1-1-b8-00-quantizer-21.ivf",
-  "av1-1-b8-00-quantizer-22.ivf",  "av1-1-b8-00-quantizer-23.ivf",
-  "av1-1-b8-00-quantizer-24.ivf",  "av1-1-b8-00-quantizer-25.ivf",
-  "av1-1-b8-00-quantizer-26.ivf",  "av1-1-b8-00-quantizer-27.ivf",
-  "av1-1-b8-00-quantizer-28.ivf",  "av1-1-b8-00-quantizer-29.ivf",
-  "av1-1-b8-00-quantizer-30.ivf",  "av1-1-b8-00-quantizer-31.ivf",
-  "av1-1-b8-00-quantizer-32.ivf",  "av1-1-b8-00-quantizer-33.ivf",
-  "av1-1-b8-00-quantizer-34.ivf",  "av1-1-b8-00-quantizer-35.ivf",
-  "av1-1-b8-00-quantizer-36.ivf",  "av1-1-b8-00-quantizer-37.ivf",
-  "av1-1-b8-00-quantizer-38.ivf",  "av1-1-b8-00-quantizer-39.ivf",
-  "av1-1-b8-00-quantizer-40.ivf",  "av1-1-b8-00-quantizer-41.ivf",
-  "av1-1-b8-00-quantizer-42.ivf",  "av1-1-b8-00-quantizer-43.ivf",
-  "av1-1-b8-00-quantizer-44.ivf",  "av1-1-b8-00-quantizer-45.ivf",
-  "av1-1-b8-00-quantizer-46.ivf",  "av1-1-b8-00-quantizer-47.ivf",
-  "av1-1-b8-00-quantizer-48.ivf",  "av1-1-b8-00-quantizer-49.ivf",
-  "av1-1-b8-00-quantizer-50.ivf",  "av1-1-b8-00-quantizer-51.ivf",
-  "av1-1-b8-00-quantizer-52.ivf",  "av1-1-b8-00-quantizer-53.ivf",
-  "av1-1-b8-00-quantizer-54.ivf",  "av1-1-b8-00-quantizer-55.ivf",
-  "av1-1-b8-00-quantizer-56.ivf",  "av1-1-b8-00-quantizer-57.ivf",
-  "av1-1-b8-00-quantizer-58.ivf",  "av1-1-b8-00-quantizer-59.ivf",
-  "av1-1-b8-00-quantizer-60.ivf",  "av1-1-b8-00-quantizer-61.ivf",
-  "av1-1-b8-00-quantizer-62.ivf",  "av1-1-b8-00-quantizer-63.ivf",
-  "av1-1-b10-00-quantizer-00.ivf", "av1-1-b10-00-quantizer-01.ivf",
-  "av1-1-b10-00-quantizer-02.ivf", "av1-1-b10-00-quantizer-03.ivf",
-  "av1-1-b10-00-quantizer-04.ivf", "av1-1-b10-00-quantizer-05.ivf",
-  "av1-1-b10-00-quantizer-06.ivf", "av1-1-b10-00-quantizer-07.ivf",
-  "av1-1-b10-00-quantizer-08.ivf", "av1-1-b10-00-quantizer-09.ivf",
-  "av1-1-b10-00-quantizer-10.ivf", "av1-1-b10-00-quantizer-11.ivf",
-  "av1-1-b10-00-quantizer-12.ivf", "av1-1-b10-00-quantizer-13.ivf",
-  "av1-1-b10-00-quantizer-14.ivf", "av1-1-b10-00-quantizer-15.ivf",
-  "av1-1-b10-00-quantizer-16.ivf", "av1-1-b10-00-quantizer-17.ivf",
-  "av1-1-b10-00-quantizer-18.ivf", "av1-1-b10-00-quantizer-19.ivf",
-  "av1-1-b10-00-quantizer-20.ivf", "av1-1-b10-00-quantizer-21.ivf",
-  "av1-1-b10-00-quantizer-22.ivf", "av1-1-b10-00-quantizer-23.ivf",
-  "av1-1-b10-00-quantizer-24.ivf", "av1-1-b10-00-quantizer-25.ivf",
-  "av1-1-b10-00-quantizer-26.ivf", "av1-1-b10-00-quantizer-27.ivf",
-  "av1-1-b10-00-quantizer-28.ivf", "av1-1-b10-00-quantizer-29.ivf",
-  "av1-1-b10-00-quantizer-30.ivf", "av1-1-b10-00-quantizer-31.ivf",
-  "av1-1-b10-00-quantizer-32.ivf", "av1-1-b10-00-quantizer-33.ivf",
-  "av1-1-b10-00-quantizer-34.ivf", "av1-1-b10-00-quantizer-35.ivf",
-  "av1-1-b10-00-quantizer-36.ivf", "av1-1-b10-00-quantizer-37.ivf",
-  "av1-1-b10-00-quantizer-38.ivf", "av1-1-b10-00-quantizer-39.ivf",
-  "av1-1-b10-00-quantizer-40.ivf", "av1-1-b10-00-quantizer-41.ivf",
-  "av1-1-b10-00-quantizer-42.ivf", "av1-1-b10-00-quantizer-43.ivf",
-  "av1-1-b10-00-quantizer-44.ivf", "av1-1-b10-00-quantizer-45.ivf",
-  "av1-1-b10-00-quantizer-46.ivf", "av1-1-b10-00-quantizer-47.ivf",
-  "av1-1-b10-00-quantizer-48.ivf", "av1-1-b10-00-quantizer-49.ivf",
-  "av1-1-b10-00-quantizer-50.ivf", "av1-1-b10-00-quantizer-51.ivf",
-  "av1-1-b10-00-quantizer-52.ivf", "av1-1-b10-00-quantizer-53.ivf",
-  "av1-1-b10-00-quantizer-54.ivf", "av1-1-b10-00-quantizer-55.ivf",
-  "av1-1-b10-00-quantizer-56.ivf", "av1-1-b10-00-quantizer-57.ivf",
-  "av1-1-b10-00-quantizer-58.ivf", "av1-1-b10-00-quantizer-59.ivf",
-  "av1-1-b10-00-quantizer-60.ivf", "av1-1-b10-00-quantizer-61.ivf",
-  "av1-1-b10-00-quantizer-62.ivf", "av1-1-b10-00-quantizer-63.ivf",
-  "av1-1-b8-01-size-16x16.ivf",    "av1-1-b8-01-size-16x18.ivf",
-  "av1-1-b8-01-size-16x32.ivf",    "av1-1-b8-01-size-16x34.ivf",
-  "av1-1-b8-01-size-16x64.ivf",    "av1-1-b8-01-size-16x66.ivf",
-  "av1-1-b8-01-size-18x16.ivf",    "av1-1-b8-01-size-18x18.ivf",
-  "av1-1-b8-01-size-18x32.ivf",    "av1-1-b8-01-size-18x34.ivf",
-  "av1-1-b8-01-size-18x64.ivf",    "av1-1-b8-01-size-18x66.ivf",
-  "av1-1-b8-01-size-196x196.ivf",  "av1-1-b8-01-size-196x198.ivf",
-  "av1-1-b8-01-size-196x200.ivf",  "av1-1-b8-01-size-196x202.ivf",
-  "av1-1-b8-01-size-196x208.ivf",  "av1-1-b8-01-size-196x210.ivf",
-  "av1-1-b8-01-size-196x224.ivf",  "av1-1-b8-01-size-196x226.ivf",
-  "av1-1-b8-01-size-198x196.ivf",  "av1-1-b8-01-size-198x198.ivf",
-  "av1-1-b8-01-size-198x200.ivf",  "av1-1-b8-01-size-198x202.ivf",
-  "av1-1-b8-01-size-198x208.ivf",  "av1-1-b8-01-size-198x210.ivf",
-  "av1-1-b8-01-size-198x224.ivf",  "av1-1-b8-01-size-198x226.ivf",
-  "av1-1-b8-01-size-200x196.ivf",  "av1-1-b8-01-size-200x198.ivf",
-  "av1-1-b8-01-size-200x200.ivf",  "av1-1-b8-01-size-200x202.ivf",
-  "av1-1-b8-01-size-200x208.ivf",  "av1-1-b8-01-size-200x210.ivf",
-  "av1-1-b8-01-size-200x224.ivf",  "av1-1-b8-01-size-200x226.ivf",
-  "av1-1-b8-01-size-202x196.ivf",  "av1-1-b8-01-size-202x198.ivf",
-  "av1-1-b8-01-size-202x200.ivf",  "av1-1-b8-01-size-202x202.ivf",
-  "av1-1-b8-01-size-202x208.ivf",  "av1-1-b8-01-size-202x210.ivf",
-  "av1-1-b8-01-size-202x224.ivf",  "av1-1-b8-01-size-202x226.ivf",
-  "av1-1-b8-01-size-208x196.ivf",  "av1-1-b8-01-size-208x198.ivf",
-  "av1-1-b8-01-size-208x200.ivf",  "av1-1-b8-01-size-208x202.ivf",
-  "av1-1-b8-01-size-208x208.ivf",  "av1-1-b8-01-size-208x210.ivf",
-  "av1-1-b8-01-size-208x224.ivf",  "av1-1-b8-01-size-208x226.ivf",
-  "av1-1-b8-01-size-210x196.ivf",  "av1-1-b8-01-size-210x198.ivf",
-  "av1-1-b8-01-size-210x200.ivf",  "av1-1-b8-01-size-210x202.ivf",
-  "av1-1-b8-01-size-210x208.ivf",  "av1-1-b8-01-size-210x210.ivf",
-  "av1-1-b8-01-size-210x224.ivf",  "av1-1-b8-01-size-210x226.ivf",
-  "av1-1-b8-01-size-224x196.ivf",  "av1-1-b8-01-size-224x198.ivf",
-  "av1-1-b8-01-size-224x200.ivf",  "av1-1-b8-01-size-224x202.ivf",
-  "av1-1-b8-01-size-224x208.ivf",  "av1-1-b8-01-size-224x210.ivf",
-  "av1-1-b8-01-size-224x224.ivf",  "av1-1-b8-01-size-224x226.ivf",
-  "av1-1-b8-01-size-226x196.ivf",  "av1-1-b8-01-size-226x198.ivf",
-  "av1-1-b8-01-size-226x200.ivf",  "av1-1-b8-01-size-226x202.ivf",
-  "av1-1-b8-01-size-226x208.ivf",  "av1-1-b8-01-size-226x210.ivf",
-  "av1-1-b8-01-size-226x224.ivf",  "av1-1-b8-01-size-226x226.ivf",
-  "av1-1-b8-01-size-32x16.ivf",    "av1-1-b8-01-size-32x18.ivf",
-  "av1-1-b8-01-size-32x32.ivf",    "av1-1-b8-01-size-32x34.ivf",
-  "av1-1-b8-01-size-32x64.ivf",    "av1-1-b8-01-size-32x66.ivf",
-  "av1-1-b8-01-size-34x16.ivf",    "av1-1-b8-01-size-34x18.ivf",
-  "av1-1-b8-01-size-34x32.ivf",    "av1-1-b8-01-size-34x34.ivf",
-  "av1-1-b8-01-size-34x64.ivf",    "av1-1-b8-01-size-34x66.ivf",
-  "av1-1-b8-01-size-64x16.ivf",    "av1-1-b8-01-size-64x18.ivf",
-  "av1-1-b8-01-size-64x32.ivf",    "av1-1-b8-01-size-64x34.ivf",
-  "av1-1-b8-01-size-64x64.ivf",    "av1-1-b8-01-size-64x66.ivf",
-  "av1-1-b8-01-size-66x16.ivf",    "av1-1-b8-01-size-66x18.ivf",
-  "av1-1-b8-01-size-66x32.ivf",    "av1-1-b8-01-size-66x34.ivf",
-  "av1-1-b8-01-size-66x64.ivf",    "av1-1-b8-01-size-66x66.ivf",
-  "av1-1-b8-02-allintra.ivf",      "av1-1-b8-03-sizedown.mkv",
-  "av1-1-b8-03-sizeup.mkv",        "av1-1-b8-22-svc-L1T2.ivf",
-  "av1-1-b8-22-svc-L2T1.ivf",      "av1-1-b8-22-svc-L2T2.ivf"
-};
+const char *const kAV1TestVectors[] = { "av1-1-b8-00-quantizer-00.ivf",
+                                        "av1-1-b8-00-quantizer-01.ivf",
+                                        "av1-1-b8-00-quantizer-02.ivf",
+                                        "av1-1-b8-00-quantizer-03.ivf",
+                                        "av1-1-b8-00-quantizer-04.ivf",
+                                        "av1-1-b8-00-quantizer-05.ivf",
+                                        "av1-1-b8-00-quantizer-06.ivf",
+                                        "av1-1-b8-00-quantizer-07.ivf",
+                                        "av1-1-b8-00-quantizer-08.ivf",
+                                        "av1-1-b8-00-quantizer-09.ivf",
+                                        "av1-1-b8-00-quantizer-10.ivf",
+                                        "av1-1-b8-00-quantizer-11.ivf",
+                                        "av1-1-b8-00-quantizer-12.ivf",
+                                        "av1-1-b8-00-quantizer-13.ivf",
+                                        "av1-1-b8-00-quantizer-14.ivf",
+                                        "av1-1-b8-00-quantizer-15.ivf",
+                                        "av1-1-b8-00-quantizer-16.ivf",
+                                        "av1-1-b8-00-quantizer-17.ivf",
+                                        "av1-1-b8-00-quantizer-18.ivf",
+                                        "av1-1-b8-00-quantizer-19.ivf",
+                                        "av1-1-b8-00-quantizer-20.ivf",
+                                        "av1-1-b8-00-quantizer-21.ivf",
+                                        "av1-1-b8-00-quantizer-22.ivf",
+                                        "av1-1-b8-00-quantizer-23.ivf",
+                                        "av1-1-b8-00-quantizer-24.ivf",
+                                        "av1-1-b8-00-quantizer-25.ivf",
+                                        "av1-1-b8-00-quantizer-26.ivf",
+                                        "av1-1-b8-00-quantizer-27.ivf",
+                                        "av1-1-b8-00-quantizer-28.ivf",
+                                        "av1-1-b8-00-quantizer-29.ivf",
+                                        "av1-1-b8-00-quantizer-30.ivf",
+                                        "av1-1-b8-00-quantizer-31.ivf",
+                                        "av1-1-b8-00-quantizer-32.ivf",
+                                        "av1-1-b8-00-quantizer-33.ivf",
+                                        "av1-1-b8-00-quantizer-34.ivf",
+                                        "av1-1-b8-00-quantizer-35.ivf",
+                                        "av1-1-b8-00-quantizer-36.ivf",
+                                        "av1-1-b8-00-quantizer-37.ivf",
+                                        "av1-1-b8-00-quantizer-38.ivf",
+                                        "av1-1-b8-00-quantizer-39.ivf",
+                                        "av1-1-b8-00-quantizer-40.ivf",
+                                        "av1-1-b8-00-quantizer-41.ivf",
+                                        "av1-1-b8-00-quantizer-42.ivf",
+                                        "av1-1-b8-00-quantizer-43.ivf",
+                                        "av1-1-b8-00-quantizer-44.ivf",
+                                        "av1-1-b8-00-quantizer-45.ivf",
+                                        "av1-1-b8-00-quantizer-46.ivf",
+                                        "av1-1-b8-00-quantizer-47.ivf",
+                                        "av1-1-b8-00-quantizer-48.ivf",
+                                        "av1-1-b8-00-quantizer-49.ivf",
+                                        "av1-1-b8-00-quantizer-50.ivf",
+                                        "av1-1-b8-00-quantizer-51.ivf",
+                                        "av1-1-b8-00-quantizer-52.ivf",
+                                        "av1-1-b8-00-quantizer-53.ivf",
+                                        "av1-1-b8-00-quantizer-54.ivf",
+                                        "av1-1-b8-00-quantizer-55.ivf",
+                                        "av1-1-b8-00-quantizer-56.ivf",
+                                        "av1-1-b8-00-quantizer-57.ivf",
+                                        "av1-1-b8-00-quantizer-58.ivf",
+                                        "av1-1-b8-00-quantizer-59.ivf",
+                                        "av1-1-b8-00-quantizer-60.ivf",
+                                        "av1-1-b8-00-quantizer-61.ivf",
+                                        "av1-1-b8-00-quantizer-62.ivf",
+                                        "av1-1-b8-00-quantizer-63.ivf",
+                                        "av1-1-b10-00-quantizer-00.ivf",
+                                        "av1-1-b10-00-quantizer-01.ivf",
+                                        "av1-1-b10-00-quantizer-02.ivf",
+                                        "av1-1-b10-00-quantizer-03.ivf",
+                                        "av1-1-b10-00-quantizer-04.ivf",
+                                        "av1-1-b10-00-quantizer-05.ivf",
+                                        "av1-1-b10-00-quantizer-06.ivf",
+                                        "av1-1-b10-00-quantizer-07.ivf",
+                                        "av1-1-b10-00-quantizer-08.ivf",
+                                        "av1-1-b10-00-quantizer-09.ivf",
+                                        "av1-1-b10-00-quantizer-10.ivf",
+                                        "av1-1-b10-00-quantizer-11.ivf",
+                                        "av1-1-b10-00-quantizer-12.ivf",
+                                        "av1-1-b10-00-quantizer-13.ivf",
+                                        "av1-1-b10-00-quantizer-14.ivf",
+                                        "av1-1-b10-00-quantizer-15.ivf",
+                                        "av1-1-b10-00-quantizer-16.ivf",
+                                        "av1-1-b10-00-quantizer-17.ivf",
+                                        "av1-1-b10-00-quantizer-18.ivf",
+                                        "av1-1-b10-00-quantizer-19.ivf",
+                                        "av1-1-b10-00-quantizer-20.ivf",
+                                        "av1-1-b10-00-quantizer-21.ivf",
+                                        "av1-1-b10-00-quantizer-22.ivf",
+                                        "av1-1-b10-00-quantizer-23.ivf",
+                                        "av1-1-b10-00-quantizer-24.ivf",
+                                        "av1-1-b10-00-quantizer-25.ivf",
+                                        "av1-1-b10-00-quantizer-26.ivf",
+                                        "av1-1-b10-00-quantizer-27.ivf",
+                                        "av1-1-b10-00-quantizer-28.ivf",
+                                        "av1-1-b10-00-quantizer-29.ivf",
+                                        "av1-1-b10-00-quantizer-30.ivf",
+                                        "av1-1-b10-00-quantizer-31.ivf",
+                                        "av1-1-b10-00-quantizer-32.ivf",
+                                        "av1-1-b10-00-quantizer-33.ivf",
+                                        "av1-1-b10-00-quantizer-34.ivf",
+                                        "av1-1-b10-00-quantizer-35.ivf",
+                                        "av1-1-b10-00-quantizer-36.ivf",
+                                        "av1-1-b10-00-quantizer-37.ivf",
+                                        "av1-1-b10-00-quantizer-38.ivf",
+                                        "av1-1-b10-00-quantizer-39.ivf",
+                                        "av1-1-b10-00-quantizer-40.ivf",
+                                        "av1-1-b10-00-quantizer-41.ivf",
+                                        "av1-1-b10-00-quantizer-42.ivf",
+                                        "av1-1-b10-00-quantizer-43.ivf",
+                                        "av1-1-b10-00-quantizer-44.ivf",
+                                        "av1-1-b10-00-quantizer-45.ivf",
+                                        "av1-1-b10-00-quantizer-46.ivf",
+                                        "av1-1-b10-00-quantizer-47.ivf",
+                                        "av1-1-b10-00-quantizer-48.ivf",
+                                        "av1-1-b10-00-quantizer-49.ivf",
+                                        "av1-1-b10-00-quantizer-50.ivf",
+                                        "av1-1-b10-00-quantizer-51.ivf",
+                                        "av1-1-b10-00-quantizer-52.ivf",
+                                        "av1-1-b10-00-quantizer-53.ivf",
+                                        "av1-1-b10-00-quantizer-54.ivf",
+                                        "av1-1-b10-00-quantizer-55.ivf",
+                                        "av1-1-b10-00-quantizer-56.ivf",
+                                        "av1-1-b10-00-quantizer-57.ivf",
+                                        "av1-1-b10-00-quantizer-58.ivf",
+                                        "av1-1-b10-00-quantizer-59.ivf",
+                                        "av1-1-b10-00-quantizer-60.ivf",
+                                        "av1-1-b10-00-quantizer-61.ivf",
+                                        "av1-1-b10-00-quantizer-62.ivf",
+                                        "av1-1-b10-00-quantizer-63.ivf",
+                                        "av1-1-b8-01-size-16x16.ivf",
+                                        "av1-1-b8-01-size-16x18.ivf",
+                                        "av1-1-b8-01-size-16x32.ivf",
+                                        "av1-1-b8-01-size-16x34.ivf",
+                                        "av1-1-b8-01-size-16x64.ivf",
+                                        "av1-1-b8-01-size-16x66.ivf",
+                                        "av1-1-b8-01-size-18x16.ivf",
+                                        "av1-1-b8-01-size-18x18.ivf",
+                                        "av1-1-b8-01-size-18x32.ivf",
+                                        "av1-1-b8-01-size-18x34.ivf",
+                                        "av1-1-b8-01-size-18x64.ivf",
+                                        "av1-1-b8-01-size-18x66.ivf",
+                                        "av1-1-b8-01-size-196x196.ivf",
+                                        "av1-1-b8-01-size-196x198.ivf",
+                                        "av1-1-b8-01-size-196x200.ivf",
+                                        "av1-1-b8-01-size-196x202.ivf",
+                                        "av1-1-b8-01-size-196x208.ivf",
+                                        "av1-1-b8-01-size-196x210.ivf",
+                                        "av1-1-b8-01-size-196x224.ivf",
+                                        "av1-1-b8-01-size-196x226.ivf",
+                                        "av1-1-b8-01-size-198x196.ivf",
+                                        "av1-1-b8-01-size-198x198.ivf",
+                                        "av1-1-b8-01-size-198x200.ivf",
+                                        "av1-1-b8-01-size-198x202.ivf",
+                                        "av1-1-b8-01-size-198x208.ivf",
+                                        "av1-1-b8-01-size-198x210.ivf",
+                                        "av1-1-b8-01-size-198x224.ivf",
+                                        "av1-1-b8-01-size-198x226.ivf",
+                                        "av1-1-b8-01-size-200x196.ivf",
+                                        "av1-1-b8-01-size-200x198.ivf",
+                                        "av1-1-b8-01-size-200x200.ivf",
+                                        "av1-1-b8-01-size-200x202.ivf",
+                                        "av1-1-b8-01-size-200x208.ivf",
+                                        "av1-1-b8-01-size-200x210.ivf",
+                                        "av1-1-b8-01-size-200x224.ivf",
+                                        "av1-1-b8-01-size-200x226.ivf",
+                                        "av1-1-b8-01-size-202x196.ivf",
+                                        "av1-1-b8-01-size-202x198.ivf",
+                                        "av1-1-b8-01-size-202x200.ivf",
+                                        "av1-1-b8-01-size-202x202.ivf",
+                                        "av1-1-b8-01-size-202x208.ivf",
+                                        "av1-1-b8-01-size-202x210.ivf",
+                                        "av1-1-b8-01-size-202x224.ivf",
+                                        "av1-1-b8-01-size-202x226.ivf",
+                                        "av1-1-b8-01-size-208x196.ivf",
+                                        "av1-1-b8-01-size-208x198.ivf",
+                                        "av1-1-b8-01-size-208x200.ivf",
+                                        "av1-1-b8-01-size-208x202.ivf",
+                                        "av1-1-b8-01-size-208x208.ivf",
+                                        "av1-1-b8-01-size-208x210.ivf",
+                                        "av1-1-b8-01-size-208x224.ivf",
+                                        "av1-1-b8-01-size-208x226.ivf",
+                                        "av1-1-b8-01-size-210x196.ivf",
+                                        "av1-1-b8-01-size-210x198.ivf",
+                                        "av1-1-b8-01-size-210x200.ivf",
+                                        "av1-1-b8-01-size-210x202.ivf",
+                                        "av1-1-b8-01-size-210x208.ivf",
+                                        "av1-1-b8-01-size-210x210.ivf",
+                                        "av1-1-b8-01-size-210x224.ivf",
+                                        "av1-1-b8-01-size-210x226.ivf",
+                                        "av1-1-b8-01-size-224x196.ivf",
+                                        "av1-1-b8-01-size-224x198.ivf",
+                                        "av1-1-b8-01-size-224x200.ivf",
+                                        "av1-1-b8-01-size-224x202.ivf",
+                                        "av1-1-b8-01-size-224x208.ivf",
+                                        "av1-1-b8-01-size-224x210.ivf",
+                                        "av1-1-b8-01-size-224x224.ivf",
+                                        "av1-1-b8-01-size-224x226.ivf",
+                                        "av1-1-b8-01-size-226x196.ivf",
+                                        "av1-1-b8-01-size-226x198.ivf",
+                                        "av1-1-b8-01-size-226x200.ivf",
+                                        "av1-1-b8-01-size-226x202.ivf",
+                                        "av1-1-b8-01-size-226x208.ivf",
+                                        "av1-1-b8-01-size-226x210.ivf",
+                                        "av1-1-b8-01-size-226x224.ivf",
+                                        "av1-1-b8-01-size-226x226.ivf",
+                                        "av1-1-b8-01-size-32x16.ivf",
+                                        "av1-1-b8-01-size-32x18.ivf",
+                                        "av1-1-b8-01-size-32x32.ivf",
+                                        "av1-1-b8-01-size-32x34.ivf",
+                                        "av1-1-b8-01-size-32x64.ivf",
+                                        "av1-1-b8-01-size-32x66.ivf",
+                                        "av1-1-b8-01-size-34x16.ivf",
+                                        "av1-1-b8-01-size-34x18.ivf",
+                                        "av1-1-b8-01-size-34x32.ivf",
+                                        "av1-1-b8-01-size-34x34.ivf",
+                                        "av1-1-b8-01-size-34x64.ivf",
+                                        "av1-1-b8-01-size-34x66.ivf",
+                                        "av1-1-b8-01-size-64x16.ivf",
+                                        "av1-1-b8-01-size-64x18.ivf",
+                                        "av1-1-b8-01-size-64x32.ivf",
+                                        "av1-1-b8-01-size-64x34.ivf",
+                                        "av1-1-b8-01-size-64x64.ivf",
+                                        "av1-1-b8-01-size-64x66.ivf",
+                                        "av1-1-b8-01-size-66x16.ivf",
+                                        "av1-1-b8-01-size-66x18.ivf",
+                                        "av1-1-b8-01-size-66x32.ivf",
+                                        "av1-1-b8-01-size-66x34.ivf",
+                                        "av1-1-b8-01-size-66x64.ivf",
+                                        "av1-1-b8-01-size-66x66.ivf",
+                                        "av1-1-b8-02-allintra.ivf",
+                                        "av1-1-b8-03-sizedown.mkv",
+                                        "av1-1-b8-03-sizeup.mkv",
+                                        "av1-1-b8-04-cdfupdate.ivf",
+                                        "av1-1-b8-05-mv.ivf",
+                                        "av1-1-b8-06-mfmv.ivf",
+                                        "av1-1-b8-22-svc-L1T2.ivf",
+                                        "av1-1-b8-22-svc-L2T1.ivf",
+                                        "av1-1-b8-22-svc-L2T2.ivf" };
 const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors);
 #endif  // CONFIG_AV1_DECODER
 
diff --git a/libaom/test/variance_test.cc b/libaom/test/variance_test.cc
index 0df314b..1942de0 100644
--- a/libaom/test/variance_test.cc
+++ b/libaom/test/variance_test.cc
@@ -43,10 +43,10 @@ typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
 typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
                                       const uint8_t *b, int b_stride);
 typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
-typedef unsigned int (*JntSubpixAvgVarMxNFunc)(
+typedef unsigned int (*DistWtdSubpixAvgVarMxNFunc)(
     const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
     int b_stride, uint32_t *sse, const uint8_t *second_pred,
-    const JNT_COMP_PARAMS *jcp_param);
+    const DIST_WTD_COMP_PARAMS *jcp_param);
 typedef uint32_t (*ObmcSubpelVarFunc)(const uint8_t *pre, int pre_stride,
                                       int xoffset, int yoffset,
                                       const int32_t *wsrc, const int32_t *mask,
@@ -216,10 +216,10 @@ static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src,
   return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
 
-static uint32_t jnt_subpel_avg_variance_ref(
+static uint32_t dist_wtd_subpel_avg_variance_ref(
     const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, int l2w,
     int l2h, int xoff, int yoff, uint32_t *sse_ptr, bool use_high_bit_depth,
-    aom_bit_depth_t bit_depth, JNT_COMP_PARAMS *jcp_param) {
+    aom_bit_depth_t bit_depth, DIST_WTD_COMP_PARAMS *jcp_param) {
   int64_t se = 0;
   uint64_t sse = 0;
   const int w = 1 << l2w;
@@ -703,13 +703,14 @@ class SubpelVarianceTest
  protected:
   void RefTest();
   void ExtremeRefTest();
+  void SpeedTest();
 
   ACMRandom rnd_;
   uint8_t *src_;
   uint8_t *ref_;
   uint8_t *sec_;
   TestParams<FunctionType> params_;
-  JNT_COMP_PARAMS jcp_param_;
+  DIST_WTD_COMP_PARAMS jcp_param_;
 
   // some relay helpers
   bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
@@ -785,6 +786,41 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
   }
 }
 
+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::SpeedTest() {
+  if (!use_high_bit_depth()) {
+    for (int j = 0; j < block_size(); j++) {
+      src_[j] = rnd_.Rand8();
+    }
+    for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+      ref_[j] = rnd_.Rand8();
+    }
+  } else {
+    for (int j = 0; j < block_size(); j++) {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+    }
+    for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+    }
+  }
+
+  unsigned int sse1;
+  int run_time = 1000000000 / block_size();
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    params_.func(ref_, width() + 1, x, y, src_, width(), &sse1);
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
+         params_.bit_depth, elapsed_time);
+}
+
 template <>
 void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
   for (int x = 0; x < 8; ++x) {
@@ -820,7 +856,7 @@ void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
 }
 
 template <>
-void SubpelVarianceTest<JntSubpixAvgVarMxNFunc>::RefTest() {
+void SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>::RefTest() {
   for (int x = 0; x < 8; ++x) {
     for (int y = 0; y < 8; ++y) {
       if (!use_high_bit_depth()) {
@@ -849,7 +885,7 @@ void SubpelVarianceTest<JntSubpixAvgVarMxNFunc>::RefTest() {
           ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y,
                                                        src_, width(), &sse1,
                                                        sec_, &jcp_param_));
-          var2 = jnt_subpel_avg_variance_ref(
+          var2 = dist_wtd_subpel_avg_variance_ref(
               ref_, src_, sec_, params_.log2width, params_.log2height, x, y,
               &sse2, use_high_bit_depth(), params_.bit_depth, &jcp_param_);
           EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
@@ -1022,7 +1058,8 @@ typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest;
 typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
-typedef SubpelVarianceTest<JntSubpixAvgVarMxNFunc> AvxJntSubpelAvgVarianceTest;
+typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
+    AvxDistWtdSubpelAvgVarianceTest;
 typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest;
 
 TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
@@ -1039,7 +1076,7 @@ TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
-TEST_P(AvxJntSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
@@ -1121,36 +1158,35 @@ INSTANTIATE_TEST_CASE_P(
         SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
         SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0)));
 
-typedef TestParams<JntSubpixAvgVarMxNFunc> JntSubpelAvgVarianceParams;
+typedef TestParams<DistWtdSubpixAvgVarMxNFunc> DistWtdSubpelAvgVarianceParams;
 INSTANTIATE_TEST_CASE_P(
-    C, AvxJntSubpelAvgVarianceTest,
-    ::testing::Values(
-        JntSubpelAvgVarianceParams(6, 6, &aom_jnt_sub_pixel_avg_variance64x64_c,
-                                   0),
-        JntSubpelAvgVarianceParams(6, 5, &aom_jnt_sub_pixel_avg_variance64x32_c,
-                                   0),
-        JntSubpelAvgVarianceParams(5, 6, &aom_jnt_sub_pixel_avg_variance32x64_c,
-                                   0),
-        JntSubpelAvgVarianceParams(5, 5, &aom_jnt_sub_pixel_avg_variance32x32_c,
-                                   0),
-        JntSubpelAvgVarianceParams(5, 4, &aom_jnt_sub_pixel_avg_variance32x16_c,
-                                   0),
-        JntSubpelAvgVarianceParams(4, 5, &aom_jnt_sub_pixel_avg_variance16x32_c,
-                                   0),
-        JntSubpelAvgVarianceParams(4, 4, &aom_jnt_sub_pixel_avg_variance16x16_c,
-                                   0),
-        JntSubpelAvgVarianceParams(4, 3, &aom_jnt_sub_pixel_avg_variance16x8_c,
-                                   0),
-        JntSubpelAvgVarianceParams(3, 4, &aom_jnt_sub_pixel_avg_variance8x16_c,
-                                   0),
-        JntSubpelAvgVarianceParams(3, 3, &aom_jnt_sub_pixel_avg_variance8x8_c,
-                                   0),
-        JntSubpelAvgVarianceParams(3, 2, &aom_jnt_sub_pixel_avg_variance8x4_c,
-                                   0),
-        JntSubpelAvgVarianceParams(2, 3, &aom_jnt_sub_pixel_avg_variance4x8_c,
-                                   0),
-        JntSubpelAvgVarianceParams(2, 2, &aom_jnt_sub_pixel_avg_variance4x4_c,
-                                   0)));
+    C, AvxDistWtdSubpelAvgVarianceTest,
+    ::testing::Values(DistWtdSubpelAvgVarianceParams(
+                          6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0)));
 
 INSTANTIATE_TEST_CASE_P(
     C, AvxObmcSubpelVarianceTest,
@@ -1188,6 +1224,7 @@ TEST_P(AvxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
 TEST_P(AvxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(AvxHBDSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
 
 /* TODO(debargha): This test does not support the highbd version
@@ -1677,6 +1714,9 @@ INSTANTIATE_TEST_CASE_P(AVX2, AvxHBDVarianceTest,
 #endif  // HAVE_AVX2
 
 const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
+  SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_sse2, 12),
+  SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_sse2, 12),
+  SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_sse2, 12),
   SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_sse2, 12),
   SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_sse2, 12),
   SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_sse2, 12),
@@ -1688,6 +1728,9 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
   SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_sse2, 12),
   SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_sse2, 12),
   SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_sse2, 12),
+  SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_sse2, 10),
+  SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_sse2, 10),
+  SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_sse2, 10),
   SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_sse2, 10),
   SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_sse2, 10),
   SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_sse2, 10),
@@ -1699,6 +1742,9 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
   SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_sse2, 10),
   SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_sse2, 10),
   SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_sse2, 10),
+  SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_sse2, 8),
+  SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_sse2, 8),
+  SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_sse2, 8),
   SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_sse2, 8),
   SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_sse2, 8),
   SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_sse2, 8),
@@ -1711,7 +1757,6 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
   SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_sse2, 8),
   SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8)
 };
-
 INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDSubpelVarianceTest,
                         ::testing::ValuesIn(kArrayHBDSubpelVariance_sse2));
 
@@ -1840,44 +1885,34 @@ INSTANTIATE_TEST_CASE_P(
                                 0)));
 
 INSTANTIATE_TEST_CASE_P(
-    SSSE3, AvxJntSubpelAvgVarianceTest,
+    SSSE3, AvxDistWtdSubpelAvgVarianceTest,
     ::testing::Values(
-        JntSubpelAvgVarianceParams(6, 6,
-                                   &aom_jnt_sub_pixel_avg_variance64x64_ssse3,
-                                   0),
-        JntSubpelAvgVarianceParams(6, 5,
-                                   &aom_jnt_sub_pixel_avg_variance64x32_ssse3,
-                                   0),
-        JntSubpelAvgVarianceParams(5, 6,
-                                   &aom_jnt_sub_pixel_avg_variance32x64_ssse3,
-                                   0),
-        JntSubpelAvgVarianceParams(5, 5,
-                                   &aom_jnt_sub_pixel_avg_variance32x32_ssse3,
-                                   0),
-        JntSubpelAvgVarianceParams(5, 4,
-                                   &aom_jnt_sub_pixel_avg_variance32x16_ssse3,
-                                   0),
-        JntSubpelAvgVarianceParams(4, 5,
-                                   &aom_jnt_sub_pixel_avg_variance16x32_ssse3,
-                                   0),
-        JntSubpelAvgVarianceParams(4, 4,
-                                   &aom_jnt_sub_pixel_avg_variance16x16_ssse3,
-                                   0),
-        JntSubpelAvgVarianceParams(4, 3,
-                                   &aom_jnt_sub_pixel_avg_variance16x8_ssse3,
-                                   0),
-        JntSubpelAvgVarianceParams(3, 4,
-                                   &aom_jnt_sub_pixel_avg_variance8x16_ssse3,
-                                   0),
-        JntSubpelAvgVarianceParams(3, 3,
-                                   &aom_jnt_sub_pixel_avg_variance8x8_ssse3, 0),
-        JntSubpelAvgVarianceParams(3, 2,
-                                   &aom_jnt_sub_pixel_avg_variance8x4_ssse3, 0),
-        JntSubpelAvgVarianceParams(2, 3,
-                                   &aom_jnt_sub_pixel_avg_variance4x8_ssse3, 0),
-        JntSubpelAvgVarianceParams(2, 2,
-                                   &aom_jnt_sub_pixel_avg_variance4x4_ssse3,
-                                   0)));
+        DistWtdSubpelAvgVarianceParams(
+            6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0),
+        DistWtdSubpelAvgVarianceParams(
+            2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0)));
 #endif  // HAVE_SSSE3
 
 #if HAVE_SSE4_1
diff --git a/libaom/test/warp_filter_test_util.cc b/libaom/test/warp_filter_test_util.cc
index 69b2ed4..9208af8 100644
--- a/libaom/test/warp_filter_test_util.cc
+++ b/libaom/test/warp_filter_test_util.cc
@@ -149,7 +149,7 @@ void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
   int do_average = 0;
 
   conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
-  conv_params.use_jnt_comp_avg = 0;
+  conv_params.use_dist_wtd_comp_avg = 0;
 
   const int num_loops = 1000000000 / (out_w + out_h);
   aom_usec_timer timer;
@@ -222,9 +222,9 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
                 conv_params = get_conv_params(0, 0, bd);
               }
               if (jj >= 4) {
-                conv_params.use_jnt_comp_avg = 0;
+                conv_params.use_dist_wtd_comp_avg = 0;
               } else {
-                conv_params.use_jnt_comp_avg = 1;
+                conv_params.use_dist_wtd_comp_avg = 1;
                 conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
                 conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
               }
@@ -236,9 +236,9 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
                     get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd);
               }
               if (jj >= 4) {
-                conv_params.use_jnt_comp_avg = 0;
+                conv_params.use_dist_wtd_comp_avg = 0;
               } else {
-                conv_params.use_jnt_comp_avg = 1;
+                conv_params.use_dist_wtd_comp_avg = 1;
                 conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
                 conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
               }
@@ -342,7 +342,7 @@ void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
   sub_x = 0;
   sub_y = 0;
   int do_average = 0;
-  conv_params.use_jnt_comp_avg = 0;
+  conv_params.use_dist_wtd_comp_avg = 0;
   conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
 
   const int num_loops = 1000000000 / (out_w + out_h);
@@ -419,9 +419,9 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
                 conv_params = get_conv_params(0, 0, bd);
               }
               if (jj >= 4) {
-                conv_params.use_jnt_comp_avg = 0;
+                conv_params.use_dist_wtd_comp_avg = 0;
               } else {
-                conv_params.use_jnt_comp_avg = 1;
+                conv_params.use_dist_wtd_comp_avg = 1;
                 conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
                 conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
               }
@@ -436,9 +436,9 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
                     get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd);
               }
               if (jj >= 4) {
-                conv_params.use_jnt_comp_avg = 0;
+                conv_params.use_dist_wtd_comp_avg = 0;
               } else {
-                conv_params.use_jnt_comp_avg = 1;
+                conv_params.use_dist_wtd_comp_avg = 1;
                 conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
                 conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
               }
diff --git a/libaom/test/yuv_temporal_filter_test.cc b/libaom/test/yuv_temporal_filter_test.cc
new file mode 100644
index 0000000..fcaf0df
--- /dev/null
+++ b/libaom/test/yuv_temporal_filter_test.cc
@@ -0,0 +1,726 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+
+namespace {
+
+using ::libaom_test::ACMRandom;
+
+const int MAX_WIDTH = 32;
+const int MAX_HEIGHT = 32;
+
+typedef void (*YUVTemporalFilterFunc)(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
+    uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
+    uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
+struct TemporalFilterWithBd {
+  TemporalFilterWithBd(YUVTemporalFilterFunc func, int bitdepth)
+      : temporal_filter(func), bd(bitdepth) {}
+
+  YUVTemporalFilterFunc temporal_filter;
+  int bd;
+};
+
+std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) {
+  return os << "Bitdepth: " << tf.bd;
+}
+
+int GetFilterWeight(unsigned int row, unsigned int col,
+                    unsigned int block_height, unsigned int block_width,
+                    const int *const blk_fw, int use_32x32) {
+  if (use_32x32) {
+    return blk_fw[0];
+  }
+
+  return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
+}
+
+template <typename PixelType>
+int GetModIndex(int sum_dist, int index, int rounding, int strength,
+                int filter_weight) {
+  int mod = sum_dist * 3 / index;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = AOMMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+// Lowbitdepth version
+template <>
+int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
+                         int filter_weight) {
+  unsigned int index_mult[14] = {
+    0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124
+  };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = AOMMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+// Highbitdepth version
+template <>
+int GetModIndex<uint16_t>(int sum_dist, int index, int rounding, int strength,
+                          int filter_weight) {
+  int64_t index_mult[14] = { 0U,          0U,          0U,          0U,
+                             3221225472U, 2576980378U, 2147483648U, 1840700270U,
+                             1610612736U, 1431655766U, 1288490189U, 1171354718U,
+                             0U,          991146300U };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = static_cast<int>((sum_dist * index_mult[index]) >> 32);
+  mod += rounding;
+  mod >>= strength;
+
+  mod = AOMMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+template <typename PixelType>
+void SetArray(PixelType *pixel_array, int width, int height, int stride,
+              int val) {
+  for (int row = 0; row < height; row++) {
+    for (int col = 0; col < width; col++) {
+      pixel_array[col] = val;
+    }
+    pixel_array += stride;
+  }
+}
+
+template <typename PixelType>
+void SetArray(PixelType *pixel_array, int width, int height, int stride,
+              ACMRandom *rnd, int low_val, int high_val) {
+  EXPECT_LE(low_val, high_val);
+
+  for (int row = 0; row < height; row++) {
+    for (int col = 0; col < width; col++) {
+      const int val =
+          static_cast<int>((*rnd).PseudoUniform(high_val - low_val));
+      pixel_array[col] = low_val + val;
+    }
+    pixel_array += stride;
+  }
+}
+
+template <typename ValueType>
+bool CheckArrayEqual(const ValueType *arr_1, const ValueType *arr_2, int width,
+                     int height, int stride_1, int stride_2) {
+  for (int row = 0; row < height; row++) {
+    for (int col = 0; col < width; col++) {
+      if (arr_1[col] != arr_2[col]) {
+        return false;
+      }
+    }
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+  return true;
+}
+
+template <typename ValueType>
+void PrintArrayDiff(const ValueType *arr_1, const ValueType *arr_2, int width,
+                    int height, int stride_1, int stride_2) {
+  const ValueType *arr_1_start = arr_1, *arr_2_start = arr_2;
+
+  printf("Array 1:\n");
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      if (arr_1[col] != arr_2[col]) {
+        printf("*%3d", arr_1[col]);
+      } else {
+        printf("%4d", arr_1[col]);
+      }
+    }
+    printf("\n");
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+
+  arr_1 = arr_1_start;
+  arr_2 = arr_2_start;
+
+  printf("Array 2:\n");
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      if (arr_1[col] != arr_2[col]) {
+        printf("*%3d", arr_2[col]);
+      } else {
+        printf("%4d", arr_2[col]);
+      }
+    }
+    printf("\n");
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+
+  arr_1 = arr_1_start;
+  arr_2 = arr_2_start;
+  printf("Difference:\n");
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      printf("%4d", arr_1[col] - arr_2[col]);
+    }
+    printf("\n");
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+}
+
+template <typename PixelType>
+void ApplyReferenceFilter(const PixelType *y_src, const PixelType *y_pre,
+                          const PixelType *u_src, const PixelType *v_src,
+                          const PixelType *u_pre, const PixelType *v_pre,
+                          unsigned int block_width, unsigned int block_height,
+                          int ss_x, int ss_y, int strength,
+                          const int *const blk_fw, int use_32x32,
+                          uint32_t *y_accum, uint16_t *y_count,
+                          uint32_t *u_accum, uint16_t *u_count,
+                          uint32_t *v_accum, uint16_t *v_count) {
+  const int uv_block_width = block_width >> ss_x,
+            uv_block_height = block_height >> ss_y;
+  const int y_src_stride = block_width, y_pre_stride = block_width;
+  const int uv_src_stride = uv_block_width, uv_pre_stride = uv_block_width;
+  const int y_diff_stride = block_width, uv_diff_stride = uv_block_width;
+  const int y_count_stride = block_width, u_count_stride = uv_block_width,
+            v_count_stride = uv_block_width;
+  const int y_accum_stride = block_width, u_accum_stride = uv_block_width,
+            v_accum_stride = uv_block_width;
+
+  int y_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  int u_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  int v_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+
+  const int rounding = (1 << strength) >> 1;
+
+  // Get the square diffs
+  for (int row = 0; row < (int)block_height; row++) {
+    for (int col = 0; col < (int)block_width; col++) {
+      const int diff =
+          y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col];
+      y_dif[row * y_diff_stride + col] = diff * diff;
+    }
+  }
+
+  for (int row = 0; row < (int)uv_block_height; row++) {
+    for (int col = 0; col < (int)uv_block_width; col++) {
+      const int u_diff =
+          u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col];
+      const int v_diff =
+          v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col];
+      u_dif[row * uv_diff_stride + col] = u_diff * u_diff;
+      v_dif[row * uv_diff_stride + col] = v_diff * v_diff;
+    }
+  }
+
+  // Apply the filter to luma
+  for (int row = 0; row < (int)block_height; row++) {
+    for (int col = 0; col < (int)block_width; col++) {
+      const int uv_row = row >> ss_y;
+      const int uv_col = col >> ss_x;
+      const int filter_weight = GetFilterWeight(row, col, block_height,
+                                                block_width, blk_fw, use_32x32);
+
+      // First we get the modifier for the current y pixel
+      const int y_pixel = y_pre[row * y_pre_stride + col];
+      int y_num_used = 0;
+      int y_mod = 0;
+
+      // Sum the neighboring 3x3 y pixels
+      for (int row_step = -1; row_step <= 1; row_step++) {
+        for (int col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = row + row_step;
+          const int sub_col = col + col_step;
+
+          if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 &&
+              sub_col < (int)block_width) {
+            y_mod += y_dif[sub_row * y_diff_stride + sub_col];
+            y_num_used++;
+          }
+        }
+      }
+
+      // Sum the corresponding uv pixels to the current y modifier
+      // Note we are rounding down instead of rounding to the nearest pixel.
+      y_mod += u_dif[uv_row * uv_diff_stride + uv_col];
+      y_mod += v_dif[uv_row * uv_diff_stride + uv_col];
+
+      y_num_used += 2;
+
+      // Set the modifier
+      y_mod = GetModIndex<PixelType>(y_mod, y_num_used, rounding, strength,
+                                     filter_weight);
+
+      // Accumulate the result
+      y_count[row * y_count_stride + col] += y_mod;
+      y_accum[row * y_accum_stride + col] += y_mod * y_pixel;
+    }
+  }
+
+  // Apply the filter to chroma
+  for (int uv_row = 0; uv_row < (int)uv_block_height; uv_row++) {
+    for (int uv_col = 0; uv_col < (int)uv_block_width; uv_col++) {
+      const int y_row = uv_row << ss_y;
+      const int y_col = uv_col << ss_x;
+      const int filter_weight = GetFilterWeight(
+          uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
+
+      const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col];
+      const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col];
+
+      int uv_num_used = 0;
+      int u_mod = 0, v_mod = 0;
+
+      // Sum the neighboring 3x3 chromal pixels to the chroma modifier
+      for (int row_step = -1; row_step <= 1; row_step++) {
+        for (int col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = uv_row + row_step;
+          const int sub_col = uv_col + col_step;
+
+          if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
+              sub_col < uv_block_width) {
+            u_mod += u_dif[sub_row * uv_diff_stride + sub_col];
+            v_mod += v_dif[sub_row * uv_diff_stride + sub_col];
+            uv_num_used++;
+          }
+        }
+      }
+
+      // Sum all the luma pixels associated with the current luma pixel
+      for (int row_step = 0; row_step < 1 + ss_y; row_step++) {
+        for (int col_step = 0; col_step < 1 + ss_x; col_step++) {
+          const int sub_row = y_row + row_step;
+          const int sub_col = y_col + col_step;
+          const int y_diff = y_dif[sub_row * y_diff_stride + sub_col];
+
+          u_mod += y_diff;
+          v_mod += y_diff;
+          uv_num_used++;
+        }
+      }
+
+      // Set the modifier
+      u_mod = GetModIndex<PixelType>(u_mod, uv_num_used, rounding, strength,
+                                     filter_weight);
+      v_mod = GetModIndex<PixelType>(v_mod, uv_num_used, rounding, strength,
+                                     filter_weight);
+
+      // Accumulate the result
+      u_count[uv_row * u_count_stride + uv_col] += u_mod;
+      u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel;
+      v_count[uv_row * v_count_stride + uv_col] += v_mod;
+      v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel;
+    }
+  }
+}
+
+class YUVTemporalFilterTest
+    : public ::testing::TestWithParam<TemporalFilterWithBd> {
+ public:
+  virtual void SetUp() {
+    filter_func_ = GetParam().temporal_filter;
+    bd_ = GetParam().bd;
+    use_highbd_ = (bd_ != 8);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    saturate_test_ = 0;
+    num_repeats_ = 10;
+
+    ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12);
+  }
+
+ protected:
+  template <typename PixelType>
+  void CompareTestWithParam(int width, int height, int ss_x, int ss_y,
+                            int filter_strength, int use_32x32,
+                            const int *filter_weight);
+  template <typename PixelType>
+  void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y,
+                              int filter_strength, int use_32x32,
+                              const int *filter_weight);
+  template <typename PixelType>
+  void ApplyTestFilter(const PixelType *y_src, int y_src_stride,
+                       const PixelType *y_pre, int y_pre_stride,
+                       const PixelType *u_src, const PixelType *v_src,
+                       int uv_src_stride, const PixelType *u_pre,
+                       const PixelType *v_pre, int uv_pre_stride,
+                       unsigned int block_width, unsigned int block_height,
+                       int ss_x, int ss_y, int strength, const int *blk_fw,
+                       int use_32x32, uint32_t *y_accum, uint16_t *y_count,
+                       uint32_t *u_accumu, uint16_t *u_count, uint32_t *v_accum,
+                       uint16_t *v_count);
+
+  YUVTemporalFilterFunc filter_func_;
+  ACMRandom rnd_;
+  int saturate_test_;
+  int num_repeats_;
+  int use_highbd_;
+  int bd_;
+};
+
+template <>
+void YUVTemporalFilterTest::ApplyTestFilter<uint8_t>(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
+    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
+    uint32_t *v_accum, uint16_t *v_count) {
+  ASM_REGISTER_STATE_CHECK(
+      filter_func_(y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src,
+                   uv_src_stride, u_pre, v_pre, uv_pre_stride, block_width,
+                   block_height, ss_x, ss_y, strength, blk_fw, use_32x32,
+                   y_accum, y_count, u_accum, u_count, v_accum, v_count));
+}
+
+template <>
+void YUVTemporalFilterTest::ApplyTestFilter<uint16_t>(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
+    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
+    uint32_t *v_accum, uint16_t *v_count) {
+  ASM_REGISTER_STATE_CHECK(filter_func_(
+      CONVERT_TO_BYTEPTR(y_src), y_src_stride, CONVERT_TO_BYTEPTR(y_pre),
+      y_pre_stride, CONVERT_TO_BYTEPTR(u_src), CONVERT_TO_BYTEPTR(v_src),
+      uv_src_stride, CONVERT_TO_BYTEPTR(u_pre), CONVERT_TO_BYTEPTR(v_pre),
+      uv_pre_stride, block_width, block_height, ss_x, ss_y, strength, blk_fw,
+      use_32x32, y_accum, y_count, u_accum, u_count, v_accum, v_count));
+}
+
+template <typename PixelType>
+void YUVTemporalFilterTest::CompareTestWithParam(int width, int height,
+                                                 int ss_x, int ss_y,
+                                                 int filter_strength,
+                                                 int use_32x32,
+                                                 const int *filter_weight) {
+  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+  const int y_stride = width, uv_stride = uv_width;
+
+  DECLARE_ALIGNED(16, PixelType, y_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, PixelType, y_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, y_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, y_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, y_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, y_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+
+  DECLARE_ALIGNED(16, PixelType, u_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, PixelType, u_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, u_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, u_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, u_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, u_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+
+  DECLARE_ALIGNED(16, PixelType, v_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, PixelType, v_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, v_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, v_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, v_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, v_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+
+  for (int repeats = 0; repeats < num_repeats_; repeats++) {
+    if (saturate_test_) {
+      const int max_val = (1 << bd_) - 1;
+      SetArray(y_src, width, height, y_stride, max_val);
+      SetArray(y_pre, width, height, y_stride, 0);
+      SetArray(u_src, uv_width, uv_height, uv_stride, max_val);
+      SetArray(u_pre, uv_width, uv_height, uv_stride, 0);
+      SetArray(v_src, uv_width, uv_height, uv_stride, max_val);
+      SetArray(v_pre, uv_width, uv_height, uv_stride, 0);
+    } else {
+      const int max_val = 7 << (bd_ - 8);
+      SetArray(y_src, width, height, y_stride, &rnd_, 0, max_val);
+      SetArray(y_pre, width, height, y_stride, &rnd_, 0, max_val);
+      SetArray(u_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
+      SetArray(u_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
+      SetArray(v_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
+      SetArray(v_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
+    }
+
+    ApplyReferenceFilter<PixelType>(
+        y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y,
+        filter_strength, filter_weight, use_32x32, y_accum_ref, y_count_ref,
+        u_accum_ref, u_count_ref, v_accum_ref, v_count_ref);
+
+    ApplyTestFilter(y_src, y_stride, y_pre, y_stride, u_src, v_src, uv_stride,
+                    u_pre, v_pre, uv_stride, width, height, ss_x, ss_y,
+                    filter_strength, filter_weight, use_32x32, y_accum_tst,
+                    y_count_tst, u_accum_tst, u_count_tst, v_accum_tst,
+                    v_count_tst);
+
+    EXPECT_TRUE(CheckArrayEqual(y_accum_tst, y_accum_ref, width, height,
+                                y_stride, y_stride));
+    EXPECT_TRUE(CheckArrayEqual(y_count_tst, y_count_ref, width, height,
+                                y_stride, y_stride));
+    EXPECT_TRUE(CheckArrayEqual(u_accum_tst, u_accum_ref, uv_width, uv_height,
+                                uv_stride, uv_stride));
+    EXPECT_TRUE(CheckArrayEqual(u_count_tst, u_count_ref, uv_width, uv_height,
+                                uv_stride, uv_stride));
+    EXPECT_TRUE(CheckArrayEqual(v_accum_tst, v_accum_ref, uv_width, uv_height,
+                                uv_stride, uv_stride));
+    EXPECT_TRUE(CheckArrayEqual(v_count_tst, v_count_ref, uv_width, uv_height,
+                                uv_stride, uv_stride));
+
+    if (HasFailure()) {
+      if (use_32x32) {
+        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y,
+               filter_strength, *filter_weight);
+      } else {
+        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x,
+               ss_y, filter_strength, filter_weight[0], filter_weight[1],
+               filter_weight[2], filter_weight[3]);
+      }
+
+      PrintArrayDiff(y_accum_ref, y_accum_tst, width, height, y_stride,
+                     y_stride);
+      PrintArrayDiff(y_count_ref, y_count_tst, width, height, y_stride,
+                     y_stride);
+      PrintArrayDiff(u_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride,
+                     uv_stride);
+      PrintArrayDiff(u_count_ref, v_count_tst, uv_width, uv_height, uv_stride,
+                     uv_stride);
+      PrintArrayDiff(u_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride,
+                     uv_stride);
+      PrintArrayDiff(u_count_ref, v_count_tst, uv_width, uv_height, uv_stride,
+                     uv_stride);
+
+      return;
+    }
+  }
+}
+
+template <typename PixelType>
+void YUVTemporalFilterTest::RunTestFilterWithParam(int width, int height,
+                                                   int ss_x, int ss_y,
+                                                   int filter_strength,
+                                                   int use_32x32,
+                                                   const int *filter_weight) {
+  PixelType y_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  PixelType y_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  uint16_t y_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  uint32_t y_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+
+  PixelType u_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  PixelType u_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  uint16_t u_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  uint32_t u_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+
+  PixelType v_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  PixelType v_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  uint16_t v_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  uint32_t v_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+
+  SetArray(y_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
+  SetArray(y_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
+  SetArray(u_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
+  SetArray(u_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
+  SetArray(v_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
+  SetArray(v_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
+
+  for (int repeats = 0; repeats < num_repeats_; repeats++) {
+    ApplyTestFilter(y_src, MAX_WIDTH, y_pre, MAX_WIDTH, u_src, v_src, MAX_WIDTH,
+                    u_pre, v_pre, MAX_WIDTH, width, height, ss_x, ss_y,
+                    filter_strength, filter_weight, use_32x32, y_accum, y_count,
+                    u_accum, u_count, v_accum, v_count);
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, Use32x32) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 1;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_strength = 0; filter_strength <= 6;
+           filter_strength += 2) {
+        for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
+          if (use_highbd_) {
+            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                           adjusted_strength, use_32x32,
+                                           &filter_weight);
+          } else {
+            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                          filter_strength, use_32x32,
+                                          &filter_weight);
+          }
+          ASSERT_FALSE(HasFailure());
+        }
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, Use16x16) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 0;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) {
+        // Set up the filter
+        int filter_weight[4];
+        int filter_idx_cp = filter_idx;
+        for (int idx = 0; idx < 4; idx++) {
+          filter_weight[idx] = filter_idx_cp % 3;
+          filter_idx_cp /= 3;
+        }
+
+        // Test each parameter
+        for (int filter_strength = 0; filter_strength <= 6;
+             filter_strength += 2) {
+          if (use_highbd_) {
+            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                           adjusted_strength, use_32x32,
+                                           filter_weight);
+          } else {
+            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                          filter_strength, use_32x32,
+                                          filter_weight);
+          }
+
+          ASSERT_FALSE(HasFailure());
+        }
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, SaturationTest) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 1;
+  const int filter_weight = 1;
+  saturate_test_ = 1;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_strength = 0; filter_strength <= 6;
+           filter_strength += 2) {
+        if (use_highbd_) {
+          const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+          CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                         adjusted_strength, use_32x32,
+                                         &filter_weight);
+        } else {
+          CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                        filter_strength, use_32x32,
+                                        &filter_weight);
+        }
+
+        ASSERT_FALSE(HasFailure());
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, DISABLED_Speed) {
+  const int width = 32, height = 32;
+  num_repeats_ = 1000;
+
+  for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) {
+    const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3;
+    for (int ss_x = 0; ss_x <= 1; ss_x++) {
+      for (int ss_y = 0; ss_y <= 1; ss_y++) {
+        for (int filter_idx = 0; filter_idx < num_filter_weights;
+             filter_idx++) {
+          // Set up the filter
+          int filter_weight[4];
+          int filter_idx_cp = filter_idx;
+          for (int idx = 0; idx < 4; idx++) {
+            filter_weight[idx] = filter_idx_cp % 3;
+            filter_idx_cp /= 3;
+          }
+
+          // Test each parameter
+          for (int filter_strength = 0; filter_strength <= 6;
+               filter_strength += 2) {
+            aom_usec_timer timer;
+            aom_usec_timer_start(&timer);
+
+            if (use_highbd_) {
+              RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                               filter_strength, use_32x32,
+                                               filter_weight);
+            } else {
+              RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                              filter_strength, use_32x32,
+                                              filter_weight);
+            }
+
+            aom_usec_timer_mark(&timer);
+            const int elapsed_time =
+                static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+            printf(
+                "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: "
+                "%d, Strength: %d, Time: %5d\n",
+                bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength,
+                elapsed_time);
+          }
+        }
+      }
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    C, YUVTemporalFilterTest,
+    ::testing::Values(
+        TemporalFilterWithBd(&av1_apply_temporal_filter_c, 8),
+        TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_c, 10),
+        TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_c, 12)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, YUVTemporalFilterTest,
+    ::testing::Values(
+        TemporalFilterWithBd(&av1_apply_temporal_filter_sse4_1, 8),
+        TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_sse4_1, 10),
+        TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_sse4_1, 12)));
+#endif  // HAVE_SSE4_1
+
+}  // namespace
diff --git a/libaom/third_party/libwebm/AUTHORS.TXT b/libaom/third_party/libwebm/AUTHORS.TXT
index 8ab6f79..9686ac1 100644
--- a/libaom/third_party/libwebm/AUTHORS.TXT
+++ b/libaom/third_party/libwebm/AUTHORS.TXT
@@ -1,4 +1,4 @@
-# Names should be added to this file like so:
-# Name or Organization <email address>
-
-Google Inc.
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
diff --git a/libaom/third_party/libwebm/README.libaom b/libaom/third_party/libwebm/README.libaom
index bd288d2..17b2f47 100644
--- a/libaom/third_party/libwebm/README.libaom
+++ b/libaom/third_party/libwebm/README.libaom
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: af81f26025b7435fa9a14ad07c58b44cf9280430
+Version: 9f23fbc50e7a76c815b1d3f0309abe1066301331
 License: BSD
 License File: LICENSE.txt
 
@@ -7,8 +7,6 @@ Description:
 libwebm is used to handle WebM container I/O.
 
 Local Changes:
-Add av1 codec as an eligible codec for webm:
- https://aomedia-review.googlesource.com/c/aom/+/15103
 Only keep:
  - Android.mk
  - AUTHORS.TXT
diff --git a/libaom/third_party/libwebm/common/file_util.cc b/libaom/third_party/libwebm/common/file_util.cc
index 618ffc0..e6109d5 100644
--- a/libaom/third_party/libwebm/common/file_util.cc
+++ b/libaom/third_party/libwebm/common/file_util.cc
@@ -46,7 +46,7 @@ std::string GetTempFileName() {
   errno_t err = tmpnam_s(tmp_file_name);
 #else
   char* fname_pointer = tmpnam(tmp_file_name);
-  errno_t err = (fname_pointer == &tmp_file_name[0]) ? 0 : -1;
+  int err = (fname_pointer == &tmp_file_name[0]) ? 0 : -1;
 #endif
   if (err == 0) {
     return std::string(tmp_file_name);
diff --git a/libaom/third_party/libwebm/common/webmids.h b/libaom/third_party/libwebm/common/webmids.h
index 89d722a..fc0c208 100644
--- a/libaom/third_party/libwebm/common/webmids.h
+++ b/libaom/third_party/libwebm/common/webmids.h
@@ -93,6 +93,7 @@ enum MkvId {
   kMkvDisplayHeight = 0x54BA,
   kMkvDisplayUnit = 0x54B2,
   kMkvAspectRatioType = 0x54B3,
+  kMkvColourSpace = 0x2EB524,
   kMkvFrameRate = 0x2383E3,
   // end video
   // colour
diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
index bae2c99..5120312 100644
--- a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
+++ b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -773,6 +773,14 @@ bool Track::Write(IMkvWriter* writer) const {
   if (!type_ || !codec_id_)
     return false;
 
+  // AV1 tracks require a CodecPrivate. See
+  // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md
+  // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to
+  // point to a stable version once it is finalized, or our own WebM mappings
+  // page on webmproject.org should we decide to release them.
+  if (!strcmp(codec_id_, Tracks::kAv1CodecId) && !codec_private_)
+    return false;
+
   // |size| may be bigger than what is written out in this function because
   // derived classes may write out more data in the Track element.
   const uint64_t payload_size = PayloadSize();
@@ -1027,19 +1035,16 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const {
       !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) {
     return false;
   }
-  if (r_ &&
-      !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX,
-                 libwebm::kMkvPrimaryRChromaticityY)) {
+  if (r_ && !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX,
+                       libwebm::kMkvPrimaryRChromaticityY)) {
     return false;
   }
-  if (g_ &&
-      !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX,
-                 libwebm::kMkvPrimaryGChromaticityY)) {
+  if (g_ && !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX,
+                       libwebm::kMkvPrimaryGChromaticityY)) {
     return false;
   }
-  if (b_ &&
-      !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX,
-                 libwebm::kMkvPrimaryBChromaticityY)) {
+  if (b_ && !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX,
+                       libwebm::kMkvPrimaryBChromaticityY)) {
     return false;
   }
   if (white_point_ &&
@@ -1421,6 +1426,7 @@ VideoTrack::VideoTrack(unsigned int* seed)
       stereo_mode_(0),
       alpha_mode_(0),
       width_(0),
+      colour_space_(NULL),
       colour_(NULL),
       projection_(NULL) {}
 
@@ -1518,6 +1524,10 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
                           static_cast<uint64>(alpha_mode_)))
       return false;
   }
+  if (colour_space_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvColourSpace, colour_space_))
+      return false;
+  }
   if (frame_rate_ > 0.0) {
     if (!WriteEbmlElement(writer, libwebm::kMkvFrameRate,
                           static_cast<float>(frame_rate_))) {
@@ -1542,6 +1552,22 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
   return true;
 }
 
+void VideoTrack::set_colour_space(const char* colour_space) {
+  if (colour_space) {
+    delete[] colour_space_;
+
+    const size_t length = strlen(colour_space) + 1;
+    colour_space_ = new (std::nothrow) char[length];  // NOLINT
+    if (colour_space_) {
+#ifdef _MSC_VER
+      strcpy_s(colour_space_, length, colour_space);
+#else
+      strcpy(colour_space_, colour_space);
+#endif
+    }
+  }
+}
+
 bool VideoTrack::SetColour(const Colour& colour) {
   std::unique_ptr<Colour> colour_ptr(new Colour());
   if (!colour_ptr.get())
@@ -1625,6 +1651,8 @@ uint64_t VideoTrack::VideoPayloadSize() const {
   if (frame_rate_ > 0.0)
     size += EbmlElementSize(libwebm::kMkvFrameRate,
                             static_cast<float>(frame_rate_));
+  if (colour_space_)
+    size += EbmlElementSize(libwebm::kMkvColourSpace, colour_space_);
   if (colour_)
     size += colour_->ColourSize();
   if (projection_)
@@ -1702,10 +1730,9 @@ bool AudioTrack::Write(IMkvWriter* writer) const {
 
 const char Tracks::kOpusCodecId[] = "A_OPUS";
 const char Tracks::kVorbisCodecId[] = "A_VORBIS";
+const char Tracks::kAv1CodecId[] = "V_AV1";
 const char Tracks::kVp8CodecId[] = "V_VP8";
 const char Tracks::kVp9CodecId[] = "V_VP9";
-const char Tracks::kVp10CodecId[] = "V_VP10";
-const char Tracks::kAV1CodecId[] = "V_AV1";
 const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS";
 const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS";
 const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA";
@@ -4161,15 +4188,15 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) {
 }
 
 bool Segment::DocTypeIsWebm() const {
-  const int kNumCodecIds = 10;
+  const int kNumCodecIds = 9;
 
   // TODO(vigneshv): Tweak .clang-format.
   const char* kWebmCodecIds[kNumCodecIds] = {
       Tracks::kOpusCodecId,          Tracks::kVorbisCodecId,
-      Tracks::kVp8CodecId,           Tracks::kVp9CodecId,
-      Tracks::kVp10CodecId,          Tracks::kAV1CodecId,
-      Tracks::kWebVttCaptionsId,     Tracks::kWebVttDescriptionsId,
-      Tracks::kWebVttMetadataId,     Tracks::kWebVttSubtitlesId};
+      Tracks::kAv1CodecId,           Tracks::kVp8CodecId,
+      Tracks::kVp9CodecId,           Tracks::kWebVttCaptionsId,
+      Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId,
+      Tracks::kWebVttSubtitlesId};
 
   const int num_tracks = static_cast<int>(tracks_.track_entries_size());
   for (int track_index = 0; track_index < num_tracks; ++track_index) {
diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h
index 9e817bc..f2db377 100644
--- a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h
+++ b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h
@@ -795,6 +795,8 @@ class VideoTrack : public Track {
   uint64_t alpha_mode() { return alpha_mode_; }
   void set_width(uint64_t width) { width_ = width; }
   uint64_t width() const { return width_; }
+  void set_colour_space(const char* colour_space);
+  const char* colour_space() const { return colour_space_; }
 
   Colour* colour() { return colour_; }
 
@@ -824,6 +826,7 @@ class VideoTrack : public Track {
   uint64_t stereo_mode_;
   uint64_t alpha_mode_;
   uint64_t width_;
+  char* colour_space_;
 
   Colour* colour_;
   Projection* projection_;
@@ -871,10 +874,9 @@ class Tracks {
 
   static const char kOpusCodecId[];
   static const char kVorbisCodecId[];
+  static const char kAv1CodecId[];
   static const char kVp8CodecId[];
   static const char kVp9CodecId[];
-  static const char kVp10CodecId[];
-  static const char kAV1CodecId[];
   static const char kWebVttCaptionsId[];
   static const char kWebVttDescriptionsId[];
   static const char kWebVttMetadataId[];
diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
index 355d4e2..3bff7cd 100644
--- a/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
+++ b/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -136,9 +136,8 @@ uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
     return false;
   }
 
-  if (!frame->is_key() &&
-      !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
-                        reference_block_timestamp)) {
+  if (!frame->is_key() && !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
+                                            reference_block_timestamp)) {
     return false;
   }
 
diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc
index 84655d8..d668384 100644
--- a/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc
+++ b/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc
@@ -78,6 +78,8 @@ int32 MkvWriter::Position(int64 position) {
 
 #ifdef _MSC_VER
   return _fseeki64(file_, position, SEEK_SET);
+#elif defined(_WIN32)
+  return fseeko64(file_, static_cast<off_t>(position), SEEK_SET);
 #else
   return fseeko(file_, static_cast<off_t>(position), SEEK_SET);
 #endif
diff --git a/libaom/third_party/libwebm/mkvparser/mkvparser.cc b/libaom/third_party/libwebm/mkvparser/mkvparser.cc
index e7b76f7..9c78ead 100644
--- a/libaom/third_party/libwebm/mkvparser/mkvparser.cc
+++ b/libaom/third_party/libwebm/mkvparser/mkvparser.cc
@@ -36,8 +36,6 @@ inline bool isnan(double val) { return std::isnan(val); }
 inline bool isinf(double val) { return std::isinf(val); }
 #endif  // MSC_COMPAT
 
-IMkvReader::~IMkvReader() {}
-
 template <typename Type>
 Type* SafeArrayAlloc(unsigned long long num_elements,
                      unsigned long long element_size) {
@@ -5274,6 +5272,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
 VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
                        long long element_size)
     : Track(pSegment, element_start, element_size),
+      m_colour_space(NULL),
       m_colour(NULL),
       m_projection(NULL) {}
 
@@ -5299,6 +5298,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
   long long stereo_mode = 0;
 
   double rate = 0.0;
+  char* colour_space = NULL;
 
   IMkvReader* const pReader = pSegment->m_pReader;
 
@@ -5312,7 +5312,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
   const long long stop = pos + s.size;
 
   Colour* colour = NULL;
-  Projection* projection = NULL;
+  std::unique_ptr<Projection> projection_ptr;
 
   while (pos < stop) {
     long long id, size;
@@ -5364,8 +5364,16 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
       if (!Colour::Parse(pReader, pos, size, &colour))
         return E_FILE_FORMAT_INVALID;
     } else if (id == libwebm::kMkvProjection) {
-      if (!Projection::Parse(pReader, pos, size, &projection))
+      Projection* projection = NULL;
+      if (!Projection::Parse(pReader, pos, size, &projection)) {
         return E_FILE_FORMAT_INVALID;
+      } else {
+        projection_ptr.reset(projection);
+      }
+    } else if (id == libwebm::kMkvColourSpace) {
+      const long status = UnserializeString(pReader, pos, size, colour_space);
+      if (status < 0)
+        return status;
     }
 
     pos += size;  // consume payload
@@ -5397,7 +5405,8 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
   pTrack->m_stereo_mode = stereo_mode;
   pTrack->m_rate = rate;
   pTrack->m_colour = colour;
-  pTrack->m_projection = projection;
+  pTrack->m_colour_space = colour_space;
+  pTrack->m_projection = projection_ptr.release();
 
   pResult = pTrack;
   return 0;  // success
diff --git a/libaom/third_party/libwebm/mkvparser/mkvparser.h b/libaom/third_party/libwebm/mkvparser/mkvparser.h
index 26c2b7e..848d01f 100644
--- a/libaom/third_party/libwebm/mkvparser/mkvparser.h
+++ b/libaom/third_party/libwebm/mkvparser/mkvparser.h
@@ -22,7 +22,7 @@ class IMkvReader {
   virtual int Length(long long* total, long long* available) = 0;
 
  protected:
-  virtual ~IMkvReader();
+  virtual ~IMkvReader() {}
 };
 
 template <typename Type>
@@ -527,6 +527,8 @@ class VideoTrack : public Track {
 
   Projection* GetProjection() const;
 
+  const char* GetColourSpace() const { return m_colour_space; }
+
  private:
   long long m_width;
   long long m_height;
@@ -534,7 +536,7 @@ class VideoTrack : public Track {
   long long m_display_height;
   long long m_display_unit;
   long long m_stereo_mode;
-
+  char* m_colour_space;
   double m_rate;
 
   Colour* m_colour;
diff --git a/libaom/third_party/libwebm/mkvparser/mkvreader.cc b/libaom/third_party/libwebm/mkvparser/mkvreader.cc
index 23d68f5..9d19c1b 100644
--- a/libaom/third_party/libwebm/mkvparser/mkvreader.cc
+++ b/libaom/third_party/libwebm/mkvparser/mkvreader.cc
@@ -118,6 +118,8 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) {
 
   if (status)
     return -1;  // error
+#elif defined(_WIN32)
+  fseeko64(m_file, static_cast<off_t>(offset), SEEK_SET);
 #else
   fseeko(m_file, static_cast<off_t>(offset), SEEK_SET);
 #endif
diff --git a/libaom/tools/txfm_analyzer/txfm_graph.h b/libaom/tools/txfm_analyzer/txfm_graph.h
index 2e3c955..8dc3614 100644
--- a/libaom/tools/txfm_analyzer/txfm_graph.h
+++ b/libaom/tools/txfm_analyzer/txfm_graph.h
@@ -23,7 +23,6 @@ struct Node {
   int visited;
 };
 
-#define PI (3.141592653589793238462643383279502884)
 #define STAGENUM (10)
 #define NODENUM (32)
 #define COS_MOD (128)
author	Ray Essick <essick@google.com>	2019-03-29 15:30:55 -0700
committer	Ray Essick <essick@google.com>	2019-05-03 21:28:43 +0000
commit	ec6586dd308c18c15b581e3579894b4204c834bc (patch)
tree	97c131f6ce3576d63a07f047e11bc004ed19f117
parent	b2a64d5cd5a1ee0c01456cbeb86c45a72eca9618 (diff)
download	platform_external_libaom-ec6586dd308c18c15b581e3579894b4204c834bc.tar.gz platform_external_libaom-ec6586dd308c18c15b581e3579894b4204c834bc.tar.bz2 platform_external_libaom-ec6586dd308c18c15b581e3579894b4204c834bc.zip